Skip to content

Commit

Permalink
Add "Babel" to list of false positives
Browse files Browse the repository at this point in the history
Closes #29
  • Loading branch information
stscoundrel committed Feb 18, 2024
1 parent 6dd880c commit a293f4c
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
1 change: 1 addition & 0 deletions parser/src/parser/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# Headwords that look correct, but were false positives.
# Should be mapped with "part of previous entry" status.
FALSE_POSITIVE_HEADWORDS: Final[Sequence[str]] = (
"Babel,",
"Hesiodus,",
"Hibertz,",
"Højsgaard,",
Expand Down
14 changes: 10 additions & 4 deletions parser/tests/test_page_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,22 @@ def test_splits_page_correctly() -> None:

expected_b_headwords = [
"Bable",
"Babel", # TODO: GH-29, should be part of previous entry.
"Babel,", # Part of previous, therefore the comma.
"Babler",
]

assert [entry.headword for entry in b_entries] == expected_b_headwords

expected_b_statuses = [
EntryStatus.VALID,
EntryStatus.PART_OF_PREVIOUS_ENTRY,
EntryStatus.VALID,
]

assert [b_entry.status for b_entry in b_entries] == expected_b_statuses

# Assert the content of the first entry to ensure nothing was cut off.
assert (
b_entries[0].definitions == "go. 1) at tale uforståe- ligt. Moth;"
) # TODO: GH-29, part of content in next entry.
assert b_entries[0].definitions == "go. 1) at tale uforståe- ligt. Moth;"

# Axeltorg was originally a line-split headword. Ensure no content was lost in parsing.
expected_content = (
Expand Down

0 comments on commit a293f4c

Please sign in to comment.