Skip to content

Commit

Permalink
Entries: parse multiple definitions as list
Browse files Browse the repository at this point in the history
  • Loading branch information
stscoundrel committed Feb 18, 2024
1 parent a293f4c commit 779183a
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 0 deletions.
36 changes: 36 additions & 0 deletions parser/src/parser/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,42 @@ class Entry(NamedTuple):
definitions: str
status: EntryStatus

@property
def definitions_list(self) -> list[str]:
    """Split ``definitions`` into one string per numbered sense.

    Senses are recognised by the standalone words ``1)`` .. ``4)``. Text
    that precedes the first marker is returned as its own leading item.

    Returns:
        The split definitions, each stripped of surrounding whitespace.
        A single-item list holding the untouched ``definitions`` string
        is returned instead when the text contains no ``"1) "`` marker,
        or when the leading digits found do not run 1, 2, 3, ... in
        order without jumps.
    """
    entry_starts = ("1)", "2)", "3)", "4)")

    # We only want to process "intact" looking definitions.
    if "1) " not in self.definitions:
        return [self.definitions]

    # Walk through words, opening a new segment at every numbered marker.
    segments: list[list[str]] = [[]]
    for word in self.definitions.split(" "):
        if word in entry_starts:
            segments.append([])
        segments[-1].append(word)

    # Text that begins directly with "1)" leaves an empty leading segment.
    # Drop it: it is not a definition, and indexing its first character in
    # the sanity check below would raise IndexError.
    lines = [" ".join(words).strip() for words in segments]
    lines = [line for line in lines if line]

    # Sanity: we want each line to be properly numbered with no jumps.
    # Any line starting with a digit takes part, including the preamble.
    expected_number = 1
    for line in lines:
        first = line[0]
        # isdecimal() rather than isnumeric(): characters such as "½" or
        # "²" are numeric but cannot be parsed by int().
        if first.isdecimal():
            if int(first) != expected_number:
                return [self.definitions]
            expected_number += 1

    return lines

def to_json(self) -> dict[str, str]:
return {
"headword": self.headword,
Expand Down
62 changes: 62 additions & 0 deletions parser/tests/test_entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from src.parser.entry import Entry, EntryStatus


def test_parses_single_definition_as_list() -> None:
    """Text without any numbered marker comes back as a one-item list."""
    subject = Entry(headword="Foo", definitions="Foo bar baz", status=EntryStatus.VALID)

    parsed = subject.definitions_list

    assert parsed == ["Foo bar baz"]


def test_parses_single_numbered_definition_as_list() -> None:
    """A preamble followed by a lone ``1)`` sense yields two items."""
    subject = Entry(
        headword="Bable",
        definitions="go. 1) at tale uforståe- ligt. Moth;",
        status=EntryStatus.VALID,
    )

    parsed = subject.definitions_list

    assert parsed == ["go.", "1) at tale uforståe- ligt. Moth;"]


def test_parses_multiple_definitions_as_list() -> None:
    """A preamble plus senses 1) to 4) is split into five separate items."""
    raw_definitions = (
        "go. 1) komme (bort) fra; kommer schiff paa grund oc kand ey med mindre affkomme. "
        "N. D. Mag. VI. 104; som ey bygge oc boo paa wort oc kronens goilz oc ey ere aff- "
        "komne met wore mynde (1506). Ro- senv., Gl. L. V. 202; po thet Hans Ure motte thets "
        "bedre affkomme oc guit bliffae trette oc uenighed (1549). Rosenv., Gl. D. I. 71; N. D. Mag. 1. 814. "
        "— 2) komme af; at ther will afikomme eth stort oprør och forderflue (1526). N. D. Mag. V. 215; der met "
        "er det affkommen, mand neppe kiender slecten. Hvitf. VIII 365. — 3) aflægges (t. abkommen.); at the ismaa "
        "markede ere aflagde, oc att ingen haffaer fordelle ther af, at the ere afkommen (1542). D. Mag. IV. 288. — "
        "4) overkomme; at voris depu- terede samme miinstringer, naar de afkomme kand, self skall bjwaane (1890). "
        "Geh. Ark. Årsb. IL 294."
    )
    subject = Entry(
        headword="Afkom",
        definitions=raw_definitions,
        status=EntryStatus.VALID,
    )

    # The separating dash stays attached to the end of the preceding sense.
    wanted = [
        "go.",
        "1) komme (bort) fra; kommer schiff paa grund oc kand ey med mindre affkomme. N. D. Mag. VI. 104; som ey bygge oc boo paa wort oc kronens goilz oc ey ere aff- komne met wore mynde (1506). Ro- senv., Gl. L. V. 202; po thet Hans Ure motte thets bedre affkomme oc guit bliffae trette oc uenighed (1549). Rosenv., Gl. D. I. 71; N. D. Mag. 1. 814. —",  # noqa: E501
        "2) komme af; at ther will afikomme eth stort oprør och forderflue (1526). N. D. Mag. V. 215; der met er det affkommen, mand neppe kiender slecten. Hvitf. VIII 365. —",  # noqa: E501
        "3) aflægges (t. abkommen.); at the ismaa markede ere aflagde, oc att ingen haffaer fordelle ther af, at the ere afkommen (1542). D. Mag. IV. 288. —",  # noqa: E501
        "4) overkomme; at voris depu- terede samme miinstringer, naar de afkomme kand, self skall bjwaane (1890). Geh. Ark. Årsb. IL 294.",  # noqa: E501
    ]

    assert subject.definitions_list == wanted


def test_parses_incorrectly_numbered_definitions_list() -> None:
    """Out-of-order numbering leaves the definitions text unsplit."""
    subject = Entry(
        headword="Foo",
        definitions="Foo bar baz 1) bar bar 3) baz baz 2) foo foo ",
        status=EntryStatus.VALID,
    )

    parsed = subject.definitions_list

    # Numbering 1, 3, 2 does not make sense, so the raw text is returned as-is.
    assert parsed == ["Foo bar baz 1) bar bar 3) baz baz 2) foo foo "]

0 comments on commit 779183a

Please sign in to comment.