Skip to content

Commit

Permalink
Add simple content parsing
Browse files Browse the repository at this point in the history
Closes #17
  • Loading branch information
stscoundrel committed Nov 26, 2023
1 parent 38eef14 commit 645e8f2
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
17 changes: 11 additions & 6 deletions parser/src/parser/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,21 @@ class Entry(NamedTuple):
definitions: str

@staticmethod
def from_raw_entry(raw_entry: str) -> "Entry":
def _clean_definitions(raw_definitions: str) -> str:
    """Return *raw_definitions* with its whitespace normalised.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into
    a single space, and leading/trailing whitespace is removed.

    Line breaks are treated as word separators rather than deleted
    outright: deleting them fuses the last word of one line with the
    first word of the next whenever no space surrounds the break, and
    leaves a stray carriage return behind for Windows-style line endings.
    """
    # str.split() without arguments splits on any whitespace run and never
    # yields empty strings, so no explicit filtering of blanks is required.
    return " ".join(raw_definitions.split())

@classmethod
def from_raw_entry(cls, raw_entry: str) -> "Entry":
# Build an Entry from one raw entry string: the text before the first
# space becomes the headword, everything after it becomes the definitions.
# Naive expectation: first word is headword.
# TODO: GH-16 better detection.
# NOTE(review): when raw_entry contains no space, split() returns a
# single-element list and parts[1] below raises IndexError -- confirm
# callers never pass such input.
parts = raw_entry.split(" ", maxsplit=1)

# NOTE(review): two return statements follow. This looks like a rendered
# diff in which the multi-line return is the replaced version and the
# final one-line return (which cleans the definitions) is the current
# one -- confirm against the actual file.
return Entry(
headword=parts[0],
# TODO: GH-17 clean up content.
definitions=parts[1],
)
return Entry(headword=parts[0], definitions=cls._clean_definitions(parts[1]))


class Page:
Expand Down
13 changes: 13 additions & 0 deletions parser/tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,17 @@ def test_parses_simple_entries() -> None:
"Afkynde,",
]

expected_content = (
"go. 1) komme (bort) fra; kommer schiff paa grund oc kand ey med mindre affkomme. "
"N. D. Mag. VI. 104; som ey bygge oc boo paa wort oc kronens goilz oc ey ere aff- "
"komne met wore mynde (1506). Ro- senv., Gl. L. V. 202; po thet Hans Ure motte thets "
"bedre affkomme oc guit bliffae trette oc uenighed (1549). Rosenv., Gl. D. I. 71; N. D. Mag. 1. 814. "
"— 2) komme af; at ther will afikomme eth stort oprør och forderflue (1526). N. D. Mag. V. 215; der met "
"er det affkommen, mand neppe kiender slecten. Hvitf. VIII 365. — 3) aflægges (t. abkommen.); at the ismaa "
"markede ere aflagde, oc att ingen haffaer fordelle ther af, at the ere afkommen (1542). D. Mag. IV. 288. — "
"4) overkomme; at voris depu- terede samme miinstringer, naar de afkomme kand, self skall bjwaane (1890). Geh. Ark. Årsb. IL 294."
)

assert [entry.headword for entry in entries] == expected_headwords

assert entries[1].definitions == expected_content

0 comments on commit 645e8f2

Please sign in to comment.