From 603e47a630281b3464c45b5eedd88d2f8741b8dd Mon Sep 17 00:00:00 2001 From: Sampo Silvennoinen Date: Sun, 26 Nov 2023 17:56:58 +0200 Subject: [PATCH] Add simple content parsing Closes #17 --- parser/src/parser/page.py | 17 +++++++++++------ parser/tests/test_page.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/parser/src/parser/page.py b/parser/src/parser/page.py index 7d51576..ddcef89 100644 --- a/parser/src/parser/page.py +++ b/parser/src/parser/page.py @@ -7,16 +7,21 @@ class Entry(NamedTuple): definitions: str @staticmethod - def from_raw_entry(raw_entry: str) -> "Entry": + def _clean_definitions(raw_definitions: str) -> str: + # Drop all linebreaks. + cleaned_definitions = raw_definitions.replace("\n", "") + + return " ".join( + [splitted for splitted in cleaned_definitions.split(" ") if splitted != ""] + ) + + @classmethod + def from_raw_entry(cls, raw_entry: str) -> "Entry": # Naive expectation: first word is headword. # TODO: GH-16 better detection. parts = raw_entry.split(" ", maxsplit=1) - return Entry( - headword=parts[0], - # TODO: GH-17 clean up content. - definitions=parts[1], - ) + return Entry(headword=parts[0], definitions=cls._clean_definitions(parts[1])) class Page: diff --git a/parser/tests/test_page.py b/parser/tests/test_page.py index 8af2a9f..fb01f0b 100644 --- a/parser/tests/test_page.py +++ b/parser/tests/test_page.py @@ -60,4 +60,18 @@ def test_parses_simple_entries() -> None: "Afkynde,", ] + expected_content = ( + "go. 1) komme (bort) fra; kommer schiff paa grund oc kand ey med mindre affkomme. " + "N. D. Mag. VI. 104; som ey bygge oc boo paa wort oc kronens goilz oc ey ere aff- " + "komne met wore mynde (1506). Ro- senv., Gl. L. V. 202; po thet Hans Ure motte thets " + "bedre affkomme oc guit bliffae trette oc uenighed (1549). Rosenv., Gl. D. I. 71; N. D. Mag. 1. 814. " + "— 2) komme af; at ther will afikomme eth stort oprør och forderflue (1526). N. D. Mag. V. 215; der met " + "er det affkommen, mand neppe kiender slecten. Hvitf. VIII 365. — 3) aflægges (t. abkommen.); at the ismaa " + "markede ere aflagde, oc att ingen haffaer fordelle ther af, at the ere afkommen (1542). D. Mag. IV. 288. — " + "4) overkomme; at voris depu- terede samme miinstringer, naar de afkomme kand, self skall bjwaane (1890). " + "Geh. Ark. Årsb. IL 294." + ) + assert [entry.headword for entry in entries] == expected_headwords + + assert entries[1].definitions == expected_content