From e23b6c43b7567cbd198f64b2c6804be1092c57a0 Mon Sep 17 00:00:00 2001 From: Sampo Silvennoinen Date: Sat, 18 Nov 2023 16:08:54 +0200 Subject: [PATCH] Add simple entry parsing Naive approach, which leaves most edge cases unsolved --- parser/src/parser/columns.py | 2 +- parser/src/parser/page.py | 56 +++++++++++++++++++++++++++++++++++- parser/tests/test_page.py | 25 ++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/parser/src/parser/columns.py b/parser/src/parser/columns.py index 796ef4b..eeace98 100644 --- a/parser/src/parser/columns.py +++ b/parser/src/parser/columns.py @@ -1,7 +1,7 @@ from typing import Final _divider: Final[str] = "|" -_spaces_divider: Final[str] = " " +_spaces_divider: Final[str] = " " def _get_column_divider(line: str) -> str: diff --git a/parser/src/parser/page.py b/parser/src/parser/page.py index 9c0c4c8..7d51576 100644 --- a/parser/src/parser/page.py +++ b/parser/src/parser/page.py @@ -1,3 +1,24 @@ +import re +from typing import NamedTuple + + +class Entry(NamedTuple): + headword: str + definitions: str + + @staticmethod + def from_raw_entry(raw_entry: str) -> "Entry": + # Naive expectation: first word is headword. + # TODO: GH-16 better detection. + parts = raw_entry.split(" ", maxsplit=1) + + return Entry( + headword=parts[0], + # TODO: GH-17 clean up content. + definitions=parts[1], + ) + + class Page: _meta_parts: list[str] | None = None @@ -15,8 +36,15 @@ def _get_meta_parts(self) -> list[str]: return self._meta_parts + def get_separators_for(self, letter: str) -> list[str]: + return [f"— {letter}", f" {letter}"] + def get_entry_separators(self) -> set[str]: - return {f"— {letter}" for letter in self.get_letters_in_page()} + return { + separator + for letter in self.get_letters_in_page() + for separator in self.get_separators_for(letter) + } def is_left_side_page(self) -> bool: return self._get_meta_parts()[0].isnumeric() @@ -40,3 +68,29 @@ def get_letters_in_page(self) -> set[str]: letters.add(self._get_meta_parts()[1][0].upper()) return letters + + def get_entries(self) -> list[Entry]: + raw_entries = ["\n".join(self.content)] + + for letter in self.get_letters_in_page(): + # TODO: GH-14 may need additional separators, perhaps based on + # the end of previous definition. + separators_regex = "|".join(self.get_separators_for(letter)) + + # Unsplit content should always be in the last entry. + entries_for_letter = re.split(separators_regex, raw_entries[-1]) + + # TODO: GH-13. Only append if not the first entry. + # If first entry, detect if start of entry or not. + # Probably needs to support incomplete entries to be patched later. + + # Append base letter back + entries_for_letter = [f"{letter}{entry}" for entry in entries_for_letter] + + raw_entries = raw_entries[0:-2] + entries_for_letter + + # Format string entries to structures. + # TODO: GH-16 recognize incorrect headwords, append to previous entries. + entries = [Entry.from_raw_entry(raw_entry) for raw_entry in raw_entries] + + return entries diff --git a/parser/tests/test_page.py b/parser/tests/test_page.py index a10607b..8af2a9f 100644 --- a/parser/tests/test_page.py +++ b/parser/tests/test_page.py @@ -36,3 +36,28 @@ def test_page_letters_meta() -> None: assert page1.get_letters_in_page() == {"A"} assert page2.get_letters_in_page() == {"A"} assert page3.get_letters_in_page() == {"J", "K"} + + +def test_parses_simple_entries() -> None: + """ + Simple entries: one letter in more-or-less straightforward OCR'd page. + """ + one_letter_left_page_input = _single_column_test_file("simple-page.txt") + + page = Page(one_letter_left_page_input) + entries = page.get_entries() + + expected_headwords = [ + "Aaf", # Incorrect! TODO: GH-13 + "Afkomme,", + "Afkomst,", + "Afkontrafej,", + "Afkon-\n\n", # Incorrect! TODO: GH-15 + "Afkort,", + "Afkorte,", + "Afkortelse,", + "Afkvædet,", + "Afkynde,", + ] + + assert [entry.headword for entry in entries] == expected_headwords