Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/simple entry parsing #18

Merged
merged 1 commit into from
Nov 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parser/src/parser/columns.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Final

# Module-level divider strings; presumably the candidates that
# `_get_column_divider` chooses between for a line — TODO confirm, its body is
# not visible here.
_divider: Final[str] = "|"
# NOTE(review): `_spaces_divider` is bound twice below. A `Final` name should
# be assigned exactly once; this looks like the old and new line of a diff
# shown together — confirm which value is intended and drop the other.
_spaces_divider: Final[str] = " "
_spaces_divider: Final[str] = " "


def _get_column_divider(line: str) -> str:
Expand Down
56 changes: 55 additions & 1 deletion parser/src/parser/page.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
import re
from typing import NamedTuple


class Entry(NamedTuple):
    """One dictionary entry, split into its headword and the remaining text."""

    headword: str
    definitions: str

    @staticmethod
    def from_raw_entry(raw_entry: str) -> "Entry":
        """Build an ``Entry`` from the raw text of a single entry.

        The text is cut at the first space character: everything before it
        becomes ``headword`` and everything after it becomes ``definitions``.
        Only a literal space counts; newlines and other whitespace stay part
        of the headword.

        Args:
            raw_entry: Raw text of one entry.

        Returns:
            The parsed entry. ``definitions`` is an empty string when
            ``raw_entry`` contains no space (single word or empty input).
        """
        # Naive expectation: first word is headword.
        # TODO: GH-16 better detection.
        # ``partition`` never raises when the separator is missing, unlike
        # indexing the second element of a ``split`` result.
        headword, _, definitions = raw_entry.partition(" ")

        return Entry(
            headword=headword,
            # TODO: GH-17 clean up content.
            definitions=definitions,
        )


class Page:
_meta_parts: list[str] | None = None

Expand All @@ -15,8 +36,15 @@ def _get_meta_parts(self) -> list[str]:

return self._meta_parts

def get_separators_for(self, letter: str) -> list[str]:
    """Return the separator strings used to split entries for ``letter``.

    Two forms are produced, in this order: an em dash followed by a space and
    the letter, then a single space followed by the letter.
    """
    dash_form = f"— {letter}"
    space_form = f" {letter}"
    return [dash_form, space_form]

def get_entry_separators(self) -> set[str]:
return {f"— {letter}" for letter in self.get_letters_in_page()}
return {
separator
for letter in self.get_letters_in_page()
for separator in self.get_separators_for(letter)
}

def is_left_side_page(self) -> bool:
return self._get_meta_parts()[0].isnumeric()
Expand All @@ -40,3 +68,29 @@ def get_letters_in_page(self) -> set[str]:
letters.add(self._get_meta_parts()[1][0].upper())

return letters

def get_entries(self) -> list[Entry]:
    """Split the page content into dictionary entries.

    The lines in ``self.content`` are joined with newlines into one string,
    which is then split on the separators of each letter found on the page.
    After each split the letter consumed by the separator is put back at the
    start of every piece, and each piece is parsed with
    ``Entry.from_raw_entry``.

    Known limitation (GH-13): the letter is also prepended to the first piece
    of every split, even though no separator preceded it.

    Returns:
        The parsed entries in page order.
    """
    raw_entries = ["\n".join(self.content)]

    # Letters come from a set; sort them so the outcome does not depend on
    # set iteration order and earlier letters are split off first, leaving
    # the later letters' text in the last entry for the next pass.
    for letter in sorted(self.get_letters_in_page()):
        # TODO: GH-14 may need additional separators, perhaps based on
        # the end of previous definition.
        # Escape the separators so they are always matched literally.
        separators_regex = "|".join(
            re.escape(separator) for separator in self.get_separators_for(letter)
        )

        # Unsplit content should always be in the last entry.
        entries_for_letter = re.split(separators_regex, raw_entries[-1])

        # TODO: GH-13. Only append if not the first entry.
        # If first entry, detect if start of entry or not.
        # Probably needs to support incomplete entries to be patched later.

        # Append base letter back
        entries_for_letter = [f"{letter}{entry}" for entry in entries_for_letter]

        # Replace only the last entry (the one that was just re-split) and
        # keep every entry before it.
        raw_entries = raw_entries[:-1] + entries_for_letter

    # Format string entries to structures.
    # TODO: GH-16 recognize incorrect headwords, append to previous entries.
    return [Entry.from_raw_entry(raw_entry) for raw_entry in raw_entries]
25 changes: 25 additions & 0 deletions parser/tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,28 @@ def test_page_letters_meta() -> None:
assert page1.get_letters_in_page() == {"A"}
assert page2.get_letters_in_page() == {"A"}
assert page3.get_letters_in_page() == {"J", "K"}


def test_parses_simple_entries() -> None:
    """
    Headwords are extracted from a single-letter, fairly clean OCR'd page.
    """
    page = Page(_single_column_test_file("simple-page.txt"))

    actual_headwords = [entry.headword for entry in page.get_entries()]

    assert actual_headwords == [
        "Aaf",  # Incorrect! TODO: GH-13
        "Afkomme,",
        "Afkomst,",
        "Afkontrafej,",
        "Afkon-\n\n",  # Incorrect! TODO: GH-15
        "Afkort,",
        "Afkorte,",
        "Afkortelse,",
        "Afkvædet,",
        "Afkynde,",
    ]