Skip to content

Commit

Permalink
Entries: add proofreader for known headword typos
Browse files Browse the repository at this point in the history
The typos are caused by OCR. Patch them manually using a dictionary that maps each known typo to its correct form.

For example:
- Azelkøbstad -> should be Axelkøbstad
- Azelvej -> should be Axelvej

Closes #34
  • Loading branch information
stscoundrel committed Dec 23, 2023
1 parent 3f49352 commit d760a28
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
20 changes: 17 additions & 3 deletions parser/src/parser/entry.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from enum import Enum
from typing import NamedTuple
from typing import Final, NamedTuple

# Headword spellings that OCR is known to have misread, mapped to their
# corrected forms. Keys are the typos as they appear in the OCR output;
# values are the spellings that should be used instead.
KNOWN_HEADWORD_TYPOS_TO_CORRECT_VERSIONS: Final[dict[str, str]] = {
    "Azelkøbstad": "Axelkøbstad",
    "Azelvej": "Axelvej",
}


class EntryStatus(Enum):
Expand Down Expand Up @@ -28,7 +33,15 @@ def _clean_headword(raw_headword: str) -> str:
return raw_headword.replace("\n", "").strip()

@staticmethod
def _clean_headword_presentation(raw_headword: str) -> str:
def _proofread_headword(raw_headword: str) -> str:
    """Return the corrected spelling for a headword with a known OCR typo.

    OCR output is decent but not 100% correct, so some typos are
    inevitable. Definitions no doubt contain typos too, but patching
    headwords matters more because that is how words are searched. New
    typos are added to the mapping as they are come across.

    A headword that is not in the mapping is returned unchanged.
    """
    if raw_headword in KNOWN_HEADWORD_TYPOS_TO_CORRECT_VERSIONS:
        return KNOWN_HEADWORD_TYPOS_TO_CORRECT_VERSIONS[raw_headword]
    return raw_headword

@classmethod
def _clean_headword_presentation(cls, raw_headword: str) -> str:
formatted_headword = raw_headword

# Drop ending commas when present.
Expand All @@ -38,7 +51,8 @@ def _clean_headword_presentation(raw_headword: str) -> str:
# forms of capitalization. It is essentially an OCR error.
formatted_headword = formatted_headword[0:-1].capitalize()

return formatted_headword
# Fix known typos.
return cls._proofread_headword(formatted_headword)

@classmethod
def from_raw_entry(cls, raw_entry: str) -> "Entry":
Expand Down
4 changes: 2 additions & 2 deletions parser/tests/test_page_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ def test_splits_page_correctly() -> None:

expected_a_headwords = [
"hvilken", # Partial entry.
"Azelkøbstad",
"Axelkøbstad",
"Axelskav",
"Axeltorg",
"Axelseng",
"Axeltand",
"Azelvej",
"Axelvej",
"Axel",
"Axelmærke",
"Axeniere",
Expand Down

0 comments on commit d760a28

Please sign in to comment.