Entries: add proofreader for known headword typos

The typos are caused by OCR. Patch them manually using a dictionary that maps known typos to their correct forms. For example:
- Azelkøbstad -> should be Axelkøbstad
- Azelvej -> should be Axelvej

Closes #34
stscoundrel · Dec 23, 2023 · d760a28 · d760a28
1 parent 3f49352
commit d760a28
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 5 deletions.
diff --git a/parser/src/parser/entry.py b/parser/src/parser/entry.py
@@ -1,5 +1,10 @@
 from enum import Enum
-from typing import NamedTuple
+from typing import Final, NamedTuple
+
+KNOWN_HEADWORD_TYPOS_TO_CORRECT_VERSIONS: Final[dict[str, str]] = {
+    "Azelkøbstad": "Axelkøbstad",
+    "Azelvej": "Axelvej",
+}
 
 
 class EntryStatus(Enum):
@@ -28,7 +33,15 @@ def _clean_headword(raw_headword: str) -> str:
         return raw_headword.replace("\n", "").strip()
 
     @staticmethod
-    def _clean_headword_presentation(raw_headword: str) -> str:
+    def _proofread_headword(raw_headword: str) -> str:
+        # OCR gives decent, but not 100% correct, results, so typos are inevitable.
+        # While there are no doubt typos in definitions too, it is more important to
+        # patch them in headwords, as that is how words are searched. Add them to the
+        # mapping as we come across them.
+        return KNOWN_HEADWORD_TYPOS_TO_CORRECT_VERSIONS.get(raw_headword, raw_headword)
+
+    @classmethod
+    def _clean_headword_presentation(cls, raw_headword: str) -> str:
         formatted_headword = raw_headword
 
         # Drop ending commas when present.
@@ -38,7 +51,8 @@ def _clean_headword_presentation(raw_headword: str) -> str:
             # forms of capitalization. It is essentially error in OCR.
             formatted_headword = formatted_headword[0:-1].capitalize()
 
-        return formatted_headword
+        # Fix known typos.
+        return cls._proofread_headword(formatted_headword)
 
     @classmethod
     def from_raw_entry(cls, raw_entry: str) -> "Entry":

diff --git a/parser/tests/test_page_splitter.py b/parser/tests/test_page_splitter.py
@@ -29,12 +29,12 @@ def test_splits_page_correctly() -> None:
 
     expected_a_headwords = [
         "hvilken",  # Partial entry.
-        "Azelkøbstad",
+        "Axelkøbstad",
         "Axelskav",
         "Axeltorg",
         "Axelseng",
         "Axeltand",
-        "Azelvej",
+        "Axelvej",
         "Axel",
         "Axelmærke",
         "Axeniere",