Skip to content

Commit

Permalink
Merge pull request #10 from stscoundrel/feature/improved-column-detec…
Browse files Browse the repository at this point in the history
…tion

Columns: detect specific space groups as dividers
  • Loading branch information
stscoundrel authored Nov 19, 2023
2 parents d4eda3a + 9bf377b commit d13351d
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
30 changes: 29 additions & 1 deletion parser/src/parser/columns.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,42 @@
from typing import Final

# Column divider character, as it appears when OCR reads it correctly.
_divider: Final[str] = "|"
# Fallback divider used when "|" is absent from a line: a group of four
# spaces, which commonly sits where the divider should have been.
_spaces_divider: Final[str] = "    "


def _get_column_divider(line: str) -> str:
    """Choose the divider string used to split ``line`` into columns.

    Returns ``_divider`` when it occurs in ``line`` (the simple case,
    where OCR read the divider), otherwise ``_spaces_divider``.
    """
    # A missing divider was probably misread by OCR. That generally
    # leaves a grouping of spaces where the divider should be; four
    # spaces seems a common enough case to act as the divider.
    return _divider if _divider in line else _spaces_divider


def _get_divided_lines(line: str) -> list[str]:
    """Split ``line`` into column parts on its detected divider.

    The divider is chosen by ``_get_column_divider``. When it is the
    spaces divider, the whitespace removed by the split is put back in
    front of every part except the first; parts split on the regular
    divider are returned unchanged.
    """
    divider = _get_column_divider(line)
    parts = line.split(divider)

    # With the spaces divider we want to preserve the whitespace that
    # the split consumed, so prepend it to all but the first part.
    if divider == _spaces_divider:
        parts[1:] = [f"{_spaces_divider}{part}" for part in parts[1:]]

    return parts


def parse_column(page: list[str]) -> list[str]:
left_column = []
right_column = []

for line in page[1:]: # First line is meta info.
divided = line.split(_divider)
divided = _get_divided_lines(line)

match len(divided):
case 1:
Expand Down
4 changes: 4 additions & 0 deletions parser/tests/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,20 @@ def test_column_combining() -> None:
"Lorem ipsum | Country roads",
"dolor sit amet | take me home",
"dolor sit igitur | to the place I belong",
# Missing divider, to be recognized with group of spaces.
"werden. Tavsen. 67; smlgn. Sch. u. Ablat se oblat.",
]

expected = [
"Meta line about the page",
"Lorem ipsum ",
"dolor sit amet ",
"dolor sit igitur ",
"werden. Tavsen. 67; smlgn. Sch. u.",
" Country roads",
" take me home",
" to the place I belong",
" Ablat se oblat.",
]

result = columns.parse_column(input)
Expand Down

0 comments on commit d13351d

Please sign in to comment.