Skip to content

Commit

Permalink
ci: update pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
juzgadores committed Dec 18, 2024
1 parent 06a3ebc commit 831e929
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 10 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ name: Release Juzgadores 2024-2025 data
on:
release:
types: [created]
workflow_dispatch:

jobs:
build:
Expand All @@ -24,7 +25,7 @@ jobs:
- name: Generate files
run: |
python -m dof_utils.extract_juzgadores_202 data/15122024-MAT.pdf ./output
python -m dof_utils.extract_juzgadores data/15122024-MAT.pdf ./output
- name: Upload Release Assets
env:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ Utilities for extracting data from DOF (Diario Oficial de la Federación) public
Public data downloaded from: <https://www.dof.gob.mx/2024/CJF/listado_peritos_PJF_2025.pdf>

```bash
python -m dof_utils.extract_juzgadores_202 data/15122024-MAT.pdf ./output
python -m dof_utils.extract_juzgadores data/15122024-MAT.pdf ./output
```
File renamed without changes.
27 changes: 20 additions & 7 deletions dof_utils/pdf_table_extractor.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, TypeAlias
from typing import Any, Callable, Protocol, TypeAlias, TypedDict

import pandas as pd
import pdfplumber
from pdfplumber.page import Page
from typing_extensions import Protocol


TableRow: TypeAlias = list[str]
TableRow: TypeAlias = list[str | None]
TableData: TypeAlias = list[TableRow]
ExtractedTable: TypeAlias = tuple[int, str, TableData]
BBox: TypeAlias = tuple[float, float, float, float]


class TableState(TypedDict):
    """Typed shape of the state dict threaded through table extraction.

    ``extract_tables`` creates one with empty/zero values, and
    ``process_table_data`` receives it as ``current_state`` and updates it.
    """

    # Extracted tables gathered so far — presumably what extract_tables
    # ultimately returns; confirm against the full method body.
    tables: list[ExtractedTable]
    # Rows of the table currently being built (assumed from the name; the
    # merge logic is not visible in this view).
    table: TableData
    # Heading text associated with the current table (assumed from the name).
    heading: str
    # Integer table identifier; initialised to 0 by extract_tables.
    table_id: int


class TableValidator(Protocol):
"""Protocol for table validation"""

Expand Down Expand Up @@ -47,11 +52,14 @@ def __init__(
self.post_process = post_process or (lambda x: x)

def process_table_data(
self, table_data: TableData, heading: str, current_state: dict[str, any]
self, table_data: TableData, heading: str, current_state: TableState
) -> None:
"""Process extracted table data and update current state."""
# Replace newlines with spaces in all table data cells
table_data = [[cell.replace("\n", " ") for cell in row] for row in table_data]
table_data = [
[cell.replace("\n", " ") if cell is not None else "" for cell in row]
for row in table_data
]

if not table_data or not self.validator.is_valid_table(table_data):
return
Expand All @@ -77,7 +85,12 @@ def process_table_data(

def extract_tables(self, pdf_path: Path) -> list[ExtractedTable]:
"""Extract and merge tables from PDF with their headings."""
state = {"tables": [], "table": [], "heading": "", "table_id": 0}
state: TableState = {
"tables": [],
"table": [],
"heading": "",
"table_id": 0,
}

with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ include = '\.pyi?$'
[tool.isort]
profile = "black"
multi_line_output = 3
line-length = 88

[tool.mypy]
python_version = "3.13"
Expand Down

0 comments on commit 831e929

Please sign in to comment.