Skip to content

Commit

Permalink
Add error checking/handling based on the number of crashes and PDFs
Browse files Browse the repository at this point in the history
  • Loading branch information
johnclary committed Aug 21, 2024
1 parent a59d048 commit 998900c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
29 changes: 25 additions & 4 deletions atd-etl/cris_import/cris_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ def main(cli_args):

logger.info(f"{len(extracts_todo)} extract(s) to process")

if not extracts_todo:
return
if cli_args.s3_download and not extracts_todo:
# always short circuit if we find nothing in S3
raise Exception("No extracts found in S3 bucket")

for extract in extracts_todo:
records_processed = {
Expand All @@ -49,14 +50,34 @@ def main(cli_args):
if cli_args.csv:
csv_records_processed_dict = process_csvs(extract_dir)
records_processed.update(csv_records_processed_dict)
if cli_args.pdf:

no_crashes_found = (
True if cli_args.csv and records_processed["crashes"] == 0 else False
)

if cli_args.pdf and not no_crashes_found:
pdfs_processed_count = process_pdfs(
extract_dir, cli_args.s3_upload, cli_args.workers
)
records_processed["pdfs"] = pdfs_processed_count
elif cli_args.pdf and no_crashes_found:
# we skip PDF processing when there are no CSV crashes, because in those cases
# the extract may not contain a `crashReports` directory — and that will
# cause an unwanted failure
logger.info("Skipping PDF processing because no CSV crashes were processed")

# if processing CSVs and PDFs, make sure the number of crashes matches the number of PDFs
if cli_args.pdf and cli_args.csv:
if records_processed["crashes"] != records_processed["pdfs"]:
raise Exception(
"Mismatch between # of crashes processed vs PDFs. This should never happen!"
)

if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip:
archive_extract_zip(extract["s3_file_key"])
set_log_entry_complete(log_entry_id=log_entry_id, records_processed=records_processed)
set_log_entry_complete(
log_entry_id=log_entry_id, records_processed=records_processed
)


if __name__ == "__main__":
Expand Down
4 changes: 1 addition & 3 deletions atd-etl/cris_import/utils/process_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,8 @@ def process_pdfs(extract_dir, s3_upload, max_workers):
for filename in os.listdir(os.path.join(extract_dir, "crashReports"))
if filename.endswith(".pdf")
]
pdf_count = len(pdfs)

if not pdf_count:
raise IOError("No PDFs found in extract")
pdf_count = len(pdfs)

logger.info(f"Found {pdf_count} PDFs to process")

Expand Down

0 comments on commit 998900c

Please sign in to comment.