Parse FOIA
Here is the script that parses the scans.
import sys
import os
# Add the parent directory of etl to the Python path so that the
# `table_parsing` package imported below can be resolved when this script is
# run directly. The path is computed relative to this file, not the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if project_root not in sys.path:
    # Guard against adding the same entry twice on repeated runs/imports.
    sys.path.append(project_root)
from table_parsing.utils import set_paths
from table_parsing.ocr import ocr_text
from table_parsing.screenshots import make_screenshots
from table_parsing.image import make_result
# Work list of scans to process. Each entry is a dict with two keys:
#   "photo_pages": path to the input folder for one scanned document
#                  (presumably one image per page -- confirm against set_paths)
#   "result":      path of the .jsonl file the parsed output is written to
# All entries are currently commented out, so the list is empty and nothing is
# processed; uncomment (and adjust the absolute paths) to run a document.
pdf_ls = [
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_8956/8956_1", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/8956_1_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9430/2022-APHIS-04193-F_9430_2", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9430_2_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_1" , "result": "/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_1_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_2" , "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_2_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_3", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_3_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_4", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_4_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_8648/8648_dedupe", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/8648_dedupe_result.jsonl"},
]
# Run the full parsing pipeline once per entry in pdf_ls:
# set up working folders -> OCR the pages -> cut per-row screenshots ->
# OCR the screenshots -> write the result file.
for pdf_item in pdf_ls:
    pdf_photos_path = pdf_item["photo_pages"]
    output_result = pdf_item["result"]
    print(f"Starting process for {output_result} ")

    # Create folders
    # set_paths returns five paths; screenshots_folder_path is not used
    # below but must be unpacked to keep the tuple arity matching.
    (
        json_folder_path,
        screenshots_folder_path,
        screenshot_annotated_folder_path,
        screenshot_jpg_path,
        screenshot_json_path,
    ) = set_paths(pdf_photos_path)

    # OCR the pages
    ocr_text(pdf_photos_path, json_folder_path)

    # Screenshot each row
    make_screenshots(pdf_photos_path, json_folder_path, screenshot_jpg_path)

    # OCR each screenshot
    ocr_text(screenshot_jpg_path, screenshot_json_path)

    # Output file
    make_result(
        screenshot_jpg_path,
        screenshot_json_path,
        output_result,
        screenshot_annotated_folder_path,
    )
    print(f"Finished{output_result} ")
    print("-------------------------")
Here is a presentation that explains the code at a high level.