Parse FOIA

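Here is the script that parses the scanned FOIA pages: it OCRs each page, takes a screenshot of each table row, OCRs each screenshot, and writes the parsed results to an output file.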

import sys
import os

# Make the directory one level above this script's folder importable
# (presumably where the `table_parsing` package lives -- confirm), without
# adding a duplicate entry if it is already on the path.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from table_parsing.utils import set_paths
from table_parsing.ocr import ocr_text
from table_parsing.screenshots import make_screenshots
from table_parsing.image import make_result



# Jobs to process, one dict per scanned document:
#   "photo_pages": input path; read by the loop below and passed to set_paths,
#                  ocr_text and make_screenshots.
#   "result":      output path; passed to make_result and echoed in the
#                  progress messages.
# Uncomment an entry to enable it. Every entry is currently commented out, so
# the list is empty and the processing loop does nothing.
# NOTE(review): the paths are absolute and specific to one machine -- adjust
# them before enabling an entry elsewhere.
pdf_ls = [
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_8956/8956_1", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/8956_1_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9430/2022-APHIS-04193-F_9430_2", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9430_2_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_1" , "result": "/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_1_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_2" , "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_2_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_3", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_3_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_9557/9557_4", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/9557_4_result.jsonl"},
    # {"photo_pages": "/Users/nick/Documents/projects/GitHub/usda/data/source/montana-foias/2022-APHIS-04193-F_8648/8648_dedupe", "result":"/Users/nick/Documents/projects/GitHub/usda/data/processed/montana-foia-parsed/8648_dedupe_result.jsonl"},


]

# Run the parsing pipeline once per entry in pdf_ls. Each entry is a dict with
# a "photo_pages" input path and a "result" output path. The steps run in
# order because each one consumes paths produced or filled by the previous one.
for pdf_item in pdf_ls:
    pdf_photos_path = pdf_item["photo_pages"]
    output_result = pdf_item["result"]
    print(f"Starting process for {output_result} ")

    # Create folders: set_paths returns the five working paths unpacked here.
    # screenshots_folder_path is not used by the remaining steps in this loop.
    (
        json_folder_path,
        screenshots_folder_path,
        screenshot_annotated_folder_path,
        screenshot_jpg_path,
        screenshot_json_path,
    ) = set_paths(pdf_photos_path)

    # OCR the pages
    ocr_text(pdf_photos_path, json_folder_path)

    # Screenshot each row
    make_screenshots(pdf_photos_path, json_folder_path, screenshot_jpg_path)

    # OCR each screenshot
    ocr_text(screenshot_jpg_path, screenshot_json_path)

    # Output file
    make_result(
        screenshot_jpg_path,
        screenshot_json_path,
        output_result,
        screenshot_annotated_folder_path,
    )

    # Keep a space between the label and the path so the message stays
    # readable (otherwise it prints as "Finished/path/to/result").
    print(f"Finished {output_result} ")
    print("-------------------------")







Here is a presentation that explains the code at a high level.