Parsing Thousands of Scanned Pages

Nick McMillan

The problem

Scanned pdfs

Scanned pdfs

Just an image
Can’t be analyzed for patterns (not in a structured format like Excel)

Potential Solutions

Google Pinpoint

PROS
- Free
- Can handle regular patterns
CONS
- Takes a long time to parse large PDFs
- Can’t handle complex, custom patterns
- The “deep linking” function made fact-checking hard

The complex, custom patterns

Redaction boxes
New columns in certain work orders
One work order overlapping two pages

The solution

Criteria

Run locally
- Easily reproducible by anyone with my code
- Free
Easy to fact check each work order

Roadmap

flowchart TD
  A(Separate each work order into its own screenshot) --> B
  B(Identify the cells of the scanned image) --> C
  C(OCR text and get positional data) --> D
  D(Do math to assign text to cell) --> E
  E(Use location to transform to csv)

Screenshots



import cv2
import json
import os
import glob

from table_parsing.image import convert_bounding_box

def make_screenshots(image_dir, json_dir, screenshot_output_path):
    """Cut scanned page images into one screenshot per work order.

    Page images (``*jpg`` in ``image_dir``) are paired by sorted position with
    OCR results (``*json`` in ``json_dir``). Every OCR observation whose text
    contains "for:" together with "WorkTask" or "Work Task" is treated as the
    header of a work order, and the page is cut 30 px above each header.
    The part of a page below its last header is held back and stitched onto
    the top of the next page that contains a header, so a work order that
    spans two pages ends up in a single screenshot.

    Screenshots are written to ``screenshot_output_path`` as ``01.jpg``,
    ``02.jpg``, ... in the order the work orders are encountered.

    Args:
        image_dir: Directory containing the page images.
        json_dir: Directory containing one OCR JSON file per page image.
        screenshot_output_path: Existing directory that receives the crops.
    """
    image_paths = sorted(glob.glob(f"{image_dir}/*jpg"))
    json_paths = sorted(glob.glob(f"{json_dir}/*json"))

    # Bottom part of a work order that continues onto the next page.
    first_half = None
    counter = 0
    for page_index, image_path in enumerate(image_paths):
        json_path = json_paths[page_index]

        image = cv2.imread(image_path)

        with open(json_path, 'r') as f:
            data = json.load(f)

        # y position (in pixels) of the cut above every work-order header.
        work_task_lines = []
        for box in data['observations']:
            check_text = box["observation"]["text"]
            if "for:" in check_text and ("WorkTask" in check_text or "Work Task" in check_text):
                _, y1, _, _ = convert_bounding_box(image, box["observation"]["bounds"])
                # Leave a 30 px margin above the header, clamped to the page top.
                work_task_lines.append(max(y1 - 30, 0))

        work_task_lines.sort()

        # The cut lines are drawn on the page itself, so they are also
        # visible in the screenshots cropped from it below.
        for line in work_task_lines:
            cv2.line(image, (0, line), (image.shape[1], line), (0, 255, 0), 2)

        # NOTE(review): a page without any header is skipped entirely while
        # first_half stays pending - confirm that a work order never spans
        # three pages.
        for i, start_y in enumerate(work_task_lines):
            if first_half is not None:
                # Finish the work order carried over from the previous page
                # with everything above the first header of this page.
                counter += 1
                print(str(counter) + "--" + str(len(work_task_lines)))
                second_half = image[:start_y, :]
                combined_image = cv2.vconcat([first_half, second_half])

                output_path = os.path.join(screenshot_output_path, f"{counter:02}.jpg")
                cv2.imwrite(output_path, combined_image)

                first_half = None

            if i + 1 >= len(work_task_lines):
                # Last header on the page: the order may continue on the next
                # page, so hold it back instead of writing it now.
                first_half = image[start_y:, :]
            else:
                end_y = work_task_lines[i + 1]
                counter += 1
                output_path = os.path.join(screenshot_output_path, f"{counter:02}.jpg")
                cv2.imwrite(output_path, image[start_y:end_y, :])

    # Write the last pending work order, if any. Without this guard
    # cv2.imwrite would be called with None when no header was found at all.
    if first_half is not None:
        counter += 1
        output_path = os.path.join(screenshot_output_path, f"{counter:02}.jpg")
        cv2.imwrite(output_path, first_half)

Screenshots

Identify cells



import cv2
import json
import numpy as np
from sklearn.cluster import DBSCAN
import glob
from tqdm import tqdm
import os
import re

from table_parsing.utils import stringify_keys

# Handle input data
# ==================================================
def process_image(image_path):
    """Load an image and derive a binary mask from it.

    The image is converted to grayscale, inverted, and passed through a
    mean adaptive threshold (block size 15, constant -10).

    Returns:
        A tuple ``(image, binary)``: the image as loaded by OpenCV and the
        thresholded mask.
    """
    image = cv2.imread(image_path)

    # Invert the grayscale image before thresholding.
    inverted_gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        inverted_gray,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        15,
        -10,
    )
    return image, binary

def load_data(json_path):
    """Parse the JSON file at ``json_path`` and return its contents."""
    with open(json_path, 'r') as handle:
        return json.load(handle)

# Handle bounding boxes
# ============================================
def draw_bounding_box(image, bounds, color=(0, 255, 0), thickness=2):
    """Draw a rectangle for normalized ``bounds`` onto ``image`` in place.

    ``bounds`` holds ``x1``, ``y1``, ``x2``, ``y2`` as fractions of the image
    size. The y values are flipped (``1 - y``) before scaling to pixels.
    """
    img_h, img_w, _ = image.shape

    def to_pixel(x_frac, y_frac):
        # Scale to pixels, flipping the y axis.
        return (int(x_frac * img_w), int((1 - y_frac) * img_h))

    first_corner = to_pixel(bounds['x1'], bounds['y2'])
    opposite_corner = to_pixel(bounds['x2'], bounds['y1'])
    cv2.rectangle(image, first_corner, opposite_corner, color, thickness)

def convert_bounding_box(image, bounds):
    """Convert normalized OCR bounds to pixel coordinates of ``image``.

    x values are scaled by the image width; y values are flipped
    (``1 - y``) and scaled by the image height.

    Args:
        image: Array with a three-element ``shape`` (height, width, channels).
        bounds: Mapping with ``x1``, ``y1``, ``x2``, ``y2`` as fractions.

    Returns:
        ``(x1, y1, x2, y2)`` in integer pixels, where each y is the flipped
        counterpart of the input value with the same name.
        NOTE(review): in the OCR sample y1 > y2, so after flipping (x1, y1)
        is the top-left corner and (x2, y2) the bottom-right - confirm this
        holds for all inputs.
    """
    img_h, img_w, _ = image.shape

    x1_px = int(bounds['x1'] * img_w)
    y2_px = int((1 - bounds['y2']) * img_h)
    x2_px = int(bounds['x2'] * img_w)
    y1_px = int((1 - bounds['y1']) * img_h)

    return x1_px, y1_px, x2_px, y2_px

def draw_all_word_bounding_boxes(image,data):
    """Outline every OCR observation in ``data`` on ``image``.

    One rectangle is drawn per entry of ``data['observations']`` using the
    observation's overall ``bounds`` with color ``(255, 0, 0)``. The image
    is passed to ``draw_bounding_box`` for drawing and then returned.
    """
    for entry in data['observations']:
        phrase_bounds = entry["observation"]['bounds']
        draw_bounding_box(image, phrase_bounds, color=(255, 0, 0))
    return image

        # Draw bounding boxes for each word
        # for subBound in item["observation"]['subBounds']:
        #     draw_bounding_box(image, subBound['bounds'], color=(0, 255, 0))

    # # Display the image
    # cv2.imshow('Image with Bounding Boxes', image)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()

# Cluster points
# ======================================

def find_horizontal_vertical_lines(binary, horizontal_scale_factor, vertical_scale_factor):
    """Extract horizontal and vertical line masks from a binary image.

    Each mask is produced by eroding and then dilating ``binary`` with a
    one-pixel-thick rectangular structuring element: ``cols // horizontal_scale_factor``
    wide for horizontal lines, ``rows // vertical_scale_factor`` tall for
    vertical lines. Larger scale factors give smaller structuring elements.

    Returns:
        A tuple ``(horizontal, vertical)`` of masks.
    """
    def keep_long_runs(kernel_size):
        # Erode then dilate with the same kernel.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        eroded = cv2.erode(binary.copy(), kernel)
        return cv2.dilate(eroded, kernel)

    img_rows, img_cols = binary.shape[0], binary.shape[1]

    horizontal = keep_long_runs((img_cols // horizontal_scale_factor, 1))
    vertical = keep_long_runs((1, img_rows // vertical_scale_factor))

    return horizontal, vertical





# Find touches (where lines touch but don't intersect)
def find_touches(image, horizontal, vertical, dilation_size):
    """Locate the points where horizontal and vertical lines meet or touch.

    Both line masks are dilated with a ``dilation_size`` x ``dilation_size``
    kernel so that lines which merely touch (without crossing) still overlap.
    Every overlapping pixel is a candidate; candidates are grouped with
    DBSCAN (``eps=10``, ``min_samples=1``) and each group is reduced to its
    mean position.

    Args:
        image: Unused; kept so existing callers keep working.
        horizontal: Mask of horizontal lines.
        vertical: Mask of vertical lines.
        dilation_size: Side length of the square dilation kernel.

    Returns:
        A list of integer arrays ``[x, y]``, one per cluster. The list is
        empty when the masks do not overlap anywhere.
    """
    kernel = np.ones((dilation_size, dilation_size), np.uint8)
    horizontal_dilated = cv2.dilate(horizontal, kernel)
    vertical_dilated = cv2.dilate(vertical, kernel)

    # Pixels set in both dilated masks are where lines meet.
    touch_points_img = cv2.bitwise_and(horizontal_dilated, vertical_dilated)

    # argwhere yields (row, col); flip the columns to get (x, y).
    touch_points = np.ascontiguousarray(np.argwhere(touch_points_img == 255)[:, ::-1])

    # DBSCAN cannot be fitted on zero samples; an image without any line
    # crossings simply has no touch points.
    if len(touch_points) == 0:
        return []

    labels = DBSCAN(eps=10, min_samples=1).fit(touch_points).labels_

    # Collapse each cluster of neighboring pixels to a single mean point.
    return [
        np.mean(touch_points[labels == label], axis=0).astype(int)
        for label in np.unique(labels)
    ]



def draw_clustered_touch_points(image, clustered_touch_points):
    """Mark every clustered touch point on ``image`` and return the image.

    Each point is drawn in place as a filled circle of radius 5 with color
    ``(255, 0, 0)``.
    """
    radius = 5
    filled = -1  # negative thickness makes cv2.circle fill the circle
    for center in map(tuple, clustered_touch_points):
        cv2.circle(image, center, radius, (255, 0, 0), filled)

    return image


# Final grid lines
# ========================================

# Identify and draw the outermost vertical and horizontal lines
def make_final_lines(image, clustered_touch_points):
    """Derive the table grid from the touch points and draw it on ``image``.

    Vertical grid lines are the smallest, median and largest x of the touch
    points. Horizontal grid lines are the smallest and largest y plus one
    line per cluster of the remaining y values (touch points within 5 px of
    the top or bottom line are ignored; the rest are grouped with DBSCAN,
    ``eps=10``, and each group is replaced by its mean).

    Args:
        image: Image to draw the grid on (modified in place).
        clustered_touch_points: Sequence of ``[x, y]`` points.

    Returns:
        A tuple ``(image, horizontal_lines, vertical_lines)`` where
        ``horizontal_lines`` is ``[top, *inner rows, bottom]`` (sorted) and
        ``vertical_lines`` is ``[left, middle, right]``. Both lists are
        empty when there are no touch points.
    """
    # Always return the same three-element shape so callers can unpack it.
    if len(clustered_touch_points) == 0:
        return image, [], []

    clustered_touch_points_np = np.array(clustered_touch_points)
    xs = clustered_touch_points_np[:, 0]
    ys = clustered_touch_points_np[:, 1]

    # Outer border of the table plus the median x as the middle divider.
    left_most = np.min(xs)
    right_most = np.max(xs)
    middle = np.median(xs)
    top_most = np.min(ys)
    bottom_most = np.max(ys)

    green = (0, 255, 0)
    img_height, img_width = image.shape[0], image.shape[1]

    # Vertical lines span the full image height.
    for x in (left_most, middle, right_most):
        cv2.line(image, (int(x), 0), (int(x), img_height), green, 2)

    # Top and bottom lines span the full image width.
    for y in (top_most, bottom_most):
        cv2.line(image, (0, int(y)), (img_width, int(y)), green, 2)

    # Unique y values that are more than 5 px away from the top and bottom
    # border; these belong to the inner row separators.
    inner_ys = sorted({
        y for y in ys
        if abs(y - top_most) > 5 and abs(y - bottom_most) > 5
    })

    clustered_y_coords = []
    # A table without inner rows leaves nothing to cluster, and DBSCAN
    # cannot be fitted on zero samples.
    if inner_ys:
        unique_y_coords = np.array(inner_ys).reshape(-1, 1)
        labels = DBSCAN(eps=10, min_samples=1).fit(unique_y_coords).labels_
        # Merge y values that belong to the same row separator.
        clustered_y_coords = sorted(
            np.mean(unique_y_coords[labels == label])
            for label in np.unique(labels)
        )

    # Inner row separators only span the table, not the whole image.
    for y in clustered_y_coords:
        cv2.line(image, (int(left_most), int(y)), (int(right_most), int(y)), green, 2)

    return image, [top_most] + clustered_y_coords + [bottom_most], [left_most, middle, right_most]


# Assigning data 
# ============================================

# Function to calculate the intersection area of two rectangles
def intersection_area(box1, box2):
    """Return the overlap area of two boxes given as ``(x1, y1, x2, y2)``.

    Boxes that do not overlap, or only share an edge, yield 0.
    """
    overlap_width = min(box1[2], box2[2]) - max(box1[0], box2[0])
    overlap_height = min(box1[3], box2[3]) - max(box1[1], box2[1])
    # A negative extent means the boxes are apart along that axis.
    return max(0, overlap_width) * max(0, overlap_height)


def assign_data(image, data, horizontal_lines, vertical_lines):
    """Assign every OCR observation to the grid cell it overlaps most.

    Cells are the rectangles between consecutive ``horizontal_lines`` (rows)
    and consecutive ``vertical_lines`` (columns). Texts that land in the same
    cell are joined with a single space in the order they appear in
    ``data['observations']``. An observation that overlaps no cell at all is
    placed in cell ``(0, 0)``.

    Returns:
        A dict mapping ``(row_index, column_index)`` to the cell's text.
    """
    # Pair up neighboring grid lines once: (top, bottom) and (left, right).
    row_bands = list(zip(horizontal_lines, horizontal_lines[1:]))
    column_bands = list(zip(vertical_lines, vertical_lines[1:]))

    cell_texts = {}
    for item in data['observations']:
        observation = item["observation"]
        text_box = tuple(convert_bounding_box(image, observation['bounds']))
        text = observation['text']

        best_cell = (0, 0)
        best_overlap = 0
        for row_index, (top, bottom) in enumerate(row_bands):
            for column_index, (left, right) in enumerate(column_bands):
                overlap = intersection_area(text_box, (left, top, right, bottom))
                # Strictly greater: on a tie the first cell found wins.
                if overlap > best_overlap:
                    best_overlap = overlap
                    best_cell = (row_index, column_index)

        if best_cell in cell_texts:
            cell_texts[best_cell] += " " + text
        else:
            cell_texts[best_cell] = text

    return cell_texts

def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically.

    The string is split into alternating text and digit runs, and the digit
    runs are converted to ``int`` so that e.g. "2.jpg" sorts before "10.jpg".
    """
    # With a capturing group, re.split puts the matched digit runs at the
    # odd positions. Converting by position rather than str.isdigit() avoids
    # int() failing on characters such as "²" that isdigit() accepts but
    # \d does not match.
    return [
        int(part) if index % 2 else part
        for index, part in enumerate(re.split(r'(\d+)', s))
    ]

def make_result(image_dir, json_dir, jsonl_output_path, annotated_output_path):
    """Run the table-parsing pipeline over every screenshot.

    Images (``*jpg`` in ``image_dir``) and OCR files (``*json`` in
    ``json_dir``) are naturally sorted and paired by position. For each pair
    the grid is detected, the OCR text is assigned to grid cells, the
    annotated image is saved under ``annotated_output_path`` with the
    original file name, and the cell mapping is appended as one JSON line to
    ``jsonl_output_path``.
    """
    image_paths = sorted(glob.glob(f"{image_dir}/*jpg"), key=natural_sort_key)
    json_paths = sorted(glob.glob(f"{json_dir}/*json"), key=natural_sort_key)

    with open(jsonl_output_path, 'w') as jsonl_file:
        for index, image_path in enumerate(image_paths):
            json_path = json_paths[index]
            file_name = os.path.basename(image_path)

            image, binary = process_image(image_path)
            data = load_data(json_path)

            # Detect the table lines and the points where they meet.
            line_masks = find_horizontal_vertical_lines(
                binary, horizontal_scale_factor = 15, vertical_scale_factor = 30
            )
            horizontal_mask, vertical_mask = line_masks
            clustered_touch_points = find_touches(
                image, horizontal_mask, vertical_mask, dilation_size=6
            )
            image = draw_clustered_touch_points(image, clustered_touch_points)

            # Turn the touch points into grid coordinates and fill the cells.
            image_with_lines, horizontal_lines, vertical_lines = make_final_lines(
                image, clustered_touch_points
            )
            result = assign_data(image, data, horizontal_lines, vertical_lines)

            # Save the annotated image and one JSONL record per screenshot.
            annotated_path = os.path.join(annotated_output_path, file_name)
            cv2.imwrite(annotated_path, image_with_lines)
            jsonl_file.write(json.dumps(stringify_keys(result)) + '\n')

Identify cells

Apple’s live text feature

Get positional data of text


{
  "info" : {
    "program" : "textra",
    "version" : "0.2.1"
  },
  "observations" : [
    {
      "observation" : {
        "bounds" : {
          "x2" : 0.16144200584639501,
          "y2" : 0.95008912650623889,
          "x1" : 0.11912225663009407,
          "y1" : 0.98039215680926917
        },
        "text" : "36.",
        "confidence" : 1,
        "subBounds" : [
          {
            "offset" : [
              0,
              3
            ],
            "bounds" : {
              "y1" : 0.9803921565062389,
              "x2" : 0.1614420054231975,
              "x1" : 0.11912225705329153,
              "y2" : 0.95008912680926916
            },
            "text" : "36."
          }
        ]
      }
    },
    {
      "observation" : {
        "confidence" : 0.5,
        "subBounds" : [
          {
            "text" : "Work",
            "offset" : [
              0,
              4
            ],
            "bounds" : {
              "x2" : 0.23667711598746083,
              "y2" : 0.946524064171123,
              "x1" : 0.18652037617554859,
              "y1" : 0.97504456292335118
            }
          },
          {
            "text" : "Task",
            "bounds" : {
              "y2" : 0.946524064171123,
              "y1" : 0.97504456327985745,
              "x1" : 0.23981191222570533,
              "x2" : 0.28056426332288403
            },
            "offset" : [
              5,
              9
            ]
          },
          {
            "bounds" : {
              "x2" : 0.31504702194357365,
              "x1" : 0.28369905956112851,
              "y1" : 0.97504456327985745,
              "y2" : 0.946524064171123
            },
            "text" : "for:",
            "offset" : [
              10,
              14
            ]
          },
          {
            "bounds" : {
              "y2" : 0.946524064171123,
              "y1" : 0.97504456327985745,
              "x1" : 0.31818181818181818,
              "x2" : 0.34952978056426331
            },
            "offset" : [
              15,
              18
            ],
            "text" : "(b)"
          },
          {
            "text" : "(6)",
            "offset" : [
              19,
              22
            ],
            "bounds" : {
              "x1" : 0.35266457680250785,
              "y1" : 0.97504456327985745,
              "y2" : 0.94652406438502679,
              "x2" : 0.38871472949843267
            }
          }
        ],
        "bounds" : {
          "x1" : 0.18652037415360501,
          "y1" : 0.97504456320855615,
          "x2" : 0.38871473152037617,
          "y2" : 0.9465240640998217
        },
        "text" : "Work Task for: (b) (6)"
      }
    },
    {
      "observation" : {
        "bounds" : {
          "x1" : 0.44200626929467085,
          "y2" : 0.95008912675579327,
          "y1" : 0.9696969698930481,
          "x2" : 0.49686520346394986
        },
        "text" : "Direct",
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "Direct",
            "bounds" : {
              "x1" : 0.44200626984326019,
              "y1" : 0.96969696969696972,
              "y2" : 0.95008912695187164,
              "x2" : 0.49686520291536052
            },
            "offset" : [
              0,
              6
            ]
          }
        ]
      }
    },
    {
      "observation" : {
        "bounds" : {
          "y2" : 0.91257735462201994,
          "y1" : 0.9305599004304822,
          "x1" : 0.18810392744810853,
          "x2" : 0.25546974122659416
        },
        "text" : "Control",
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "Control",
            "bounds" : {
              "x1" : 0.18810392586520183,
              "y2" : 0.91257735442674492,
              "y1" : 0.93055990062575722,
              "x2" : 0.25546974280950085
            },
            "offset" : [
              0,
              7
            ]
          }
        ]
      }
    },
    {
      "observation" : {
        "text" : "Edit Sampling Disposition",
        "confidence" : 1,
        "subBounds" : [
          {
            "bounds" : {
              "x1" : 0.29780564742946702,
              "x2" : 0.34247648902821315,
              "y1" : 0.93048128327985735,
              "y2" : 0.90909090909090906
            },
            "offset" : [
              0,
              4
            ],
            "text" : "Edit"
          },
          {
            "offset" : [
              5,
              13
            ],
            "text" : "Sampling",
            "bounds" : {
              "x2" : 0.43652037617554856,
              "x1" : 0.34482758620689657,
              "y2" : 0.90909090909090906,
              "y1" : 0.93048128342245984
            }
          },
          {
            "offset" : [
              14,
              25
            ],
            "bounds" : {
              "x2" : 0.53761755485893414,
              "y2" : 0.90909090937611414,
              "x1" : 0.43887147335423199,
              "y1" : 0.93048128342245984
            },
            "text" : "Disposition"
          }
        ],
        "bounds" : {
          "x2" : 0.53761755725705318,
          "y2" : 0.90909090916221036,
          "x1" : 0.29780564503134793,
          "y1" : 0.93048128349376114
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "bounds" : {
              "y1" : 0.90552584623885923,
              "y2" : 0.88235294117647056,
              "x1" : 0.18808777733542315,
              "x2" : 0.31798588388765869
            },
            "offset" : [
              0,
              14
            ],
            "text" : "Entered\/Edited"
          },
          {
            "text" : "by",
            "bounds" : {
              "x2" : 0.34345610145491118,
              "x1" : 0.32053290564438391,
              "y1" : 0.90552584670231728,
              "y2" : 0.88235294117647056
            },
            "offset" : [
              15,
              17
            ]
          },
          {
            "offset" : [
              18,
              22
            ],
            "bounds" : {
              "x1" : 0.34600312321163645,
              "x2" : 0.38930249307596571,
              "y2" : 0.88235294117647056,
              "y1" : 0.90552584670231728
            },
            "text" : "John"
          },
          {
            "bounds" : {
              "x2" : 0.45768024843260191,
              "y2" : 0.88235294117647056,
              "x1" : 0.39184951483269098,
              "y1" : 0.90552584670231728
            },
            "text" : "Steuber",
            "offset" : [
              23,
              30
            ]
          }
        ],
        "text" : "Entered\/Edited by John Steuber",
        "confidence" : 1,
        "bounds" : {
          "y1" : 0.9055258464705882,
          "x1" : 0.18808777463949844,
          "x2" : 0.45768025112852667,
          "y2" : 0.88235294094474159
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "bounds" : {
              "y2" : 0.90730837834224598,
              "x2" : 0.61128526624869384,
              "y1" : 0.93404634581105173,
              "x1" : 0.56112852727272722
            },
            "text" : "Flag?",
            "offset" : [
              0,
              5
            ]
          }
        ],
        "bounds" : {
          "x2" : 0.6112852666666666,
          "y2" : 0.90730837811942955,
          "y1" : 0.93404634603386805,
          "x1" : 0.56112852685475434
        },
        "confidence" : 1,
        "text" : "Flag?"
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "bounds" : {
          "x1" : 0.18815321844609226,
          "y2" : 0.84093111714321089,
          "x2" : 0.28833580851217094,
          "y1" : 0.86138617326400346
        },
        "subBounds" : [
          {
            "offset" : [
              0,
              4
            ],
            "text" : "Work",
            "bounds" : {
              "y1" : 0.86138617345490087,
              "x1" : 0.18815321967438162,
              "x2" : 0.23759269482473663,
              "y2" : 0.84135996157891957
            }
          },
          {
            "bounds" : {
              "x2" : 0.28833580728388153,
              "y1" : 0.86094905011880474,
              "x1" : 0.23987593017825068,
              "y2" : 0.8409311169523136
            },
            "text" : "Date:",
            "offset" : [
              5,
              10
            ]
          }
        ],
        "text" : "Work Date:"
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              10
            ],
            "bounds" : {
              "y2" : 0.79251246073001735,
              "x1" : 0.18664027946636474,
              "y1" : 0.81533067647150159,
              "x2" : 0.28828134807988426
            },
            "text" : "Agreement:"
          }
        ],
        "text" : "Agreement:",
        "confidence" : 1,
        "bounds" : {
          "y2" : 0.7925124601617406,
          "x1" : 0.1866402807620976,
          "y1" : 0.81533067703977824,
          "x2" : 0.2882813467841514
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              10
            ],
            "bounds" : {
              "x2" : 0.51567398056426339,
              "y2" : 0.84135472370766484,
              "x1" : 0.42476489147335422,
              "y1" : 0.86096256645276292
            },
            "text" : "06\/18\/2019"
          }
        ],
        "confidence" : 1,
        "bounds" : {
          "x1" : 0.42476489056426336,
          "x2" : 0.5156739814733543,
          "y1" : 0.86096256664884141,
          "y2" : 0.84135472351158647
        },
        "text" : "06\/18\/2019"
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "bounds" : {
          "x2" : 0.75391849447492165,
          "y2" : 0.83778966112596553,
          "x1" : 0.55172413710815049,
          "y1" : 0.86096256665181226
        },
        "text" : "(Entry Date: 07\/22\/2019)",
        "subBounds" : [
          {
            "text" : "(Entry",
            "offset" : [
              0,
              6
            ],
            "bounds" : {
              "x2" : 0.60521159482226472,
              "y2" : 0.83778966131907306,
              "x1" : 0.55172413879310345,
              "y1" : 0.86096256645870473
            }
          },
          {
            "offset" : [
              7,
              12
            ],
            "bounds" : {
              "y1" : 0.86096256684491979,
              "x2" : 0.65360500820004452,
              "x1" : 0.60775861657899,
              "y2" : 0.83778966131907306
            },
            "text" : "Date:"
          },
          {
            "text" : "07\/22\/2019)",
            "offset" : [
              13,
              24
            ],
            "bounds" : {
              "x1" : 0.6561520299567698,
              "y1" : 0.86096256684491979,
              "x2" : 0.7539184927899687,
              "y2" : 0.83778966131907306
            }
          }
        ]
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              3
            ],
            "text" : "(b)",
            "bounds" : {
              "y1" : 0.82709447350713017,
              "y2" : 0.77540106951871657,
              "x1" : 0.39655172568573666,
              "x2" : 0.4476880877742947
            }
          },
          {
            "text" : "(6),",
            "offset" : [
              4,
              8
            ],
            "bounds" : {
              "x2" : 0.52155172413793105,
              "y2" : 0.77540106951871657,
              "x1" : 0.45336990595611287,
              "y1" : 0.82709447415329773
            }
          },
          {
            "text" : "(b)",
            "offset" : [
              9,
              12
            ],
            "bounds" : {
              "x2" : 0.58405172413793105,
              "y1" : 0.82709447415329773,
              "x1" : 0.52723354231974917,
              "y2" : 0.77540106951871657
            }
          },
          {
            "offset" : [
              13,
              16
            ],
            "bounds" : {
              "x2" : 0.64655172413793105,
              "y1" : 0.82709447415329773,
              "y2" : 0.77540106951871657,
              "x1" : 0.58973354231974917
            },
            "text" : "(3)"
          },
          {
            "text" : "(A)",
            "offset" : [
              17,
              20
            ],
            "bounds" : {
              "y2" : 0.77540106951871657,
              "x2" : 0.71003134559169279,
              "x1" : 0.65223354231974917,
              "y1" : 0.82709447415329773
            }
          }
        ],
        "confidence" : 0.5,
        "text" : "(b) (6), (b) (3) (A)",
        "bounds" : {
          "x2" : 0.71003134755094044,
          "x1" : 0.396551723726489,
          "y2" : 0.77540106919563279,
          "y1" : 0.82709447383021395
        }
      }
    },
    {
      "observation" : {
        "bounds" : {
          "y2" : 0.74674913099502405,
          "x1" : 0.18811629417125428,
          "y1" : 0.77018491574121795,
          "x2" : 0.27113135340022909
        },
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "Property:",
            "bounds" : {
              "x1" : 0.18811629376148015,
              "y2" : 0.74674913140219401,
              "y1" : 0.77018491533404809,
              "x2" : 0.27113135381000325
            },
            "offset" : [
              0,
              9
            ]
          }
        ],
        "text" : "Property:"
      }
    },
    {
      "observation" : {
        "bounds" : {
          "y2" : 0.70231729026292333,
          "y1" : 0.72549019578877005,
          "x2" : 0.38401253832288407,
          "x1" : 0.18652037531347965
        },
        "confidence" : 1,
        "subBounds" : [
          {
            "offset" : [
              0,
              9
            ],
            "text" : "Activity:",
            "bounds" : {
              "x1" : 0.18652037778213162,
              "x2" : 0.26293102887730613,
              "y2" : 0.70231729055258474,
              "y1" : 0.72549019549910865
            }
          },
          {
            "text" : "232167123126",
            "offset" : [
              10,
              22
            ],
            "bounds" : {
              "x1" : 0.2654780506340314,
              "y1" : 0.72549019607843135,
              "x2" : 0.38401253585423206,
              "y2" : 0.70231729055258474
            }
          }
        ],
        "text" : "Activity: 232167123126"
      }
    },
    {
      "observation" : {
        "text" : "Activity",
        "bounds" : {
          "x1" : 0.18808777498432602,
          "x2" : 0.25705329222570533,
          "y1" : 0.67914438516934039,
          "y2" : 0.65775401083778973
        },
        "confidence" : 1,
        "subBounds" : [
          {
            "bounds" : {
              "y2" : 0.6577540110516934,
              "y1" : 0.67914438495543672,
              "x1" : 0.18808777567398116,
              "x2" : 0.25705329153605017
            },
            "text" : "Activity",
            "offset" : [
              0,
              8
            ]
          }
        ]
      }
    },
    {
      "observation" : {
        "text" : "Measurements:",
        "subBounds" : [
          {
            "text" : "Measurements:",
            "bounds" : {
              "x2" : 0.31818181603970747,
              "y2" : 0.63458110549613789,
              "x1" : 0.18652037622779519,
              "y1" : 0.65418894830659535
            },
            "offset" : [
              0,
              13
            ]
          }
        ],
        "confidence" : 1,
        "bounds" : {
          "x1" : 0.18652037513061656,
          "x2" : 0.31818181713688615,
          "y1" : 0.65418894846999409,
          "y2" : 0.63458110533273915
        }
      }
    },
    {
      "observation" : {
        "text" : "Conflict",
        "bounds" : {
          "x1" : 0.18804212707963844,
          "x2" : 0.26023373619021656,
          "y1" : 0.59872947198931992,
          "y2" : 0.57774111643092119
        },
        "subBounds" : [
          {
            "text" : "Conflict",
            "offset" : [
              0,
              8
            ],
            "bounds" : {
              "x1" : 0.18804212830892908,
              "y1" : 0.59872947233735208,
              "x2" : 0.26023373496092594,
              "y2" : 0.57774111608288903
            }
          }
        ],
        "confidence" : 1
      }
    },
    {
      "observation" : {
        "text" : "& Loss:",
        "confidence" : 1,
        "bounds" : {
          "y1" : 0.57230287497679577,
          "x1" : 0.18811501012243048,
          "x2" : 0.25545865919438998,
          "y2" : 0.55069177603962349
        },
        "subBounds" : [
          {
            "text" : "&",
            "offset" : [
              0,
              1
            ],
            "bounds" : {
              "x1" : 0.18811500922711974,
              "y1" : 0.57230287532172841,
              "y2" : 0.55085867016349144,
              "x2" : 0.20451843678489462
            }
          },
          {
            "offset" : [
              2,
              7
            ],
            "text" : "Loss:",
            "bounds" : {
              "y1" : 0.57224125242535107,
              "x1" : 0.20692370952325456,
              "x2" : 0.25545866008970075,
              "y2" : 0.55069177654466794
            }
          }
        ]
      }
    },
    {
      "observation" : {
        "text" : "Components",
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "Components",
            "offset" : [
              0,
              10
            ],
            "bounds" : {
              "y2" : 0.43950755706575395,
              "x1" : 0.18821297698626274,
              "y1" : 0.46245322762887275,
              "x2" : 0.29454564203566597
            }
          }
        ],
        "bounds" : {
          "x2" : 0.29454564217437784,
          "x1" : 0.18821297684755084,
          "y1" : 0.46245322782008669,
          "y2" : 0.43950755687454002
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              1
            ],
            "text" : "&",
            "bounds" : {
              "y2" : 0.41354723707664887,
              "x1" : 0.18808777622779516,
              "y1" : 0.43672014221628053,
              "x2" : 0.2059169265917476
            }
          },
          {
            "offset" : [
              2,
              15
            ],
            "bounds" : {
              "y1" : 0.43672014260249559,
              "x2" : 0.33385579887669803,
              "y2" : 0.41354723707664887,
              "x1" : 0.20846394834847287
            },
            "text" : "Take\/Samples:"
          }
        ],
        "confidence" : 1,
        "text" : "& Take\/Samples:",
        "bounds" : {
          "x1" : 0.18808777501306162,
          "y1" : 0.43672014240938806,
          "y2" : 0.41354723688354134,
          "x2" : 0.33385580009143151
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              8
            ],
            "text" : "Remarks:",
            "bounds" : {
              "y2" : 0.30783199789780724,
              "x2" : 0.27106815845026078,
              "y1" : 0.32674910715079897,
              "x1" : 0.18661209063933379
            }
          }
        ],
        "text" : "Remarks:",
        "bounds" : {
          "x2" : 0.27106815660011696,
          "x1" : 0.1866120924894776,
          "y1" : 0.3267491071440769,
          "y2" : 0.30783199790452931
        },
        "confidence" : 1
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "bounds" : {
          "y2" : 0.21595561891730042,
          "x1" : 0.18802405371508266,
          "y1" : 0.23680730495637337,
          "x2" : 0.25711701054505059
        },
        "text" : "Project:",
        "subBounds" : [
          {
            "text" : "Project:",
            "offset" : [
              0,
              8
            ],
            "bounds" : {
              "y1" : 0.23680730463596333,
              "x2" : 0.25711700959380346,
              "y2" : 0.21595561923771056,
              "x1" : 0.18802405466632979
            }
          }
        ]
      }
    },
    {
      "observation" : {
        "confidence" : 0.5,
        "subBounds" : [
          {
            "bounds" : {
              "x1" : 0.42476489028213166,
              "y1" : 0.72727272704354462,
              "x2" : 0.48334639068681245,
              "y2" : 0.70409982174688057
            },
            "offset" : [
              0,
              5
            ],
            "text" : "FIELD"
          },
          {
            "text" : "WRK",
            "offset" : [
              6,
              9
            ],
            "bounds" : {
              "x1" : 0.48589341244353768,
              "x2" : 0.5317398040645922,
              "y1" : 0.72727272727272729,
              "y2" : 0.70409982174688057
            }
          },
          {
            "text" : "(PERFORMED)",
            "bounds" : {
              "x1" : 0.53428682582131748,
              "x2" : 0.67084639146887592,
              "y2" : 0.70409982184873954,
              "y1" : 0.72727272727272729
            },
            "offset" : [
              10,
              21
            ]
          }
        ],
        "text" : "FIELD WRK (PERFORMED)",
        "bounds" : {
          "x2" : 0.670846393226601,
          "y1" : 0.72727272720906555,
          "y2" : 0.70409982168321883,
          "x1" : 0.42476488852440664
        }
      }
    },
    {
      "observation" : {
        "text" : "6 HOURS",
        "confidence" : 1,
        "bounds" : {
          "y2" : 0.64884135481283423,
          "x1" : 0.42476489075235113,
          "y1" : 0.66844919795008917,
          "x2" : 0.50940438918495301
        },
        "subBounds" : [
          {
            "offset" : [
              0,
              1
            ],
            "bounds" : {
              "x1" : 0.4247648912225705,
              "y1" : 0.66844919784115664,
              "x2" : 0.43769592476489028,
              "y2" : 0.64884135472370774
            },
            "text" : "6"
          },
          {
            "bounds" : {
              "y1" : 0.66844919786096257,
              "x2" : 0.50940438871473359,
              "x1" : 0.43985109717868337,
              "y2" : 0.64884135492176664
            },
            "text" : "HOURS",
            "offset" : [
              2,
              7
            ]
          }
        ]
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "subBounds" : [
          {
            "offset" : [
              0,
              6
            ],
            "text" : "BEARS.",
            "bounds" : {
              "x1" : 0.42633228840125392,
              "y1" : 0.60962566822001529,
              "y2" : 0.58823529411764708,
              "x2" : 0.4945141065830721
            }
          },
          {
            "offset" : [
              7,
              12
            ],
            "bounds" : {
              "y1" : 0.60962566844919786,
              "x2" : 0.5650470219435737,
              "y2" : 0.58823529411764708,
              "x1" : 0.49686520376175547
            },
            "text" : "BLACK"
          },
          {
            "bounds" : {
              "y1" : 0.60962566844919786,
              "x1" : 0.56739811912225702,
              "x2" : 0.63087774294670851,
              "y2" : 0.58823529411764708
            },
            "text" : "damage",
            "offset" : [
              13,
              19
            ]
          },
          {
            "text" : "threat",
            "offset" : [
              20,
              26
            ],
            "bounds" : {
              "y1" : 0.60962566844919786,
              "x2" : 0.68260188087774298,
              "y2" : 0.58823529411764708,
              "x1" : 0.63322884012539182
            }
          },
          {
            "offset" : [
              27,
              29
            ],
            "bounds" : {
              "y2" : 0.58823529411764708,
              "y1" : 0.60962566844919786,
              "x2" : 0.70376175548589337,
              "x1" : 0.6849529780564263
            },
            "text" : "of"
          },
          {
            "text" : "EGGS",
            "offset" : [
              30,
              34
            ],
            "bounds" : {
              "x2" : 0.76175548110165703,
              "y2" : 0.58823529419404119,
              "y1" : 0.60962566844919786,
              "x1" : 0.7061128526645768
            }
          }
        ],
        "bounds" : {
          "y2" : 0.58823529404125285,
          "x2" : 0.76175548349753708,
          "x1" : 0.42633228600537404,
          "y1" : 0.60962566837280363
        },
        "text" : "BEARS. BLACK damage threat of EGGS"
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "BEARS,",
            "bounds" : {
              "x2" : 0.4945141065830721,
              "x1" : 0.42633229334975364,
              "y2" : 0.56149732620320858,
              "y1" : 0.58288770040743576
            },
            "offset" : [
              0,
              6
            ]
          },
          {
            "offset" : [
              7,
              12
            ],
            "bounds" : {
              "y1" : 0.58288770053475936,
              "x1" : 0.49686520376175547,
              "x2" : 0.5650470219435737,
              "y2" : 0.56149732620320858
            },
            "text" : "BLACK"
          },
          {
            "text" : "damage",
            "offset" : [
              13,
              19
            ],
            "bounds" : {
              "x2" : 0.63087774294670851,
              "x1" : 0.56739811912225702,
              "y1" : 0.58288770053475936,
              "y2" : 0.56149732620320858
            }
          },
          {
            "offset" : [
              20,
              26
            ],
            "text" : "threat",
            "bounds" : {
              "y2" : 0.56149732620320858,
              "x1" : 0.63322884012539182,
              "y1" : 0.58288770053475936,
              "x2" : 0.68260188087774298
            }
          },
          {
            "text" : "of",
            "bounds" : {
              "x2" : 0.70376175548589337,
              "y2" : 0.56149732620320858,
              "x1" : 0.6849529780564263,
              "y1" : 0.58288770053475936
            },
            "offset" : [
              27,
              29
            ]
          },
          {
            "bounds" : {
              "x1" : 0.7061128526645768,
              "y1" : 0.58288770053475936,
              "x2" : 0.77272727272727271,
              "y2" : 0.56149732638146166
            },
            "offset" : [
              30,
              35
            ],
            "text" : "FOWL."
          }
        ],
        "bounds" : {
          "y1" : 0.5828877005602241,
          "x2" : 0.77272727520152262,
          "x1" : 0.42633229087550378,
          "y2" : 0.56149732622867332
        },
        "text" : "BEARS, BLACK damage threat of FOWL."
      }
    },
    {
      "observation" : {
        "text" : "CHICKENS (OTHER)",
        "confidence" : 1,
        "subBounds" : [
          {
            "text" : "CHICKENS",
            "offset" : [
              0,
              8
            ],
            "bounds" : {
              "x2" : 0.5329153605015674,
              "x1" : 0.42476489028213166,
              "y1" : 0.55793226352941172,
              "y2" : 0.53654188948306603
            }
          },
          {
            "offset" : [
              9,
              16
            ],
            "text" : "(OTHER)",
            "bounds" : {
              "y1" : 0.55793226381461669,
              "x1" : 0.53526645768025083,
              "x2" : 0.62068965125391862,
              "y2" : 0.5365418896256684
            }
          }
        ],
        "bounds" : {
          "y1" : 0.5579322637433155,
          "x2" : 0.62068965321316616,
          "x1" : 0.42476488832288406,
          "y2" : 0.53654188941176473
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              4
            ],
            "bounds" : {
              "y2" : 0.49019607843137258,
              "y1" : 0.51336898349376114,
              "x1" : 0.42633228912225701,
              "x2" : 0.46963165826558317
            },
            "text" : "Cmp:"
          },
          {
            "text" : "PHYSICAL",
            "bounds" : {
              "x1" : 0.47217868002230845,
              "y2" : 0.49019607843137258,
              "x2" : 0.57151252853459322,
              "y1" : 0.5133689839572193
            },
            "offset" : [
              5,
              13
            ]
          },
          {
            "offset" : [
              14,
              21
            ],
            "text" : "ACTIONS",
            "bounds" : {
              "x1" : 0.5740595502913185,
              "y2" : 0.49019607843137258,
              "y1" : 0.5133689839572193,
              "x2" : 0.660658290019977
            }
          },
          {
            "bounds" : {
              "y1" : 0.5133689839572193,
              "y2" : 0.49019607843137258,
              "x2" : 0.80094043210031363,
              "x1" : 0.66320531177670228
            },
            "offset" : [
              22,
              34
            ],
            "text" : "(HAND\/VOICE)"
          }
        ],
        "bounds" : {
          "y1" : 0.51336898372549022,
          "x2" : 0.80094043584639507,
          "x1" : 0.42633228537617562,
          "y2" : 0.4901960781996435
        },
        "text" : "Cmp: PHYSICAL ACTIONS (HAND\/VOICE)",
        "confidence" : 1
      }
    },
    {
      "observation" : {
        "text" : "APPLIED\/USED 1 IN",
        "bounds" : {
          "x1" : 0.4263322896663681,
          "x2" : 0.60344827712718319,
          "y1" : 0.48663101614463966,
          "y2" : 0.46880570420168066
        },
        "subBounds" : [
          {
            "offset" : [
              0,
              12
            ],
            "bounds" : {
              "x1" : 0.42633229093148228,
              "x2" : 0.56543887147335425,
              "y1" : 0.48663101601731606,
              "y2" : 0.4688057040998217
            },
            "text" : "APPLIED\/USED"
          },
          {
            "text" : "1",
            "offset" : [
              13,
              14
            ],
            "bounds" : {
              "x1" : 0.56739811912225702,
              "y1" : 0.4866310160427807,
              "x2" : 0.57915360501567403,
              "y2" : 0.4688057040998217
            }
          },
          {
            "bounds" : {
              "x2" : 0.60344827586206895,
              "y2" : 0.46880570432900426,
              "x1" : 0.5811128526645768,
              "y1" : 0.4866310160427807
            },
            "text" : "IN",
            "offset" : [
              15,
              17
            ]
          }
        ],
        "confidence" : 1
      }
    },
    {
      "observation" : {
        "bounds" : {
          "x2" : 0.85109717625391856,
          "x1" : 0.48432601637931039,
          "y2" : 0.44028520481283429,
          "y1" : 0.46167557914438506
        },
        "subBounds" : [
          {
            "bounds" : {
              "y1" : 0.46167557896613198,
              "x1" : 0.48432601943573667,
              "x2" : 0.52429467084639503,
              "y2" : 0.44028520499108736
            },
            "offset" : [
              0,
              3
            ],
            "text" : "Cmp"
          },
          {
            "bounds" : {
              "x2" : 0.57366771159874608,
              "x1" : 0.52664576802507834,
              "y2" : 0.44028520499108736,
              "y1" : 0.46167557932263814
            },
            "text" : "Take:",
            "offset" : [
              4,
              9
            ]
          },
          {
            "text" : "1",
            "bounds" : {
              "x1" : 0.5760188087774295,
              "y2" : 0.44028520499108736,
              "y1" : 0.46167557932263814,
              "x2" : 0.58777429467084641
            },
            "offset" : [
              10,
              11
            ]
          },
          {
            "bounds" : {
              "y2" : 0.44028520499108736,
              "y1" : 0.46167557932263814,
              "x1" : 0.59012539184952983,
              "x2" : 0.61833855799373039
            },
            "offset" : [
              12,
              14
            ],
            "text" : "EA"
          },
          {
            "bounds" : {
              "y1" : 0.46167557932263814,
              "x1" : 0.62068965517241381,
              "y2" : 0.44028520499108736,
              "x2" : 0.74294670846394983
            },
            "text" : "BEARBLACK",
            "offset" : [
              15,
              24
            ]
          },
          {
            "bounds" : {
              "x1" : 0.74529780564263326,
              "y1" : 0.46167557932263814,
              "x2" : 0.85109717319749223,
              "y2" : 0.44028520499108736
            },
            "text" : "DISPERSED",
            "offset" : [
              25,
              34
            ]
          }
        ],
        "text" : "Cmp Take: 1 EA BEARBLACK DISPERSED",
        "confidence" : 0.5
      }
    },
    {
      "observation" : {
        "confidence" : 0.30000001192092896,
        "bounds" : {
          "y1" : 0.43493761126814368,
          "x2" : 0.54702194394312587,
          "y2" : 0.41532976813088873,
          "x1" : 0.48275862105911332
        },
        "subBounds" : [
          {
            "bounds" : {
              "y1" : 0.4349376111280876,
              "x2" : 0.50862068965517238,
              "y2" : 0.4153297682709447,
              "x1" : 0.48275862151813703
            },
            "text" : "Int",
            "offset" : [
              0,
              3
            ]
          },
          {
            "bounds" : {
              "y1" : 0.43493761140819964,
              "y2" : 0.4153297682709447,
              "x2" : 0.54702194348410205,
              "x1" : 0.51077586206896552
            },
            "text" : "Trgt",
            "offset" : [
              4,
              8
            ]
          }
        ],
        "text" : "Int Trgt"
      }
    },
    {
      "observation" : {
        "bounds" : {
          "x1" : 0.42476488655172423,
          "y2" : 0.38859180024955453,
          "x2" : 0.79780563890282141,
          "y1" : 0.41354723696969709
        },
        "confidence" : 1,
        "text" : "Cmp: BARRIERS, FENCING (PERMANENT",
        "subBounds" : [
          {
            "bounds" : {
              "x2" : 0.46865203462798022,
              "x1" : 0.42476489028213166,
              "y1" : 0.4135472367201426,
              "y2" : 0.3885918003565062
            },
            "offset" : [
              0,
              4
            ],
            "text" : "Cmp:"
          },
          {
            "text" : "BARRIERS,",
            "offset" : [
              5,
              14
            ],
            "bounds" : {
              "x1" : 0.47139498114959572,
              "y2" : 0.3885918003565062,
              "y1" : 0.41354723707664887,
              "x2" : 0.57562694897098599
            }
          },
          {
            "text" : "FENCING",
            "offset" : [
              15,
              22
            ],
            "bounds" : {
              "y2" : 0.3885918003565062,
              "x2" : 0.66614418418429866,
              "y1" : 0.41354723707664887,
              "x1" : 0.57836989549260154
            }
          },
          {
            "offset" : [
              23,
              33
            ],
            "bounds" : {
              "y2" : 0.3885918004991088,
              "x1" : 0.6688871307059141,
              "y1" : 0.41354723707664887,
              "x2" : 0.79780563517241398
            },
            "text" : "(PERMANENT"
          }
        ]
      }
    },
    {
      "observation" : {
        "bounds" : {
          "x1" : 0.4247648866300941,
          "x2" : 0.78996864838558001,
          "y2" : 0.36541889474153288,
          "y1" : 0.38502673787878783
        },
        "confidence" : 1,
        "subBounds" : [
          {
            "offset" : [
              0,
              10
            ],
            "bounds" : {
              "x1" : 0.42476489028213166,
              "y2" : 0.36541889483065959,
              "y1" : 0.38502673768270945,
              "x2" : 0.5540752351097179
            },
            "text" : "ELECTRICAL"
          },
          {
            "offset" : [
              11,
              23
            ],
            "bounds" : {
              "x1" : 0.55623040752351094,
              "x2" : 0.69631661442006265,
              "y1" : 0.38502673796791442,
              "y2" : 0.36541889483065959
            },
            "text" : "APPLIED\/USED"
          },
          {
            "text" : "12",
            "offset" : [
              24,
              26
            ],
            "bounds" : {
              "y2" : 0.36541889483065959,
              "x1" : 0.6984717868338558,
              "y1" : 0.38502673796791442,
              "x2" : 0.72002351097178685
            }
          },
          {
            "offset" : [
              27,
              30
            ],
            "text" : "LIN",
            "bounds" : {
              "x1" : 0.72217868338557989,
              "y1" : 0.38502673796791442,
              "x2" : 0.75881661442006265,
              "y2" : 0.36541889483065959
            }
          },
          {
            "offset" : [
              31,
              33
            ],
            "text" : "YD",
            "bounds" : {
              "x1" : 0.7609717868338558,
              "y1" : 0.38502673796791442,
              "x2" : 0.78996864473354245,
              "y2" : 0.36541889493761137
            }
          }
        ],
        "text" : "ELECTRICAL APPLIED\/USED 12 LIN YD"
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "text" : "YOUNG",
            "offset" : [
              0,
              5
            ],
            "bounds" : {
              "x2" : 0.49745297805642635,
              "y1" : 0.34046345809071099,
              "x1" : 0.42633229292929287,
              "y2" : 0.32085561497326198
            }
          },
          {
            "offset" : [
              6,
              11
            ],
            "bounds" : {
              "x1" : 0.49960815047021945,
              "y1" : 0.34046345811051693,
              "y2" : 0.32085561497326198,
              "x2" : 0.56857366771159878
            },
            "text" : "BLACK"
          },
          {
            "text" : "BEAR",
            "offset" : [
              12,
              16
            ],
            "bounds" : {
              "x2" : 0.62676332288401249,
              "y1" : 0.34046345811051693,
              "y2" : 0.32085561497326198,
              "x1" : 0.57072884012539182
            }
          },
          {
            "offset" : [
              17,
              21
            ],
            "text" : "CAME",
            "bounds" : {
              "x1" : 0.62891849529780564,
              "y1" : 0.34046345811051693,
              "y2" : 0.32085561497326198,
              "x2" : 0.68926332288401249
            }
          },
          {
            "text" : "INTO",
            "offset" : [
              22,
              26
            ],
            "bounds" : {
              "y1" : 0.34046345811051693,
              "x2" : 0.74098746081504707,
              "y2" : 0.32085561497326198,
              "x1" : 0.69141849529780564
            }
          },
          {
            "bounds" : {
              "x1" : 0.74314263322884011,
              "y1" : 0.34046345811051693,
              "y2" : 0.32085561497326198,
              "x2" : 0.76469435736677116
            },
            "text" : "50",
            "offset" : [
              27,
              29
            ]
          },
          {
            "offset" : [
              30,
              35
            ],
            "bounds" : {
              "y1" : 0.34046345811051693,
              "x2" : 0.83385579937304077,
              "x1" : 0.76684952978056431,
              "y2" : 0.320855615171321
            },
            "text" : "YARDS"
          }
        ],
        "text" : "YOUNG BLACK BEAR CAME INTO 50 YARDS",
        "confidence" : 1,
        "bounds" : {
          "x1" : 0.42633229066527339,
          "y1" : 0.34046345819964341,
          "y2" : 0.32085561506238858,
          "x2" : 0.8338558016370603
        }
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "bounds" : {
              "x1" : 0.42633229851097154,
              "y1" : 0.31372548966131908,
              "x2" : 0.4898119122257053,
              "y2" : 0.29233511586452765
            },
            "text" : "WHILE",
            "offset" : [
              0,
              5
            ]
          },
          {
            "text" : "I",
            "bounds" : {
              "y1" : 0.31372549019607843,
              "y2" : 0.29233511586452765,
              "x1" : 0.49216300940438873,
              "x2" : 0.50156739811912221
            },
            "offset" : [
              6,
              7
            ]
          },
          {
            "offset" : [
              8,
              11
            ],
            "bounds" : {
              "x1" : 0.50391849529780564,
              "y2" : 0.29233511586452765,
              "y1" : 0.31372549019607843,
              "x2" : 0.55094043887147337
            },
            "text" : "WAS"
          },
          {
            "text" : "WORKING.",
            "bounds" : {
              "x1" : 0.55329153605015668,
              "y1" : 0.31372549019607843,
              "y2" : 0.29233511586452765,
              "x2" : 0.65438871473354232
            },
            "offset" : [
              12,
              20
            ]
          },
          {
            "bounds" : {
              "x1" : 0.65673981191222575,
              "y1" : 0.31372549019607843,
              "x2" : 0.66614420062695923,
              "y2" : 0.29233511586452765
            },
            "offset" : [
              21,
              22
            ],
            "text" : "I"
          },
          {
            "offset" : [
              23,
              28
            ],
            "text" : "HAZED",
            "bounds" : {
              "x1" : 0.66849529780564265,
              "y1" : 0.31372549019607843,
              "x2" : 0.73667711598746077,
              "y2" : 0.29233511586452765
            }
          },
          {
            "bounds" : {
              "y2" : 0.29233511586452765,
              "x2" : 0.7601880877742947,
              "x1" : 0.7390282131661442,
              "y1" : 0.31372549019607843
            },
            "text" : "IT",
            "offset" : [
              29,
              31
            ]
          },
          {
            "bounds" : {
              "y2" : 0.29233511586452765,
              "y1" : 0.31372549019607843,
              "x2" : 0.83228840121473358,
              "x1" : 0.76253918495297801
            },
            "text" : "AWAY.",
            "offset" : [
              32,
              37
            ]
          }
        ],
        "text" : "WHILE I WAS WORKING. I HAZED IT AWAY.",
        "confidence" : 1,
        "bounds" : {
          "y2" : 0.29233511559714798,
          "x1" : 0.42633229343652024,
          "x2" : 0.83228840628918477,
          "y1" : 0.31372548992869875
        }
      }
    },
    {
      "observation" : {
        "text" : "OPERATIONAL NONLETHAL PREDATION",
        "confidence" : 1,
        "bounds" : {
          "x2" : 0.80721002818704279,
          "y1" : 0.26737967905525839,
          "x1" : 0.42789968335945666,
          "y2" : 0.24598930472370761
        },
        "subBounds" : [
          {
            "text" : "OPERATIONAL",
            "bounds" : {
              "x1" : 0.42789968652037619,
              "x2" : 0.56661442006269591,
              "y2" : 0.24598930481283421,
              "y1" : 0.26737967887700531
            },
            "offset" : [
              0,
              11
            ]
          },
          {
            "text" : "NONLETHAL",
            "offset" : [
              12,
              21
            ],
            "bounds" : {
              "x2" : 0.69122257053291536,
              "y1" : 0.26737967914438499,
              "y2" : 0.24598930481283421,
              "x1" : 0.56896551724137934
            }
          },
          {
            "text" : "PREDATION",
            "offset" : [
              22,
              31
            ],
            "bounds" : {
              "x1" : 0.69357366771159878,
              "x2" : 0.80721002502612338,
              "y2" : 0.2459893049019608,
              "y1" : 0.26737967914438499
            }
          }
        ]
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "text" : "DAMAGE MANAGEMENT",
        "subBounds" : [
          {
            "text" : "DAMAGE",
            "offset" : [
              0,
              6
            ],
            "bounds" : {
              "y2" : 0.22103386809269165,
              "y1" : 0.23885917998472117,
              "x2" : 0.51763322884012541,
              "x1" : 0.42946708802060007
            }
          },
          {
            "bounds" : {
              "y1" : 0.23885918003565065,
              "y2" : 0.22103386829640947,
              "x2" : 0.66614420062695923,
              "x1" : 0.51959247648902818
            },
            "offset" : [
              7,
              17
            ],
            "text" : "MANAGEMENT"
          }
        ],
        "bounds" : {
          "y1" : 0.23885918011204488,
          "x2" : 0.66614420231751004,
          "y2" : 0.22103386816908588,
          "x1" : 0.42946708633004926
        }
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "bounds" : {
          "x2" : 0.38871473156739816,
          "y2" : 0.13190730832442066,
          "x1" : 0.19122256855799377,
          "y1" : 0.15508021385026738
        },
        "subBounds" : [
          {
            "offset" : [
              0,
              8
            ],
            "text" : "FlaggedX",
            "bounds" : {
              "x1" : 0.19122257053291536,
              "y1" : 0.1550802136185383,
              "x2" : 0.27272726674812342,
              "y2" : 0.13190730837789666
            }
          },
          {
            "text" : "by:",
            "bounds" : {
              "y1" : 0.15508021390374327,
              "x1" : 0.27527428850484864,
              "x2" : 0.3032915278288264,
              "y2" : 0.13190730837789666
            },
            "offset" : [
              9,
              12
            ]
          },
          {
            "offset" : [
              13,
              22
            ],
            "bounds" : {
              "x2" : 0.38871472959247655,
              "x1" : 0.30583854958555168,
              "y1" : 0.15508021390374327,
              "y2" : 0.13190730855614974
            },
            "text" : "Alexandra"
          }
        ],
        "text" : "FlaggedX by: Alexandra"
      }
    },
    {
      "observation" : {
        "text" : "Few on 07\/18\/19",
        "confidence" : 1,
        "bounds" : {
          "x2" : 0.32445141098484848,
          "y2" : 0.11051693420974451,
          "y1" : 0.13012477734699945,
          "x1" : 0.19122257085945663
        },
        "subBounds" : [
          {
            "text" : "Few",
            "bounds" : {
              "x2" : 0.22786050156739812,
              "y1" : 0.13012477718360071,
              "y2" : 0.11051693404634577,
              "x1" : 0.19122257196969694
            },
            "offset" : [
              0,
              3
            ]
          },
          {
            "text" : "on",
            "offset" : [
              4,
              6
            ],
            "bounds" : {
              "y2" : 0.11051693404634577,
              "y1" : 0.13012477718360071,
              "x1" : 0.23001567398119122,
              "x2" : 0.25156739811912227
            }
          },
          {
            "text" : "07\/18\/19",
            "bounds" : {
              "y1" : 0.13012477718360071,
              "x1" : 0.25372257053291536,
              "y2" : 0.11051693437314314,
              "x2" : 0.32445140987460813
            },
            "offset" : [
              7,
              15
            ]
          }
        ]
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              9
            ],
            "bounds" : {
              "y2" : 0.074866310160427774,
              "y1" : 0.10516934046345816,
              "x2" : 0.26782915360501569,
              "x1" : 0.19122257339341686
            },
            "text" : "Corrected"
          },
          {
            "text" : "by",
            "offset" : [
              10,
              12
            ],
            "bounds" : {
              "y1" : 0.10516934046345816,
              "x2" : 0.28781347962382448,
              "y2" : 0.074866310160427774,
              "x1" : 0.2711598746081505
            }
          },
          {
            "text" : "°",
            "offset" : [
              13,
              14
            ],
            "bounds" : {
              "x2" : 0.3077978056426332,
              "y1" : 0.10516934046345816,
              "x1" : 0.29114420062695923,
              "y2" : 0.074866310160427774
            }
          },
          {
            "text" : "0\/",
            "offset" : [
              15,
              17
            ],
            "bounds" : {
              "y1" : 0.10516934046345816,
              "x1" : 0.31112852664576801,
              "x2" : 0.33777429467084641,
              "y2" : 0.074866310160427774
            }
          },
          {
            "text" : "a",
            "bounds" : {
              "y2" : 0.074866310160427774,
              "y1" : 0.10516934046345816,
              "x2" : 0.34443573667711597,
              "x1" : 0.34110501567398122
            },
            "offset" : [
              18,
              19
            ]
          },
          {
            "text" : "va",
            "offset" : [
              20,
              22
            ],
            "bounds" : {
              "y2" : 0.074866310918003443,
              "x1" : 0.34776645768025077,
              "y1" : 0.10516934046345816,
              "x2" : 0.3526645756269593
            }
          }
        ],
        "bounds" : {
          "x2" : 0.35266457764498432,
          "y2" : 0.074866310539215775,
          "y1" : 0.10516934084224594,
          "x1" : 0.19122257137539184
        },
        "confidence" : 0.30000001192092896,
        "text" : "Corrected by ° 0\/ a va"
      }
    },
    {
      "observation" : {
        "confidence" : 1,
        "bounds" : {
          "y1" : 0.074769308303107351,
          "x2" : 0.33387747120885075,
          "y2" : 0.053572937846799396,
          "x1" : 0.26016641675669006
        },
        "text" : "07\/19\/19",
        "subBounds" : [
          {
            "text" : "07\/19\/19",
            "offset" : [
              0,
              8
            ],
            "bounds" : {
              "y1" : 0.074769308397636292,
              "y2" : 0.053572937752270122,
              "x2" : 0.33387747148615882,
              "x1" : 0.26016641647938205
            }
          }
        ]
      }
    },
    {
      "observation" : {
        "text" : "Please add the component hazing to this work task and",
        "confidence" : 1,
        "bounds" : {
          "x1" : 0.41692789699255495,
          "y1" : 0.15508021384803916,
          "x2" : 0.84796237975117572,
          "y2" : 0.13190730832219244
        },
        "subBounds" : [
          {
            "text" : "Please",
            "offset" : [
              0,
              6
            ],
            "bounds" : {
              "x2" : 0.46786833482102541,
              "x1" : 0.41692789968652039,
              "y1" : 0.15508021370320857,
              "y2" : 0.13190730837789666
            }
          },
          {
            "offset" : [
              7,
              10
            ],
            "bounds" : {
              "y2" : 0.13190730837789666,
              "x1" : 0.47041535657775063,
              "y1" : 0.15508021390374327,
              "x2" : 0.50097961765845367
            },
            "text" : "add"
          },
          {
            "offset" : [
              11,
              14
            ],
            "text" : "the",
            "bounds" : {
              "y1" : 0.15508021390374327,
              "y2" : 0.13190730837789666,
              "x1" : 0.50352663941517894,
              "x2" : 0.52899685698243148
            }
          },
          {
            "bounds" : {
              "y1" : 0.15508021390374327,
              "x2" : 0.6232366619812657,
              "x1" : 0.53154387873915665,
              "y2" : 0.13190730837789666
            },
            "offset" : [
              15,
              24
            ],
            "text" : "component"
          },
          {
            "bounds" : {
              "y2" : 0.13190730837789666,
              "x2" : 0.67927114062922123,
              "y1" : 0.15508021390374327,
              "x1" : 0.62578368373799098
            },
            "offset" : [
              25,
              31
            ],
            "text" : "hazing"
          },
          {
            "text" : "to",
            "offset" : [
              32,
              34
            ],
            "bounds" : {
              "y2" : 0.13190730837789666,
              "x1" : 0.6818181623859465,
              "y1" : 0.15508021390374327,
              "x2" : 0.69964731468302332
            }
          },
          {
            "offset" : [
              35,
              39
            ],
            "bounds" : {
              "x1" : 0.70219433643974849,
              "y1" : 0.15508021390374327,
              "x2" : 0.73275859752045158,
              "y2" : 0.13190730837789666
            },
            "text" : "this"
          },
          {
            "offset" : [
              40,
              44
            ],
            "bounds" : {
              "y1" : 0.15508021390374327,
              "x2" : 0.77860498914150611,
              "x1" : 0.73530561927717686,
              "y2" : 0.13190730837789666
            },
            "text" : "work"
          },
          {
            "offset" : [
              45,
              49
            ],
            "bounds" : {
              "y2" : 0.13190730837789666,
              "x1" : 0.78115201089823139,
              "x2" : 0.81426329373565964,
              "y1" : 0.15508021390374327
            },
            "text" : "task"
          },
          {
            "offset" : [
              50,
              53
            ],
            "bounds" : {
              "y2" : 0.13190730846702314,
              "x2" : 0.84796237705721011,
              "x1" : 0.81681031549238481,
              "y1" : 0.15508021390374327
            },
            "text" : "and"
          }
        ]
      }
    },
    {
      "observation" : {
        "subBounds" : [
          {
            "offset" : [
              0,
              8
            ],
            "bounds" : {
              "y1" : 0.13198302617439839,
              "x1" : 0.41537440775719958,
              "y2" : 0.10641790388710937,
              "x2" : 0.4782731557767686
            },
            "text" : "indicate"
          },
          {
            "bounds" : {
              "x1" : 0.48120274996350854,
              "x2" : 0.51941586255074168,
              "y1" : 0.13134588885603504,
              "y2" : 0.10601969325351657
            },
            "text" : "how",
            "offset" : [
              9,
              12
            ]
          },
          {
            "offset" : [
              13,
              16
            ],
            "bounds" : {
              "y2" : 0.10570112474664217,
              "x2" : 0.55233002796992026,
              "x1" : 0.52234545673748167,
              "y1" : 0.13094767822244213
            },
            "text" : "you"
          },
          {
            "bounds" : {
              "x2" : 0.60170127609868795,
              "y2" : 0.10522327198633075,
              "y1" : 0.13062910971556796,
              "x1" : 0.55525962215666014
            },
            "text" : "hazed",
            "offset" : [
              17,
              22
            ]
          },
          {
            "offset" : [
              23,
              26
            ],
            "bounds" : {
              "x1" : 0.60463087028542783,
              "y2" : 0.1049312508550293,
              "x2" : 0.63187259439960153,
              "y1" : 0.13015125695525642
            },
            "text" : "the"
          },
          {
            "text" : "bear",
            "offset" : [
              27,
              31
            ],
            "bounds" : {
              "y1" : 0.12985923582395498,
              "x2" : 0.67027245405530977,
              "y2" : 0.10455958759700934,
              "x1" : 0.63480218858634141
            }
          },
          {
            "bounds" : {
              "y2" : 0.10405518746112496,
              "x1" : 0.67320204824204966,
              "x2" : 0.72238654930234236,
              "y1" : 0.12948757256593491
            },
            "offset" : [
              32,
              37
            ],
            "text" : "away."
          },
          {
            "bounds" : {
              "y1" : 0.12898317243005064,
              "x1" : 0.72531614348908224,
              "x2" : 0.79148995940014022,
              "y2" : 0.1025756349286161
            },
            "offset" : [
              38,
              45
            ],
            "text" : "Thanks!"
          }
        ],
        "confidence" : 1,
        "bounds" : {
          "y1" : 0.13198302676332385,
          "x2" : 0.79148995450205717,
          "y2" : 0.10257563433969075,
          "x1" : 0.41537441265528263
        },
        "text" : "indicate how you hazed the bear away. Thanks!"
      }
    },
    {
      "observation" : {
        "bounds" : {
          "x1" : 0.41692790228578885,
          "y1" : 0.085561497400475361,
          "x2" : 0.72884012799111808,
          "y2" : 0.062388591874628641
        },
        "confidence" : 1,
        "text" : "Added Components and hazing activity",
        "subBounds" : [
          {
            "text" : "Added",
            "offset" : [
              0,
              5
            ],
            "bounds" : {
              "y2" : 0.062388591800356497,
              "x2" : 0.47041535657775063,
              "x1" : 0.41692790488505738,
              "y1" : 0.085561497207367831
            }
          },
          {
            "text" : "Components",
            "offset" : [
              6,
              16
            ],
            "bounds" : {
              "x1" : 0.47296237833447591,
              "y1" : 0.085561497326203217,
              "x2" : 0.57484324860348601,
              "y2" : 0.062388591800356497
            }
          },
          {
            "text" : "and",
            "offset" : [
              17,
              20
            ],
            "bounds" : {
              "y1" : 0.085561497326203217,
              "x2" : 0.60795453144091427,
              "y2" : 0.062388591800356497,
              "x1" : 0.57739027036021118
            }
          },
          {
            "bounds" : {
              "y1" : 0.085561497326203217,
              "y2" : 0.062388591800356497,
              "x2" : 0.66653603184559507,
              "x1" : 0.61050155319763943
            },
            "text" : "hazing",
            "offset" : [
              21,
              27
            ]
          },
          {
            "bounds" : {
              "y1" : 0.085561497326203217,
              "x2" : 0.7288401253918495,
              "y2" : 0.06238859206773617,
              "x1" : 0.66908305360232023
            },
            "offset" : [
              28,
              36
            ],
            "text" : "activity"
          }
        ]
      }
    }
  ]
}

Do the math to assign text to cell



import cv2
import json
import numpy as np
from sklearn.cluster import DBSCAN
import glob
from tqdm import tqdm
import os
import re

from table_parsing.utils import stringify_keys

# Handle input data
# ==================================================
def process_image(image_path):
    """Load an image and derive a binary mask from it.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, binary)`` tuple: the image as loaded by OpenCV and the
        result of mean adaptive thresholding applied to the inverted
        grayscale version of that image.

    Raises:
        OSError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise OSError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Threshold the inverted image (~gray) so dark strokes become the
    # bright (255) foreground of the binary mask.
    binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -10)
    return image, binary

def load_data(json_path):
    """Read a JSON file and return its parsed content.

    The file is decoded as UTF-8 explicitly: the OCR output can contain
    non-ASCII characters, and the platform default encoding is not
    guaranteed to be UTF-8.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load``.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Handle bounding boxes
# ============================================
def draw_bounding_box(image, bounds, color=(0, 255, 0), thickness=2):
    """Draw a rectangle on ``image`` for a box given in fractional coordinates.

    ``bounds`` maps ``x1``, ``x2``, ``y1`` and ``y2`` to fractions of the image
    size. x values are scaled by the image width; y values are flipped
    (``1 - y``) and then scaled by the image height. The rectangle is drawn
    in place on ``image`` and nothing is returned.
    """
    img_h, img_w, _ = image.shape

    # Two opposite corners of the box in pixel coordinates.
    first_corner = (
        int(bounds['x1'] * img_w),
        int((1 - bounds['y2']) * img_h),
    )
    opposite_corner = (
        int(bounds['x2'] * img_w),
        int((1 - bounds['y1']) * img_h),
    )

    cv2.rectangle(image, first_corner, opposite_corner, color, thickness)

def convert_bounding_box(image, bounds):
    """Convert a fractional bounding box to integer pixel coordinates.

    ``bounds`` maps ``x1``, ``x2``, ``y1`` and ``y2`` to fractions of the image
    size. x values are scaled by the image width. y values are flipped
    (``1 - y``) before being scaled by the image height, i.e. the input y axis
    runs in the opposite direction to pixel rows.

    Only the first two entries of ``image.shape`` are used, so both
    colour (H, W, C) and single-channel (H, W) arrays are accepted.

    Args:
        image: Array whose ``shape`` starts with ``(height, width)``.
        bounds: Mapping with the keys ``x1``, ``y1``, ``x2`` and ``y2``.

    Returns:
        ``(x1, y1, x2, y2)`` in pixels, truncated with ``int``. When the
        input has ``y1 > y2`` (as in the OCR JSON used with this code), the
        returned ``y1`` is the upper edge and ``y2`` the lower edge.
    """
    height, width = image.shape[:2]

    # Horizontal edges: a plain scale by the width.
    x1 = int(bounds['x1'] * width)
    x2 = int(bounds['x2'] * width)

    # Vertical edges: flip the axis, then scale by the height.
    y1 = int((1 - bounds['y1']) * height)
    y2 = int((1 - bounds['y2']) * height)

    return x1, y1, x2, y2

def draw_all_word_bounding_boxes(image,data):
    """Outline every observation in ``data`` on ``image`` and return the image.

    One box is drawn per entry of ``data['observations']`` using that entry's
    phrase-level ``bounds``; the per-word ``subBounds`` are not drawn.
    """
    phrase_color = (255, 0, 0)
    for entry in data['observations']:
        observation = entry["observation"]
        draw_bounding_box(image, observation['bounds'], color=phrase_color)
    return image

        # Draw bounding boxes for each word
        # for subBound in item["observation"]['subBounds']:
        #     draw_bounding_box(image, subBound['bounds'], color=(0, 255, 0))

    # # Display the image
    # cv2.imshow('Image with Bounding Boxes', image)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()

# Cluster points
# ======================================

def find_horizontal_vertical_lines(binary, horizontal_scale_factor, vertical_scale_factor):
    """Extract horizontal and vertical line masks from a binary image.

    Each mask is produced by eroding and then dilating ``binary`` with a
    one-pixel-thick rectangular structuring element. The element length is
    ``columns // horizontal_scale_factor`` for horizontal lines and
    ``rows // vertical_scale_factor`` for vertical lines, so a larger factor
    means a shorter element.

    Returns:
        A ``(horizontal, vertical)`` tuple of masks.
    """
    def _erode_then_dilate(kernel_size):
        # Work on a copy so the caller's mask is never modified.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        eroded = cv2.erode(binary.copy(), kernel)
        return cv2.dilate(eroded, kernel)

    n_cols = binary.shape[1]
    horizontal = _erode_then_dilate((n_cols // horizontal_scale_factor, 1))

    n_rows = binary.shape[0]
    vertical = _erode_then_dilate((1, n_rows // vertical_scale_factor))

    return horizontal, vertical





# Find touches (where lines touch but don't intersect)
def find_touches(image, horizontal, vertical, dilation_size):
    """Locate points where horizontal and vertical lines meet or nearly meet.

    Both line masks are dilated with a ``dilation_size`` x ``dilation_size``
    kernel so that lines which only touch (without crossing) still overlap.
    Pixels equal to 255 in both dilated masks are collected as ``(x, y)``
    points and grouped with DBSCAN (``eps=10``, ``min_samples=1``); each group
    is reduced to its mean position.

    Args:
        image: Unused; kept so existing callers keep working.
        horizontal: Mask of horizontal lines.
        vertical: Mask of vertical lines.
        dilation_size: Side length of the square dilation kernel.

    Returns:
        A list of integer ``(x, y)`` NumPy arrays, one per cluster. The list
        is empty when the masks never overlap.
    """
    kernel = np.ones((dilation_size, dilation_size), np.uint8)
    horizontal_dilated = cv2.dilate(horizontal, kernel)
    vertical_dilated = cv2.dilate(vertical, kernel)

    # Pixels set in both dilated masks are the candidate touch points.
    touch_points_img = cv2.bitwise_and(horizontal_dilated, vertical_dilated)

    rows_cols = np.argwhere(touch_points_img == 255)
    if rows_cols.shape[0] == 0:
        # DBSCAN cannot be fitted on zero samples; no touches means no clusters.
        return []

    # argwhere yields (row, col); reorder to (x, y).
    touch_points = np.ascontiguousarray(rows_cols[:, [1, 0]])

    labels = DBSCAN(eps=10, min_samples=1).fit(touch_points).labels_
    return [
        np.mean(touch_points[labels == label], axis=0).astype(int)
        for label in np.unique(labels)
    ]



def draw_clustered_touch_points(image, clustered_touch_points):
    """Mark each clustered touch point on ``image`` and return the image.

    Every point is drawn in place as a filled circle of radius 5 in colour
    ``(255, 0, 0)``.
    """
    radius = 5
    color = (255, 0, 0)
    filled = -1  # negative thickness asks OpenCV for a filled circle

    for center in map(tuple, clustered_touch_points):
        cv2.circle(image, center, radius, color, filled)

    return image


# Final grid lines
# ========================================

# Identify and draw the outermost vertical and horizontal lines
def make_final_lines(image, clustered_touch_points):
    """Derive the table grid from touch points and draw it on ``image``.

    Vertical lines are the minimum, median and maximum x of the touch points.
    Horizontal lines are the minimum and maximum y plus one line per cluster
    of the remaining y values: points within 5 pixels of the top or bottom
    border are dropped, and the unique y values left are grouped with DBSCAN
    (``eps=10``, ``min_samples=1``) and averaged per group.

    Args:
        image: Image to draw on (modified in place).
        clustered_touch_points: Sequence of ``(x, y)`` points.

    Returns:
        ``(image, horizontal_lines, vertical_lines)`` where
        ``horizontal_lines`` is ``[top, *interior rows, bottom]`` and
        ``vertical_lines`` is ``[left, middle, right]``. Both lists are empty
        when there are no touch points, so the result can always be unpacked
        into three values.
    """
    if len(clustered_touch_points) == 0:
        return image, [], []

    points = np.array(clustered_touch_points)
    xs = points[:, 0]
    ys = points[:, 1]

    # Outer frame and the middle column separator.
    left_most = np.min(xs)
    right_most = np.max(xs)
    middle = np.median(xs)
    top_most = np.min(ys)
    bottom_most = np.max(ys)

    line_color = (0, 255, 0)
    img_height, img_width = image.shape[0], image.shape[1]

    for x in (left_most, middle, right_most):
        cv2.line(image, (int(x), 0), (int(x), img_height), line_color, 2)
    for y in (top_most, bottom_most):
        cv2.line(image, (0, int(y)), (img_width, int(y)), line_color, 2)

    # Keep only points further than 5 px from both the top and bottom border.
    interior_ys = ys[(np.abs(ys - top_most) > 5) & (np.abs(ys - bottom_most) > 5)]

    clustered_y_coords = []
    if interior_ys.size > 0:
        # Merge nearly identical y values into a single row line each.
        unique_y_coords = np.unique(interior_ys).reshape(-1, 1)
        labels = DBSCAN(eps=10, min_samples=1).fit(unique_y_coords).labels_
        clustered_y_coords = sorted(
            np.mean(unique_y_coords[labels == label]) for label in np.unique(labels)
        )

    for y in clustered_y_coords:
        cv2.line(image, (int(left_most), int(y)), (int(right_most), int(y)), line_color, 2)

    return image, [top_most] + clustered_y_coords + [bottom_most], [left_most, middle, right_most]


# Assigning data 
# ============================================

# Function to calculate the intersection area of two rectangles
def intersection_area(box1, box2):
    """Return the overlap area of two boxes given as four-element sequences.

    Indices 0 and 1 of each box are treated as the lower x/y bounds and
    indices 2 and 3 as the upper x/y bounds. The result is 0 when the boxes
    do not overlap.
    """
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])

    overlap_width = max(0, right - left)
    overlap_height = max(0, bottom - top)
    return overlap_width * overlap_height


def assign_data(image, data, horizontal_lines, vertical_lines):
    """Assign each observation's text to the grid cell it overlaps most.

    Cells are the rectangles between consecutive ``vertical_lines`` and
    consecutive ``horizontal_lines``, keyed as ``(row_index, column_index)``.
    An observation that overlaps no cell falls back to cell ``(0, 0)``.
    Texts that land in the same cell are joined with a single space, in the
    order they appear in ``data['observations']``.

    Returns:
        A dict mapping ``(row_index, column_index)`` to the combined text.
    """
    # Enumerate the grid once, row by row, as (key, (x1, y1, x2, y2)) pairs.
    cells = [
        (
            (row, col),
            (vertical_lines[col], horizontal_lines[row], vertical_lines[col + 1], horizontal_lines[row + 1]),
        )
        for row in range(len(horizontal_lines) - 1)
        for col in range(len(vertical_lines) - 1)
    ]

    row_result = {}
    for entry in data['observations']:
        observation = entry["observation"]
        text_box = convert_bounding_box(image, observation['bounds'])
        text = observation['text']

        # The first cell with the strictly largest overlap wins.
        best_cell = (0, 0)
        best_overlap = 0
        for cell_key, cell_box in cells:
            overlap = intersection_area(text_box, cell_box)
            if overlap > best_overlap:
                best_cell, best_overlap = cell_key, overlap

        if best_cell in row_result:
            row_result[best_cell] += " " + text
        else:
            row_result[best_cell] = text

    return row_result

def natural_sort_key(s):
    """Build a sort key that orders digit runs in ``s`` by numeric value.

    The string is split around runs of digits; digit-only pieces become
    ``int`` and every other piece stays a string.
    """
    def _coerce(piece):
        if piece.isdigit():
            return int(piece)
        return piece

    return list(map(_coerce, re.split(r'(\d+)', s)))

def make_result(image_dir, json_dir, jsonl_output_path, annotated_output_path):
    """Run the cell-detection pipeline over every image/JSON pair.

    Images (``*jpg`` in ``image_dir``) and OCR files (``*json`` in
    ``json_dir``) are sorted with ``natural_sort_key`` and paired by
    position. For each pair the grid is detected, the OCR text is assigned to
    cells, the annotated image is saved under ``annotated_output_path`` with
    the image's file name, and one JSON line is appended to
    ``jsonl_output_path``.

    Args:
        image_dir: Directory containing the input images.
        json_dir: Directory containing the OCR JSON files.
        jsonl_output_path: Path of the JSONL file to (over)write.
        annotated_output_path: Directory for the annotated images; created
            if it does not exist.

    Raises:
        ValueError: If there are fewer JSON files than images, since the
            positional pairing could not be completed.
    """
    image_paths = sorted(glob.glob(f"{image_dir}/*jpg"), key=natural_sort_key)
    json_paths = sorted(glob.glob(f"{json_dir}/*json"), key=natural_sort_key)

    # Fail before any output is written instead of part-way through the run.
    if len(json_paths) < len(image_paths):
        raise ValueError(
            f"Found {len(image_paths)} images in {image_dir} but only "
            f"{len(json_paths)} JSON files in {json_dir}"
        )

    # cv2.imwrite does not create missing directories and only signals
    # failure through its return value, so make sure the target exists.
    if annotated_output_path:
        os.makedirs(annotated_output_path, exist_ok=True)

    with open(jsonl_output_path, 'w') as jsonl_file:
        for image_path, json_path in zip(image_paths, json_paths):
            file_name = os.path.basename(image_path)

            image, binary = process_image(image_path)
            data = load_data(json_path)

            # Line masks -> touch points -> grid line coordinates.
            horizontal_mask, vertical_mask = find_horizontal_vertical_lines(
                binary, horizontal_scale_factor=15, vertical_scale_factor=30
            )
            clustered_touch_points = find_touches(image, horizontal_mask, vertical_mask, dilation_size=6)
            image = draw_clustered_touch_points(image, clustered_touch_points)
            image_with_lines, horizontal_lines, vertical_lines = make_final_lines(image, clustered_touch_points)

            result = assign_data(image, data, horizontal_lines, vertical_lines)

            # Save the annotated image and the per-image cell assignment.
            cv2.imwrite(os.path.join(annotated_output_path, file_name), image_with_lines)
            jsonl_file.write(json.dumps(stringify_keys(result)) + '\n')

Parsing Thousands of Scanned Pages

The problem

Scanned pdfs

Scanned pdfs

Potential Solutions

Google Pinpoint

The complex, custom patterns

The solution

Criteria

Roadmap

Screenshots

Screenshots

Identify cells

Identify cells

Identify cells

Apple’s live text feature

Get positional data of text

Do the math to assign text to cell

Final result

A table

Questions?