Analyze the jsonl

library(tidyverse)
library(jsonlite)
library(here)
library(glue)

Reshape json

arrange_matrix <- function(lst, i) {
  # Extract row and column indices
  positions <- sapply(names(lst), function(x) as.numeric(unlist(strsplit(gsub("[^0-9,]", "", x), ","))))
  rows <- max(positions[1, ]) + 1
  cols <- max(positions[2, ]) + 1

  # Create an empty matrix with the determined dimensions
  mat <- matrix(NA, nrow = rows, ncol = cols)
  
  # Populate the matrix using the keys and values from the list
  for (key in names(lst)) {
    position <- as.numeric(unlist(strsplit(gsub("[^0-9,]", "", key), ","))) + 1
    mat[position[1], position[2]] <- lst[[key]]

}


   # Convert the matrix to a dataframe
   df <- as.data.frame(mat, stringsAsFactors = FALSE)

   
   if(ncol(df) == 2){

    df = df  %>% 
      rename(column = V1, value = V2) %>% 
      mutate(entry = i) %>% 
      mutate(correct = TRUE)


   } else {

    df = df  %>% 
      rename(column = V1) %>% 
      mutate(entry = i) %>% 
      mutate(correct = FALSE)





   }

   

   return(df)
  
 
}


foia_json_to_df = function(path, label){

  con <- file(path, "r")

# Initialize an empty list to store the data
json_data <- list()

# Read and parse the file line by line
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  parsed_line <- fromJSON(line)
  json_data <- append(json_data, list(parsed_line))
}

# Close the connection
close(con)

df = tibble()

for(i in seq(length(json_data))){

   temp =  arrange_matrix(json_data[[i]], i)
   df = bind_rows(df, temp)
}

df = df  %>% 
  mutate(file = label)

return(df)


}

Cleaning functions

clean_parsed = function(df){

  phrases_to_remove <- c("\\(b\\) \\(6\\)", "\\(b\\) 6\\)" ,"\\(0\\) \\(6\\) \\(0\\) \\(35 \\(a\\)",
"b \\(6", "\\(o\\) \\(6\\)",
"\\(0\\) \\(6\\)\\, \\(0\\) \\(35 \\(a\\)",
"\\!",
"\\?",
"control",  
"entered/edited by lori warnell", 
"control",  
"edit", 
"flag", 
"sampling",  
"disposition",  
"technical", 
"assistance", 
"aerial", 
"direct", 
"entered/edited by dalin tidwell", 
"entered/ed by john steuber",
"entered/ed by dalin tidwell", 
 "≤ back to report tab test details 05-03-2023",
 "< back to report tab test details 05-03-2023",
 "≤ back to report tab",
  "entered/edited by dalin tidwell",
  " d ",
  " z ",
  "8")

  df_clean = df  %>% 

    mutate(column = str_squish(column), value = str_squish(value)) %>% 
# Get rid of blank cells 
# ==========================
filter(!(is.na(column) == TRUE & is.na(value) == TRUE)) %>% 
mutate(
    column = case_when(
      is.na(column) & grepl("^Activity:", value) ~ "Activity:",
      TRUE ~ column
    ),
    value = case_when(
      column == "Activity:" & grepl("^Activity:", value) ~ sub("^Activity: ", "", value),
      TRUE ~ value
    )
  ) %>% 
# When there is just a number followed by period and worktask in other column combine
# =====================================================================================
mutate(column = case_when(

  str_detect(column, "\\b\\d+\\.\\s*") & str_detect(value, "WorkTask") ~ paste0(column, " ", value),
TRUE ~ column)) %>% 
# Separate activity number
# ===================
 mutate(
    value = case_when(
      grepl("^Activity: (\\d+)", column) ~ {
        activity_num <- sub("^Activity: (\\d+).*", "\\1", column)
        paste0(activity_num, " ", value)
      },
      TRUE ~ value
    ), 
    column = case_when(
      grepl("^Activity: (\\d+)", column) ~ "Activity:",
      TRUE ~ column

    )
  ) %>% 
  # Make flagged comments a single column
  # ====================================
  mutate(
    value = case_when(
      str_detect(column, "FlaggedX") | str_detect(value, "FlaggedX") ~ paste0(column, " ", value),
      TRUE ~ value
    ),
    column = case_when(
      str_detect(column, "FlaggedX") | str_detect(value, "FlaggedX") ~ "Flagged:",
      TRUE ~ column

    )
  # Combine columns that overlapped page
  # ======================================
  ) %>% 
  summarize(value = paste(value, collapse = ", "), .by = c(entry, column, file)) %>% 
  mutate(lag_column = lag(column)) %>% 
  mutate(column = case_when(
    is.na(column) == TRUE ~ lag_column,
    TRUE ~ column
  )) %>% 
  summarize(value = paste(value, collapse = "| "), .by = c(entry, column, file)) %>% 
  # Standardize column names
  # =========================
  mutate(column = tolower(str_squish(column))) %>% 
  ## Fix Worktask
  # ===================
  mutate(column = str_replace_all(column, "work task", "worktask")) %>% 
  mutate(column = case_when(str_detect(column, "worktask|work task") ~ str_replace_all(column, "\\b\\d+\\.\\s*", ""),
  TRUE ~ column)) %>% 
  mutate(column = case_when(
    str_detect(column, "worktask|work task") ~ str_replace_all(column, 
                                       setNames(rep("", length(phrases_to_remove)), phrases_to_remove)),
    TRUE ~ column),

    value = case_when(
    str_detect(column, "worktask|work task") ~ str_replace_all(value, 
                                       setNames(rep("", length(phrases_to_remove)), 
                                    paste0("(?i)", phrases_to_remove))),
    TRUE ~ value)
  ) %>% 
  mutate(column = case_when(
    column == "components & take/samples:" ~ "components & take:",
    TRUE ~ column
  )) %>% 
  # Stuff after colon
  # =====================
  mutate(
    # Extract the text after the colon (if present)
    value_extracted = str_extract(column, "(?<=: ).*"),
    # Combine the extracted text with the value column
    value = ifelse(is.na(value), value_extracted, paste(value_extracted, value, sep = " ")),
    # Remove the text after the colon in the column column
    column = str_extract(column, "^[^:]+")
  ) %>%
  select(-value_extracted) %>% 
  mutate(value = str_remove_all(value, "NA"))

  return(df_clean)


}


make_df_shape = function(df){

df_wide = df %>% 
  pivot_wider(
    names_from = column,
    values_from = value,
    values_fn = list(value = ~ paste(unique(.), collapse = ", "))
  ) %>% 
  janitor::clean_names()

return(df_wide)


}

Clean each json file

# 89561_1
# ============

df_8956_1 = foia_json_to_df(here("data/processed/montana-foia-parsed/8956_1_result.jsonl"), "8956_1")
df_8956_1_clean = df_8956_1  %>% 
  clean_parsed()
df_89561_1_final = df_8956_1_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))


# 9430_2
# ========


df_9430_2 = foia_json_to_df(here("data/processed/montana-foia-parsed/9430_2_result.jsonl"), "9430_2")
df_9430_2_clean = df_9430_2  %>% 
  clean_parsed()
df_9430_2_final = df_9430_2_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))


# 9557_1
# ==========

df_9557_1 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_1_result.jsonl"), "9557_1")
df_9557_1_clean = df_9557_1  %>% 
  clean_parsed()
df_9557_1_final = df_9557_1_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))



# 9557_2 
# ==========

df_9557_2 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_2_result.jsonl"), "9557_2")
df_9557_2_clean = df_9557_2  %>% 
  clean_parsed()
df_9557_2_final = df_9557_2_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))


# 9557_3
# ==========

df_9557_3 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_3_result.jsonl"), "9557_3")
df_9557_3_clean = df_9557_3  %>% 
  clean_parsed()
df_9557_3_final = df_9557_3_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))


# 9557_4
# ==========
df_9557_4 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_4_result.jsonl"), "9557_4")
df_9557_4_clean = df_9557_4  %>% 
  clean_parsed()
df_9557_4_final = df_9557_4_clean  %>% 
  make_df_shape() %>% 
   select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))

Combine files

# no_physical_harm_words = c("BARRIER", "FENCING", "RANGE RIDER")
# physical_harm_words = c("TRAP", "SRES", "FOOTHOLD")
# death_aim_words = c("CYANIDE", "FIREARM", "FIXED WING")
# action_maintenance = c("CHECKED")


extract_date_outside_parentheses <- function(text) {
  str_extract_all(text, "(?<!\\()\\b\\d{2}/\\d{2}/\\d{4}\\b(?!\\))") %>%
    unlist() %>%
    .[!grepl("\\(", .)]
}

work_orders_df_raw = bind_rows(df_89561_1_final,df_9430_2_final,df_9557_1_final,df_9557_2_final, df_9557_3_final,df_9557_4_final)

work_orders_df_clean = work_orders_df_raw  %>% 
       mutate(x_entry_date = gsub(".*\\(Entry Date: (.*?)\\).*", "\\1", work_date),
         x_work_date = map_chr(work_date, ~ paste(extract_date_outside_parentheses(.x), collapse = ", ")), .after = work_date
  ) %>%
  # mutate(x_work_date = str_remove(x_work_date, "\\(Entry Date:")) %>% 
  mutate(
    x_work_date = as.Date(x_work_date, format = "%m/%d/%Y"),
    x_entry_date = as.Date(x_entry_date, format = "%m/%d/%Y"),
    x_work_year = lubridate::year(x_work_date)
  )

extract_patterns <- function(text) {
  matches <- str_extract_all(text, "\\d+ EA \\w+ KILLED")[[1]]
  return(matches)
}


animals_df = work_orders_df_clean  %>% 
    mutate(across(where(is.character), str_squish)) %>% 
    mutate(id = glue("{file}-{entry}"), .before = file) %>% 
     # Replace spelling errors
  mutate(components_take = str_replace_all(components_take, "Стр", "Cmp")) %>% 
  mutate(components_take = str_replace_all(components_take, " BA ", " EA ")) %>% 
    mutate(x_threat_of = case_when(
    str_detect(tolower(conflict_loss), "damage threat of|damage threat") ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  mutate(x_predation = case_when(
    str_detect(tolower(conflict_loss), "predation" ) ~ TRUE,
    str_detect(tolower(conflict_loss), "injury") ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  mutate(x_reason = case_when(

    x_threat_of == TRUE & x_predation == TRUE ~ "predation",
    x_threat_of == FALSE & x_predation == TRUE ~ "predation",
    x_threat_of == TRUE & x_predation == FALSE ~ "threat",
    TRUE ~ "neither"

  )) %>% 
  mutate(x_result = case_when(
    str_detect(components_take, "No Components/Take|No Components/ Take") ~ "no_components_take",
    str_detect(components_take, "KILLED") ~ "killed",
    str_detect(components_take, "SAMPLED") ~ "sampled",
    TRUE ~ NA_character_
  )) %>% 
  rowwise() %>%
  mutate(matches = list(extract_patterns(components_take))) %>% 
  ungroup() 
 


components_df = animals_df  %>% 
    select(id, components_take)  %>% 
    mutate(across(where(is.character), str_squish)) %>% 
    separate_longer_delim(components_take, delim = "Cmp") %>% 
    filter(components_take != "") %>% 
    mutate(x_action = case_when(
      str_detect(components_take, "SET") ~ "set",
      str_detect(components_take, "CHECK") ~ "check",
       str_detect(components_take, "APPLIED|USED") ~ "apply",
      str_detect(components_take, "REMOVE") ~ "remove",
      str_detect(components_take, "FIRED") ~ "fire",
      str_detect(components_take, "TESTED") ~ "tested",
      TRUE ~ NA_character_
      
    )) %>% 
  
  mutate(x_item = case_when(
    str_detect(components_take, "BARRIER") ~ "barrier", 
    str_detect(components_take, "FENCING") ~ "fencing", 
    str_detect(components_take, "RANGE RIDER") ~ "range_rider",
    str_detect(components_take, "TRAP") ~ "trap",
    str_detect(components_take, "SRES") ~ "snare",
    str_detect(components_take, "CYANIDE") ~ "cyanide",
    str_detect(components_take, "FIREARM") ~ "fire_arm",
    str_detect(components_take, "FIXED WING") ~ "fixed_wing",
    str_detect(components_take, "HELICOPTER") ~ "helicopter",
    str_detect(components_take, "GAS") ~ "gas",
    TRUE ~ NA_character_


  ))  %>% 
   mutate(x_result = case_when(
    str_detect(components_take, "No Components/Take") ~ "no_components_take",
    str_detect(components_take, "KILLED") ~ "killed",
    str_detect(components_take, "TRANSFER") ~ "transfer",
    TRUE ~ NA_character_
  ))  %>% 
  # strict category is lethal if it not mean to dissuade, and intention is to kill aka cyanaide, fire arm, and gas
  mutate(x_lethal_strict = case_when(
    str_detect(x_item, "cyanide|fire_arm|gas") ~ 1,
    TRUE ~ 0

  )) %>% 
  # lax  is lethal if the  items are not, like guns, cyanide, gas

  mutate(x_lethal_lax = case_when(
    str_detect(x_item, "barrier|fencing|range_rider|helicopter|fixed_wing") | is.na(x_item) == TRUE ~ 0,
    TRUE ~ 1

  ))  %>% 
  filter(!is.na(x_item)) %>% 
  mutate(x_item_count = case_when(
    !is.na(x_item) ~ 1,
    TRUE ~ 0 
  ))


# Create summaries
# ==================


component_summary_strict_df = components_df  %>% 
  summarize(lethal_strict_sum = sum(x_lethal_strict), x_items_actions = paste(glue("{x_item}:{x_action}"), collapse = ", "), x_number_items = sum(x_item_count), .by = id)

component_summary_lax_df = components_df  %>% 
  summarize(lethal_lax_sum = sum(x_lethal_lax) , x_items_actions = paste(glue("{x_item}:{x_action}"), collapse = ", "), .by = id) %>% 
  select(-x_items_actions)

combined_summary_df = full_join(component_summary_strict_df, component_summary_lax_df, join_by(id)) 


# Combine back
# =============

animals_df = left_join(animals_df,combined_summary_df, join_by(id) )

Export files

saveRDS(animals_df, here("data/processed/usda_workorders.RDS"))