---
title: Analyze the jsonl
format:
html:
code-fold: false
code-tools: true
execute:
eval: false
---
```{r}
library(tidyverse)
library(jsonlite)
library(here)
library(glue)
```
## Reshape json
```{r}
arrange_matrix <- function(lst, i) {
# Extract row and column indices
positions <- sapply(names(lst), function(x) as.numeric(unlist(strsplit(gsub("[^0-9,]", "", x), ","))))
rows <- max(positions[1, ]) + 1
cols <- max(positions[2, ]) + 1
# Create an empty matrix with the determined dimensions
mat <- matrix(NA, nrow = rows, ncol = cols)
# Populate the matrix using the keys and values from the list
for (key in names(lst)) {
position <- as.numeric(unlist(strsplit(gsub("[^0-9,]", "", key), ","))) + 1
mat[position[1], position[2]] <- lst[[key]]
}
# Convert the matrix to a dataframe
df <- as.data.frame(mat, stringsAsFactors = FALSE)
if(ncol(df) == 2){
df = df %>%
rename(column = V1, value = V2) %>%
mutate(entry = i) %>%
mutate(correct = TRUE)
} else {
df = df %>%
rename(column = V1) %>%
mutate(entry = i) %>%
mutate(correct = FALSE)
}
return(df)
}
foia_json_to_df = function(path, label){
con <- file(path, "r")
# Initialize an empty list to store the data
json_data <- list()
# Read and parse the file line by line
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
parsed_line <- fromJSON(line)
json_data <- append(json_data, list(parsed_line))
}
# Close the connection
close(con)
df = tibble()
for(i in seq(length(json_data))){
temp = arrange_matrix(json_data[[i]], i)
df = bind_rows(df, temp)
}
df = df %>%
mutate(file = label)
return(df)
}
```
## Cleaning functions
```{r}
clean_parsed = function(df){
phrases_to_remove <- c("\\(b\\) \\(6\\)", "\\(b\\) 6\\)" ,"\\(0\\) \\(6\\) \\(0\\) \\(35 \\(a\\)",
"b \\(6", "\\(o\\) \\(6\\)",
"\\(0\\) \\(6\\)\\, \\(0\\) \\(35 \\(a\\)",
"\\!",
"\\?",
"control",
"entered/edited by lori warnell",
"control",
"edit",
"flag",
"sampling",
"disposition",
"technical",
"assistance",
"aerial",
"direct",
"entered/edited by dalin tidwell",
"entered/ed by john steuber",
"entered/ed by dalin tidwell",
"≤ back to report tab test details 05-03-2023",
"< back to report tab test details 05-03-2023",
"≤ back to report tab",
"entered/edited by dalin tidwell",
" d ",
" z ",
"8")
df_clean = df %>%
mutate(column = str_squish(column), value = str_squish(value)) %>%
# Get rid of blank cells
# ==========================
filter(!(is.na(column) == TRUE & is.na(value) == TRUE)) %>%
mutate(
column = case_when(
is.na(column) & grepl("^Activity:", value) ~ "Activity:",
TRUE ~ column
),
value = case_when(
column == "Activity:" & grepl("^Activity:", value) ~ sub("^Activity: ", "", value),
TRUE ~ value
)
) %>%
# When there is just a number followed by period and worktask in other column combine
# =====================================================================================
mutate(column = case_when(
str_detect(column, "\\b\\d+\\.\\s*") & str_detect(value, "WorkTask") ~ paste0(column, " ", value),
TRUE ~ column)) %>%
# Separate activity number
# ===================
mutate(
value = case_when(
grepl("^Activity: (\\d+)", column) ~ {
activity_num <- sub("^Activity: (\\d+).*", "\\1", column)
paste0(activity_num, " ", value)
},
TRUE ~ value
),
column = case_when(
grepl("^Activity: (\\d+)", column) ~ "Activity:",
TRUE ~ column
)
) %>%
# Make flagged comments a single column
# ====================================
mutate(
value = case_when(
str_detect(column, "FlaggedX") | str_detect(value, "FlaggedX") ~ paste0(column, " ", value),
TRUE ~ value
),
column = case_when(
str_detect(column, "FlaggedX") | str_detect(value, "FlaggedX") ~ "Flagged:",
TRUE ~ column
)
# Combine columns that overlapped page
# ======================================
) %>%
summarize(value = paste(value, collapse = ", "), .by = c(entry, column, file)) %>%
mutate(lag_column = lag(column)) %>%
mutate(column = case_when(
is.na(column) == TRUE ~ lag_column,
TRUE ~ column
)) %>%
summarize(value = paste(value, collapse = "| "), .by = c(entry, column, file)) %>%
# Standardize column names
# =========================
mutate(column = tolower(str_squish(column))) %>%
## Fix Worktask
# ===================
mutate(column = str_replace_all(column, "work task", "worktask")) %>%
mutate(column = case_when(str_detect(column, "worktask|work task") ~ str_replace_all(column, "\\b\\d+\\.\\s*", ""),
TRUE ~ column)) %>%
mutate(column = case_when(
str_detect(column, "worktask|work task") ~ str_replace_all(column,
setNames(rep("", length(phrases_to_remove)), phrases_to_remove)),
TRUE ~ column),
value = case_when(
str_detect(column, "worktask|work task") ~ str_replace_all(value,
setNames(rep("", length(phrases_to_remove)),
paste0("(?i)", phrases_to_remove))),
TRUE ~ value)
) %>%
mutate(column = case_when(
column == "components & take/samples:" ~ "components & take:",
TRUE ~ column
)) %>%
# Stuff after colon
# =====================
mutate(
# Extract the text after the colon (if present)
value_extracted = str_extract(column, "(?<=: ).*"),
# Combine the extracted text with the value column
value = ifelse(is.na(value), value_extracted, paste(value_extracted, value, sep = " ")),
# Remove the text after the colon in the column column
column = str_extract(column, "^[^:]+")
) %>%
select(-value_extracted) %>%
mutate(value = str_remove_all(value, "NA"))
return(df_clean)
}
make_df_shape = function(df){
df_wide = df %>%
pivot_wider(
names_from = column,
values_from = value,
values_fn = list(value = ~ paste(unique(.), collapse = ", "))
) %>%
janitor::clean_names()
return(df_wide)
}
```
## Clean each json file
```{r}
# 89561_1
# ============
df_8956_1 = foia_json_to_df(here("data/processed/montana-foia-parsed/8956_1_result.jsonl"), "8956_1")
df_8956_1_clean = df_8956_1 %>%
clean_parsed()
df_89561_1_final = df_8956_1_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
# 9430_2
# ========
df_9430_2 = foia_json_to_df(here("data/processed/montana-foia-parsed/9430_2_result.jsonl"), "9430_2")
df_9430_2_clean = df_9430_2 %>%
clean_parsed()
df_9430_2_final = df_9430_2_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
# 9557_1
# ==========
df_9557_1 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_1_result.jsonl"), "9557_1")
df_9557_1_clean = df_9557_1 %>%
clean_parsed()
df_9557_1_final = df_9557_1_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
# 9557_2
# ==========
df_9557_2 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_2_result.jsonl"), "9557_2")
df_9557_2_clean = df_9557_2 %>%
clean_parsed()
df_9557_2_final = df_9557_2_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
# 9557_3
# ==========
df_9557_3 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_3_result.jsonl"), "9557_3")
df_9557_3_clean = df_9557_3 %>%
clean_parsed()
df_9557_3_final = df_9557_3_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
# 9557_4
# ==========
df_9557_4 = foia_json_to_df(here("data/processed/montana-foia-parsed/9557_4_result.jsonl"), "9557_4")
df_9557_4_clean = df_9557_4 %>%
clean_parsed()
df_9557_4_final = df_9557_4_clean %>%
make_df_shape() %>%
select(c(file, entry, worktask_for, work_date, agreement, property,activity,activity_measurements, conflict_loss, components_take, remarks, project, flagged))
```
## Combine files
```{r}
# no_physical_harm_words = c("BARRIER", "FENCING", "RANGE RIDER")
# physical_harm_words = c("TRAP", "SRES", "FOOTHOLD")
# death_aim_words = c("CYANIDE", "FIREARM", "FIXED WING")
# action_maintenance = c("CHECKED")
extract_date_outside_parentheses <- function(text) {
str_extract_all(text, "(?<!\\()\\b\\d{2}/\\d{2}/\\d{4}\\b(?!\\))") %>%
unlist() %>%
.[!grepl("\\(", .)]
}
work_orders_df_raw = bind_rows(df_89561_1_final,df_9430_2_final,df_9557_1_final,df_9557_2_final, df_9557_3_final,df_9557_4_final)
work_orders_df_clean = work_orders_df_raw %>%
mutate(x_entry_date = gsub(".*\\(Entry Date: (.*?)\\).*", "\\1", work_date),
x_work_date = map_chr(work_date, ~ paste(extract_date_outside_parentheses(.x), collapse = ", ")), .after = work_date
) %>%
# mutate(x_work_date = str_remove(x_work_date, "\\(Entry Date:")) %>%
mutate(
x_work_date = as.Date(x_work_date, format = "%m/%d/%Y"),
x_entry_date = as.Date(x_entry_date, format = "%m/%d/%Y"),
x_work_year = lubridate::year(x_work_date)
)
extract_patterns <- function(text) {
matches <- str_extract_all(text, "\\d+ EA \\w+ KILLED")[[1]]
return(matches)
}
animals_df = work_orders_df_clean %>%
mutate(across(where(is.character), str_squish)) %>%
mutate(id = glue("{file}-{entry}"), .before = file) %>%
# Replace spelling errors
mutate(components_take = str_replace_all(components_take, "Стр", "Cmp")) %>%
mutate(components_take = str_replace_all(components_take, " BA ", " EA ")) %>%
mutate(x_threat_of = case_when(
str_detect(tolower(conflict_loss), "damage threat of|damage threat") ~ TRUE,
TRUE ~ FALSE
)) %>%
mutate(x_predation = case_when(
str_detect(tolower(conflict_loss), "predation" ) ~ TRUE,
str_detect(tolower(conflict_loss), "injury") ~ TRUE,
TRUE ~ FALSE
)) %>%
mutate(x_reason = case_when(
x_threat_of == TRUE & x_predation == TRUE ~ "predation",
x_threat_of == FALSE & x_predation == TRUE ~ "predation",
x_threat_of == TRUE & x_predation == FALSE ~ "threat",
TRUE ~ "neither"
)) %>%
mutate(x_result = case_when(
str_detect(components_take, "No Components/Take|No Components/ Take") ~ "no_components_take",
str_detect(components_take, "KILLED") ~ "killed",
str_detect(components_take, "SAMPLED") ~ "sampled",
TRUE ~ NA_character_
)) %>%
rowwise() %>%
mutate(matches = list(extract_patterns(components_take))) %>%
ungroup()
components_df = animals_df %>%
select(id, components_take) %>%
mutate(across(where(is.character), str_squish)) %>%
separate_longer_delim(components_take, delim = "Cmp") %>%
filter(components_take != "") %>%
mutate(x_action = case_when(
str_detect(components_take, "SET") ~ "set",
str_detect(components_take, "CHECK") ~ "check",
str_detect(components_take, "APPLIED|USED") ~ "apply",
str_detect(components_take, "REMOVE") ~ "remove",
str_detect(components_take, "FIRED") ~ "fire",
str_detect(components_take, "TESTED") ~ "tested",
TRUE ~ NA_character_
)) %>%
mutate(x_item = case_when(
str_detect(components_take, "BARRIER") ~ "barrier",
str_detect(components_take, "FENCING") ~ "fencing",
str_detect(components_take, "RANGE RIDER") ~ "range_rider",
str_detect(components_take, "TRAP") ~ "trap",
str_detect(components_take, "SRES") ~ "snare",
str_detect(components_take, "CYANIDE") ~ "cyanide",
str_detect(components_take, "FIREARM") ~ "fire_arm",
str_detect(components_take, "FIXED WING") ~ "fixed_wing",
str_detect(components_take, "HELICOPTER") ~ "helicopter",
str_detect(components_take, "GAS") ~ "gas",
TRUE ~ NA_character_
)) %>%
mutate(x_result = case_when(
str_detect(components_take, "No Components/Take") ~ "no_components_take",
str_detect(components_take, "KILLED") ~ "killed",
str_detect(components_take, "TRANSFER") ~ "transfer",
TRUE ~ NA_character_
)) %>%
# strict category is lethal if it not mean to dissuade, and intention is to kill aka cyanaide, fire arm, and gas
mutate(x_lethal_strict = case_when(
str_detect(x_item, "cyanide|fire_arm|gas") ~ 1,
TRUE ~ 0
)) %>%
# lax is lethal if the items are not, like guns, cyanide, gas
mutate(x_lethal_lax = case_when(
str_detect(x_item, "barrier|fencing|range_rider|helicopter|fixed_wing") | is.na(x_item) == TRUE ~ 0,
TRUE ~ 1
)) %>%
filter(!is.na(x_item)) %>%
mutate(x_item_count = case_when(
!is.na(x_item) ~ 1,
TRUE ~ 0
))
# Create summaries
# ==================
component_summary_strict_df = components_df %>%
summarize(lethal_strict_sum = sum(x_lethal_strict), x_items_actions = paste(glue("{x_item}:{x_action}"), collapse = ", "), x_number_items = sum(x_item_count), .by = id)
component_summary_lax_df = components_df %>%
summarize(lethal_lax_sum = sum(x_lethal_lax) , x_items_actions = paste(glue("{x_item}:{x_action}"), collapse = ", "), .by = id) %>%
select(-x_items_actions)
combined_summary_df = full_join(component_summary_strict_df, component_summary_lax_df, join_by(id))
# Combine back
# =============
animals_df = left_join(animals_df,combined_summary_df, join_by(id) )
```
## Export files
```{r}
saveRDS(animals_df, here("data/processed/usda_workorders.RDS"))
```