library(here)
Custom Functions
Here is the .R script that is loaded for each notebook that includes custom functions commonly used.
library(gt)
# Clean markers data
# ===================
# All valid markers (ones that were put up and taken down)
= read.csv(here("data/processed/hmdb-cleaned/all_valid_markers_df.csv"))
all_valid_markers_df
= read.csv(here("data/processed/hmdb-cleaned/all_valid_markers_up_df.csv"))
all_valid_markers_up_df
= read.csv(here("data/processed/hmdb-cleaned/marker_categories_df.csv"))
marker_categories_df
= read.csv(here("data/processed/hmdb-cleaned/marker_series_df.csv"))
marker_series_df
# Visualization for notebook
# ==========================
<- function(df, title, subtitle = NA){
cat_table
if(is.na(subtitle)){
= ""
subtitle
}
%>%
df gt() %>%
tab_header(
title = md(title),
subtitle = subtitle
%>%
) opt_interactive(use_text_wrapping = FALSE)
}
= function(category_str, x_all_markers_df, y_categories_df){
get_marker_category_subset
= y_categories_df %>%
category_query_df filter(category == category_str)
= x_all_markers_df %>%
marker_query_df filter(marker_id %in% category_query_df$marker_id) %>%
mutate(across(where(is.character), ~htmlEscape(., FALSE))) %>%
mutate(across(where(is.character), str_squish)) %>%
mutate(marker_id = as.character(marker_id))
return(marker_query_df)
}
= function(category_str, x_all_markers_df, y_series_df){
get_marker_series_subset
= y_series_df %>%
series_query_df filter(name == category_str)
= x_all_markers_df %>%
marker_query_df filter(marker_id %in% series_query_df$marker_id) %>%
mutate(across(where(is.character), ~htmlEscape(., FALSE))) %>%
mutate(across(where(is.character), str_squish)) %>%
mutate(marker_id = as.character(marker_id))
return(marker_query_df)
}
<- function(df) {
unnest_all
<- purrr::keep(df, is.list) # Find all nested columns
nested_cols
if (length(nested_cols) > 0) {
# Unnest the first nested column
<- df %>%
unnested_df
unnest_wider(col = all_of(names(nested_cols)[1]), names_sep = "__")
# Recursively unnest the remaining nested columns
unnest_all(unnested_df)
else {
}
return(df) # Return the unnested data frame
}
}
# Convert each row of the DataFrame to a JSON object and save it to a jsonl file
# Function to convert each row of the DataFrame to a JSON object and save it to a jsonl file
# Function to stream a DataFrame to a JSON Lines file
<- function(df, filename) {
write_jsonl_stream # Open a connection to the file
<- file(filename, "w")
con
# Stream out the data
stream_out(df, con, pagesize = nrow(df))
# Close the connection
close(con)
}
# Function to find matching groups for an organization
<- function(organization_name) {
find_matching_groups <- c()
matching_groups
# Loop through the named list
for (category_name in names(heritage_groups)) {
<- heritage_groups[[category_name]]$group
groups # print(category_name)
# print("-------")
# Loop through the groups in the category
for (group_info in groups) {
<- group_info$name
group_name
if(is.null(group_info$alt)){
= c(group_name)
vec_names
else {
} = c(group_name, group_info$alt)
vec_names
}
# Check if the organization_name matches either the name or alt
if (str_detect(organization_name, paste("\\b", vec_names, "\\b", sep = "", collapse = "|"))) {
<- c(matching_groups, group_name)
matching_groups
}
}
}
# Return a comma-separated string of matching group names
if (length(matching_groups) > 0) {
return(paste(matching_groups, collapse = ", "))
else {
} return(NA)
}
}
# Southern states
# https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
= c(
southern_states "texas",
"oklahoma",
"arkansas",
"louisiana",
"mississippi",
"alabama",
"tennessee",
"kentucky",
"florida",
"georgia",
"south carolina",
"north carolina",
"virginia",
"west virginia",
"maryland",
"delaware",
"district of columbia"
)