Custom Functions

library(here)

Here is the .R script that is loaded for each notebook that includes custom functions commonly used.


library(gt)

# Clean markers data
# ===================

# All valid markers (ones that were put up and taken down)
all_valid_markers_df = read.csv(here("data/processed/hmdb-cleaned/all_valid_markers_df.csv")) 

all_valid_markers_up_df = read.csv(here("data/processed/hmdb-cleaned/all_valid_markers_up_df.csv"))

marker_categories_df = read.csv(here("data/processed/hmdb-cleaned/marker_categories_df.csv"))

marker_series_df = read.csv(here("data/processed/hmdb-cleaned/marker_series_df.csv"))

# Visualization for notebook
# ==========================

cat_table <- function(df, title, subtitle = NA){

  if(is.na(subtitle)){
    subtitle = ""
  }

  df  %>% 
    gt() %>% 
    tab_header(
    title = md(title),
    subtitle = subtitle
  ) %>% 
    opt_interactive(use_text_wrapping = FALSE)
}



get_marker_category_subset = function(category_str, x_all_markers_df, y_categories_df){

    category_query_df = y_categories_df  %>% 
        filter(category == category_str)

    marker_query_df = x_all_markers_df  %>% 
        filter(marker_id %in% category_query_df$marker_id) %>% 
        mutate(across(where(is.character), ~htmlEscape(., FALSE))) %>% 
        mutate(across(where(is.character), str_squish))  %>% 
        mutate(marker_id = as.character(marker_id))

    return(marker_query_df)   


}

get_marker_series_subset = function(category_str, x_all_markers_df, y_series_df){

    series_query_df = y_series_df  %>% 
        filter(name == category_str)

    marker_query_df = x_all_markers_df  %>% 
        filter(marker_id %in% series_query_df$marker_id) %>% 
        mutate(across(where(is.character), ~htmlEscape(., FALSE))) %>% 
        mutate(across(where(is.character), str_squish))  %>% 
        mutate(marker_id = as.character(marker_id))

    return(marker_query_df)   


}





unnest_all <- function(df) {

nested_cols <- purrr::keep(df, is.list) # Find all nested columns

if (length(nested_cols) > 0) {

# Unnest the first nested column

unnested_df <- df %>%

unnest_wider(col = all_of(names(nested_cols)[1]), names_sep = "__")

# Recursively unnest the remaining nested columns

unnest_all(unnested_df)

} else {

return(df) # Return the unnested data frame

}

}



# Convert each row of the DataFrame to a JSON object and save it to a jsonl file
# Function to convert each row of the DataFrame to a JSON object and save it to a jsonl file
# Function to stream a DataFrame to a JSON Lines file
write_jsonl_stream <- function(df, filename) {
  # Open a connection to the file
  con <- file(filename, "w")
  
  # Stream out the data
  stream_out(df, con, pagesize = nrow(df))
  
  # Close the connection
  close(con)
}


# Function to find matching groups for an organization
find_matching_groups <- function(organization_name) {
  matching_groups <- c()
  
  # Loop through the named list
  for (category_name in names(heritage_groups)) {
    groups <- heritage_groups[[category_name]]$group
    # print(category_name)
    # print("-------")
    # Loop through the groups in the category
    for (group_info in groups) {
      group_name <- group_info$name
     
      if(is.null(group_info$alt)){
        vec_names = c(group_name)

      } else {
        vec_names = c(group_name, group_info$alt)
      }
      
     
      # Check if the organization_name matches either the name or alt
      if (str_detect(organization_name, paste("\\b", vec_names, "\\b", sep = "", collapse = "|"))) {
        matching_groups <- c(matching_groups, group_name)
      }
    }
  }
  
  # Return a comma-separated string of matching group names
  if (length(matching_groups) > 0) {
    return(paste(matching_groups, collapse = ", "))
  } else {
    return(NA)
  }
}


# Southern states
# https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
southern_states = c(
  "texas",
  "oklahoma",
  "arkansas",
  "louisiana",
  "mississippi",
  "alabama",
  "tennessee",
  "kentucky",
  "florida",
  "georgia",
  "south carolina",
  "north carolina",
  "virginia",
  "west virginia",
  "maryland",
  "delaware",
  "district of columbia"
)