# Load analysis dependencies (duplicate library(here) call removed).
library(tidyverse)
library(here)        # project-root-relative paths
library(jsonlite)
library(gt)
library(reticulate)
library(configr)
library(RcppTOML)

# Shared helpers for this analysis (e.g. cat_table()).
source(here("analysis/_functions.R"))
Number of Organizations
Sentence:
+up_first_sunday: And while the markers often look official, the reality is that anyone can put up a marker - more than 35,000 different groups, historical associations, cities, towns, states, individuals have.
+digital: more than 35,000 different groups, societies, organizations, towns, governments and individuals have [put up markers].
+marking_hate_watc: More than 35,000 different groups, historical societies, towns, individuals - all kinds of people - have put up markers.
+two_way_atc: all kinds of people have — more than 35,000 different groups, and individuals, and towns and societies, organizations - have.
The number we report is a minimum number of groups that have put up markers.
So first we start off with the organization
column and find the number of unique organizations.
# Count markers per raw organization string, most frequent first.
# (Reconstructed: the assignment target had been displaced by extraction.)
all_orgs <- all_valid_markers_df %>%
  summarize(n = n(), .by = X_organization_original) %>%
  arrange(desc(n))
We can see that just by getting unique organizations we arrive at 49,196 organizations.
# Render the table of unique (still uncleaned) organization strings.
cat_table(all_orgs, "Unique organizations")
Looking through these, we have a few problems.
Many markers are put up by multiple groups, there is no standard way of separating groups.
Examples below include semi-colons, commas, and dots to separate groups:
- Montana Historical Society; Department of the Interior, National Register of Historic Places
- The Daughters of The American Revolution (DAR) and The State of Texas
- National Park Service • City of Cave Spring • Trail of Tears Association
Some group names have chapters or abbreviations inserted in them
Examples include:
- General Robert A. Toombs, Camp 932, SCV, Chapter 1329, UDC
- SCV stands for Sons of Confederate Veterans and UDC stands for United Daughters of the Confederacy. Each of the groups has a chapter included with them
- General Robert A. Toombs, Camp 932, SCV, Chapter 1329, UDC
To solve this, I used a local large language model called llama2 to separate each row into a single organization:
The code looks like this:
from openai import OpenAI
from pydantic import BaseModel, ValidationError, Field
from typing import List
import instructor
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import json

# Load the exported organization strings.
orgs_df = pd.read_csv(
    "/Users/nick/Documents/projects/GitHub/historical-markers/data/processed/all_orgs.csv"
)
# Skips the first entry — presumably a header/NA artifact; TODO confirm.
data = orgs_df["X_organization_original"].tolist()[1:]
def _save_to_jsonl(file_name, directory, data_dict):
"""
Appends a dictionary to a JSONL file in the specified directory.
If the file does not exist, it creates the file.
Args:
file_name (str): The name of the file.
directory (str): The directory where the file is located or should be created.
data_dict (dict): The dictionary to append.
"""
# Create a Path object for the directory and ensure it exists
= Path(directory)
dir_path =True, exist_ok=True)
dir_path.mkdir(parents
# Create the full file path
= dir_path / file_name
file_path
# Append to the file or create a new one if it doesn't exist
with file_path.open('a') as file:
= json.dumps(data_dict)
json_line file.write(json_line + '\n')
class Organization(BaseModel):
    """Structured LLM response: the organizations found in one row."""
    organizations: List[str] = Field(..., description="A list of organizations from a sentence")
# Patch the client so `response_model` is accepted in create calls.
# Targets a local Ollama server exposing the OpenAI-compatible API.
client = instructor.patch(
    OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama",  # required by the client, but unused by Ollama
    ),
    mode=instructor.Mode.JSON,
)

model_name = "llama2"
# Smoke test: verify the local model can split a toy multi-org string.
# (The needless f-string prefix was dropped: the prompt has no placeholders.)
resp = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": "Identify the organizations listed in this sentence and output as a json list: mcdonalds; papa johns and wendy's",
        }
    ],
    response_model=Organization,
)

response_json = resp.model_dump(mode="json")
print(response_json)
= "llama2"
model_name
for org in tqdm(data):
try:
= client.chat.completions.create(
resp =model_name,
model=[
messages
{"role": "user",
"content": f"Identify the organizations listed in this sentence and output as a json list: {org} ",
}
],=Organization)
response_model
= resp.model_dump(mode = "json")
response_json "original"] = org
response_json["model"] = model_name
response_json["org_separated.jsonl", "../data/processed/local-llm/org/", response_json)
_save_to_jsonl(
except ValidationError as exc:
= repr(exc.errors()[0]['type'])
error = {}
response_json "error"] = error
response_json["original"] = org
response_json["model"] = model_name
response_json["org_separated_error.jsonl", "../data/processed/local-llm/org/", response_json) _save_to_jsonl(
# Stream the model output back into R (one JSON object per line).
ai_sep <- stream_in(file(here("data/processed/local-llm/org/org_separated.jsonl")))
Found 500 records...
Found 1000 records...
Found 1500 records...
Found 2000 records...
Found 2500 records...
Found 3000 records...
Found 3500 records...
Found 4000 records...
Found 4500 records...
Found 5000 records...
Found 5500 records...
Found 6000 records...
Found 6500 records...
Found 7000 records...
Found 7500 records...
Found 8000 records...
Found 8500 records...
Found 9000 records...
Found 9500 records...
Found 10000 records...
Found 10500 records...
Found 11000 records...
Found 11500 records...
Found 12000 records...
Found 12500 records...
Found 13000 records...
Found 13500 records...
Found 14000 records...
Found 14500 records...
Found 15000 records...
Found 15500 records...
Found 16000 records...
Found 16500 records...
Found 17000 records...
Found 17500 records...
Found 18000 records...
Found 18500 records...
Found 19000 records...
Found 19500 records...
Found 20000 records...
Found 20500 records...
Found 21000 records...
Found 21500 records...
Found 22000 records...
Found 22500 records...
Found 23000 records...
Found 23500 records...
Found 24000 records...
Found 24500 records...
Found 25000 records...
Found 25500 records...
Found 26000 records...
Found 26500 records...
Found 27000 records...
Found 27500 records...
Found 28000 records...
Found 28500 records...
Found 29000 records...
Found 29500 records...
Found 30000 records...
Found 30500 records...
Found 31000 records...
Found 31500 records...
Found 32000 records...
Found 32500 records...
Found 33000 records...
Found 33500 records...
Found 34000 records...
Found 34500 records...
Found 35000 records...
Found 35500 records...
Found 36000 records...
Found 36500 records...
Found 37000 records...
Found 37500 records...
Found 38000 records...
Found 38500 records...
Found 39000 records...
Found 39500 records...
Found 40000 records...
Found 40500 records...
Found 41000 records...
Found 41500 records...
Found 42000 records...
Found 42500 records...
Found 43000 records...
Found 43500 records...
Found 44000 records...
Found 44500 records...
Found 45000 records...
Found 45500 records...
Found 46000 records...
Found 46500 records...
Found 46791 records...
Imported 46791 records. Simplifying...
# Parse each JSONL record individually, then spread the fields (and the
# variable-length organizations list) into wide columns.
lines <- readLines(here("data/processed/local-llm/org/org_separated.jsonl"))
json_list <- lapply(lines, jsonlite::fromJSON)

ai_org_raw_df <- tibble(data = json_list) %>%
  unnest_wider(data) %>%
  unnest_wider(organizations, names_sep = "_")
# Drop a known junk row, pivot the organizations_1..k columns long (one row
# per separated organization), then count occurrences of each unique name.
ai_org_df <- ai_org_raw_df %>%
  filter(!original %in% c(": On base of the marker, both front and back, is a list of donors and sponsors.")) %>%
  select(-model, -original) %>%
  pivot_longer(cols = everything(), names_to = "org_placement", values_to = "organization") %>%
  filter(!is.na(organization)) %>%
  summarize(n = n(), .by = organization) %>%
  arrange(desc(n))

# write.csv(ai_org_df, here("data/processed/ai_org.csv"))
After separating each row into individual organizations and getting the unique categories we increase our count to 54,200 organizations.
The next step is to handle misspellings, different spelling or names for the same organization, and acronyms. I brought the data into OpenRefine and used their clustering algorithms. When I came across chapters I would research and fill in the actual national name when I could.
open_refine_clustered_raw_df <- read_csv(here("data/processed/ai-org-csv-openrefine.csv"))

# Count rows per OpenRefine cluster name.
open_refine_clustered_df <- open_refine_clustered_raw_df %>%
  summarize(n = n(), .by = organization_cluster)
After OpenRefine, I arrived at 49,735 organizations.
However, there are a few caveats to the pipeline:
- I wasn’t able to fill in the national group for every chapter
- The separating of the groups was not perfect
- The clustering of the groups is not perfect
Therefore I did the following:
- removed any organization that had the words post, chapter, chapters, unit, units, camp, camps, or no. in their name. The assumption here is that we would have gotten their national group in another marker (and even if we didn’t we are looking for a conservative number to report)
- removed any organization that was just a date (a date might have become an organization name in the separating phase)
- removed any group that was just one word (a lot of words got separated incorrectly from the main group name so therefore wanting to arrive at a conservative estimate)
# Flag clusters to drop:
#   1. names containing local-subunit words (post/chapter/camp/...) — the
#      national group is assumed to appear on another marker;
#   2./3. strings that are just a date (month + day, optional year) —
#      artifacts of the LLM separation step;
#   4. anything containing "erected";
#   5. single-word names (often fragments split off a longer group name).
# NOTE(review): three regex bugs fixed — the day pattern "[1-31]{1,2}" was a
# character class matching only digits 1-3 (not days 1-31); "no." had an
# unescaped dot; " units" was duplicated in the alternation.
organization_cleaning <- open_refine_clustered_df %>%
  mutate(remove = case_when(
    str_detect(organization_cluster, " post| chapter| chapters| units| camp| camps| unit| no\\.") ~ 1,
    str_detect(organization_cluster, "(january|february|march|april|may|june|july|august|september|october|november|december)\\s[0-9]{1,2},\\s[0-9]{4}") ~ 1,
    str_detect(organization_cluster, "(january|february|march|april|may|june|july|august|september|october|november|december)\\s[0-9]{1,2}") ~ 1,
    str_detect(organization_cluster, "erected") ~ 1,
    str_count(organization_cluster, "\\S+") == 1 ~ 1,
    TRUE ~ 0
  ))
# Keep only the flagged clusters, for the "removed" report below.
orgs_to_remove <- organization_cleaning %>%
  filter(remove == 1) %>%
  select(organization_cluster)
Doing so removed 7,808 organizations.
# Render the table of organizations dropped by the cleaning rules.
cat_table(orgs_to_remove, "Organizations Removed")
# Everything not flagged for removal, sorted alphabetically.
final_orgs <- organization_cleaning %>%
  filter(remove == 0) %>%
  select(organization_cluster) %>%
  arrange(organization_cluster)

# write.csv(final_orgs, here("data/processed/final_orgs_parsed.csv"))
I brought this cleaned list back into OpenRefine, did a bit more clustering, and then manually clustered groups that OpenRefine was not getting.
= c("post", "chapter", "chapters", "unit", "units","camp", "camps", "camp", "no.", "troop")
org_phrases
= read.csv(here("data/processed/final-orgs-parsed-2.csv"))
final_orgs_2
= final_orgs_2 %>%
final summarize(n = n(), .by = organization_cluster_2) %>%
mutate(organization = gsub("[^[:alnum:] ]", "", organization_cluster_2)) %>%
mutate(final = case_when(
str_detect(organization_cluster_2, "lion club|lions club|lion's club") ~ "lions club",
str_detect(organization_cluster_2, "society of colonial dames") ~ "society of colonial dames",
str_detect(organization_cluster_2, "rotary") ~ "rotary club",
str_detect(organization_cluster_2, "daughters of the american colonists") ~ "daughters of the american colonists",
str_detect(organization_cluster_2, "sons of confederate veterans|sons of the confederate veterans") ~ "sons of confederate veterans",
str_detect(organization_cluster_2, "ladies memorial association") ~ "ladies memorial association",
str_detect(organization_cluster_2, "daughters of union veterans") ~ "daughters of union veterans",
str_detect(organization_cluster_2, "sons of union veterans") ~ "sons of union veterans",
str_detect(organization_cluster_2, "e clampus|e clampus virtus") ~ "e clampus vitus",
str_detect(organization_cluster_2, "native sons") ~ "native sons of the golden west",
str_detect(organization_cluster_2, "pomeroy") ~ "william g pomeroy",
str_detect(organization_cluster_2, "boy scout") ~ "boy scout",
TRUE ~ organization_cluster_2)) %>%
summarize(n = n(), .by = final) %>%
mutate(start = case_when(
str_detect(str_sub(final, 1, 1), "^[a-zA-Z]") ~ str_sub(final, 1, 1), # Checks if the first character is a letter
str_detect(str_sub(final, 1, 1), "^[0-9]") ~ "number", # Checks if the first character is a number
TRUE ~ "symbol" # If the first character is neither letter nor number, it is considered a symbol
%>%
)) filter(!str_detect(final, paste("\\b", org_phrases, "\\b", sep = "", collapse = "|"))) %>%
arrange(final)
= final
final_2dot5
#write_csv(final_2dot5, here("data/processed/final_2dot5.csv"))
# Export names starting with "a" for a manual spot check.
final_2dot5 %>%
  mutate(start = case_when(
    str_detect(str_sub(final, 1, 1), "^[a-zA-Z]") ~ str_sub(final, 1, 1),  # first char is a letter
    str_detect(str_sub(final, 1, 1), "^[0-9]") ~ "number",                 # first char is a digit
    TRUE ~ "symbol"                                                        # neither letter nor digit
  )) %>%
  filter(start == "a") %>%
  write_csv(here("data/processed/a.csv"))
Checking by hand revealed a few more ways to handle duplicates programmatically: removing "the", "city of", "village of", and "borough of" from the start of a string; replacing "&" with "and"; removing non-alphanumeric characters; and removing "inc".
After doing this I regrouped the organizations.
final_hand_checked_raw_df <- read_csv(here("data/handmade/final orgs hand checked - Sheet2.csv"))

# Normalize the hand-checked names: drop rows marked for removal, strip
# leading "the"/"city of"/"village of"/"borough of", unify "&" as "and",
# remove punctuation and the standalone word "inc", then regroup.
# NOTE(review): the prefix patterns now end in "\\b" — the original "^the"
# also clipped names like "theatre guild" down to "atre guild".
final_hand_checked_df <- final_hand_checked_raw_df %>%
  mutate(remove = as.character(remove)) %>%
  filter(is.na(remove)) %>%
  mutate(final = str_squish(final)) %>%
  mutate(final = str_squish(str_remove_all(final, "^the\\b"))) %>%
  mutate(final = str_squish(str_remove_all(final, "^city of\\b"))) %>%
  mutate(final = str_squish(str_remove_all(final, "^village of\\b"))) %>%
  mutate(final = str_squish(str_remove_all(final, "^borough of\\b"))) %>%
  mutate(final = str_squish(str_replace_all(final, "&", "and"))) %>%
  mutate(final = str_replace_all(final, "[^a-zA-Z0-9 ]", "")) %>%
  mutate(final = str_replace_all(final, "\\binc\\b", "")) %>%
  mutate(final = str_squish(final)) %>%
  summarize(n = n(), .by = final) %>%
  arrange(desc(n))
At the end of this methodology we are left with 37,601 organizations.
It is possible that organizations changed names over the years, therefore I rounded down to 35,000 to account for that.