library(tidyverse) # data manipulation
library(jsonlite) # json manipulation
library(lubridate) # date manipulation
library(odbc)
library(DBI)
library(here) # path management
library(glue)
Process HMDB Data
This data set is the main markers data set from the Historical Markers Database (HMBD). It was created by running the following query from the SQL database that HMDB provided us. This query was given to us by the maintainer of the data set and returns all valid markers.
```{sql}
Select * From dbo.Markers WHERE Accepted is not Null AND Rejected is Null AND Withdrawn is Null AND ReplacedByID is Null
```
Database Tables
Other tables in the database also contain information about the markers. The tables used so far in our analysis are marked in the below list:
To see the full detailed explanation of the tables see the HMDB Schema.
Set up R Libraries
= function(df){
clean_data_frame
= df %>%
df_clean ::clean_names() %>% # clean column names
janitormutate(
across(where(is.character), str_squish)) # clean
return(df_clean)
}
= function(df, column_name){
remove_html
= df %>%
df_clean mutate({{column_name}} := str_replace_all({{column_name}}, "<[^>]*>", "")) %>%
mutate({{column_name}} := str_replace_all({{column_name}}, "\"", "'")) %>%
mutate({{column_name}} := str_replace_all({{column_name}}, "\"", "'"))
return(df_clean)
}
SQL Server
I have a docker container running Microsoft Server on Port 1433.
This bash command downloads the sql driver needed to connect to the sql database.
```{bash}
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)"
brew tap microsoft/mssql-release https://github.com/Microsoft/homebrew-mssql-release
brew update
HOMEBREW_ACCEPT_EULA=Y brew install msodbcsql18 mssql-tools18
```
In the .Renviron
make sure to put the folder where the driver is located:
```{bash}
ODBCSYSINI='/opt/homebrew/etc'
```
Connect to the database
<- dbConnect(odbc::odbc(),
conn driver = "ODBC Driver 18 for SQL Server",
server = "localhost",
database = "HMdbNPR",
UID = "sa",
PWD = "reallyStrongPwd123",
port = 1433,
TrustServerCertificate = "Yes"
)
# dbListTables(conn)
Query the data from the appropiate tables
= dbGetQuery(conn, '
all_valid_markers_raw_df Select * From Markers WHERE Accepted is not Null AND Rejected is Null AND Withdrawn is Null AND ReplacedByID is Null
')
= dbGetQuery(conn, '
related_markers_raw_df Select * From RelatedMarkers
')
= dbGetQuery(conn, '
category_list_raw_df Select * From CategoryList
')
= dbGetQuery(conn, '
categories_raw_df Select * From Categories
')
= dbGetQuery(conn, '
series_list_raw_df Select * From SeriesList WHERE Accepted is not Null and Rejected is Null and Withdrawn is Null
')
= dbGetQuery(conn, '
series_raw_df Select * From Series WHERE Accepted is not Null and Rejected is Null and Withdrawn is Null
')
dbDisconnect(conn)
Clean the data for each table
Make a clean data frame for
- all the valid markers
- the valid markers that are still up (some were taken down or replaced).
For the markers that are still up, I filtered for: - reported missing
is NA & reported_missing_flag
is FALSE & confirmed_missing
is NA, and confirmed_replaced
is NA
```{r}
filter(is.na(reported_missing) & reported_missing_flag == FALSE & is.na(confirmed_missing) & is.na(confirmed_replaced))
```
# Filter for US markers and clean
= all_valid_markers_raw_df %>%
all_valid_markers_df clean_data_frame() %>% # Remove extra spaces
mutate(X_text_original = text,
X_organization_original = organization) %>%
# Clean up original text to remove html
remove_html(X_text_original) %>%
remove_html(title) %>%
mutate(marker_content = glue("title: {title}; text: {X_text_original}")) %>%
mutate(across(where(is.character) & !all_of(c("X_text_original","X_organization_original")), tolower)) %>%
filter(country == "united states of america") %>%
mutate(across(where(is.character), ~replace(., . == "null", NA))) %>% # replace null with NA
relocate(text, .after = title) %>%
relocate(organization, .after = title) %>%
relocate(marker_no, .after = title) %>%
mutate(url = paste0("https://www.hmdb.org/m.asp?m=",marker_id), .before = title) %>% # create url column for easy access to marker page
mutate(year_erected = as.numeric(year_erected)) %>% # convert year erected to numeric
mutate(reported_missing_flag = case_when(
is.na(reported_missing) ~ FALSE,
TRUE ~ TRUE
%>%
)) mutate(is_missing = case_when(
!(is.na(reported_missing) & reported_missing_flag == FALSE & is.na(confirmed_missing) & is.na(confirmed_replaced)) ~ TRUE,
TRUE ~ FALSE
))
= all_valid_markers_df %>%
all_valid_markers_up_df filter(is_missing == FALSE)
Make a categories and series data frame that combines the labels with the identifiers.
# Categories
# ==========
# Marker ID with the category ID
<- category_list_raw_df %>%
category_list_df clean_data_frame() %>%
mutate(across(where(is.character), tolower))
# Category ID with the category label
<- categories_raw_df %>%
categories_df clean_data_frame() %>%
mutate(across(where(is.character), tolower))
# Marker ID with the category label
= left_join(category_list_df, categories_df, by = "category_id") %>%
marker_categories_df select(marker_id, category_id, category)
# Series
# =======
= series_list_raw_df %>%
series_list_df clean_data_frame() %>%
mutate(across(where(is.character), tolower))
= series_raw_df %>%
series_df clean_data_frame() %>%
mutate(across(where(is.character), tolower))
= left_join(series_list_df, series_df, by = "series_id") %>%
marker_series_df select(marker_id, series_id, name, optional_title, comments)
# Related markers
# ===============
= related_markers_raw_df %>%
related_markers_df clean_data_frame() %>%
mutate(across(where(is.character), tolower))
Export the dataframes to csv
<- list("all_valid_markers_df"=all_valid_markers_df,
list_of_dfs "all_valid_markers_up_df" = all_valid_markers_up_df,
"marker_categories_df"= marker_categories_df,
"marker_series_df"=marker_series_df)
iwalk(list_of_dfs, ~write.csv(.x, here("data/processed/hmdb-cleaned/",glue("{.y}.csv"))))