Process HMDB Data

This data set is the main markers data set from the Historical Markers Database (HMBD). It was created by running the following query from the SQL database that HMDB provided us. This query was given to us by the maintainer of the data set and returns all valid markers.

```{sql}
Select * From dbo.Markers WHERE Accepted is not Null AND Rejected is Null AND Withdrawn is Null AND ReplacedByID is Null
```

Database Tables

Other tables in the database also contain information about the markers. The tables used so far in our analysis are marked in the below list:

To see the full detailed explanation of the tables see the HMDB Schema.

Set up R Libraries

library(tidyverse) # data manipulation
library(jsonlite) # json manipulation
library(lubridate) # date manipulation
library(odbc)
library(DBI) 
library(here) # path management
library(glue)
clean_data_frame = function(df){
  
  df_clean = df %>% 
    janitor::clean_names() %>% # clean column names
    mutate( 
           across(where(is.character), str_squish)) # clean 
  
    return(df_clean)
  
}


remove_html = function(df, column_name){

  df_clean = df  %>% 
    mutate({{column_name}} := str_replace_all({{column_name}}, "<[^>]*>", "")) %>% 
    mutate({{column_name}} := str_replace_all({{column_name}}, "\"", "'")) %>% 
    mutate({{column_name}} := str_replace_all({{column_name}}, "\"", "'"))

  return(df_clean)

}

SQL Server

I have a docker container running Microsoft Server on Port 1433.

This bash command downloads the sql driver needed to connect to the sql database.

```{bash}
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)"
brew tap microsoft/mssql-release https://github.com/Microsoft/homebrew-mssql-release
brew update
HOMEBREW_ACCEPT_EULA=Y brew install msodbcsql18 mssql-tools18

```

In the .Renviron make sure to put the folder where the driver is located:

```{bash}

ODBCSYSINI='/opt/homebrew/etc'

```

Connect to the database

conn <- dbConnect(odbc::odbc(),
                      driver = "ODBC Driver 18 for SQL Server",
                      server = "localhost",
                      database = "HMdbNPR",
                      UID = "sa",
                      PWD = "reallyStrongPwd123",
                      port = 1433,
                      TrustServerCertificate = "Yes"
)


# dbListTables(conn)

Query the data from the appropiate tables

all_valid_markers_raw_df = dbGetQuery(conn, '
Select * From Markers WHERE Accepted is not Null AND Rejected is Null AND Withdrawn is Null AND ReplacedByID is Null
')



related_markers_raw_df = dbGetQuery(conn, '
Select * From RelatedMarkers
')

category_list_raw_df = dbGetQuery(conn, '
Select * From CategoryList
')

categories_raw_df = dbGetQuery(conn, '
Select * From Categories
')

series_list_raw_df = dbGetQuery(conn, '
Select * From SeriesList WHERE Accepted is not Null and Rejected is Null and Withdrawn is Null
')

series_raw_df = dbGetQuery(conn, '
Select * From Series WHERE Accepted is not Null and Rejected is Null and Withdrawn is Null
')


dbDisconnect(conn)

Clean the data for each table

Make a clean data frame for

  1. all the valid markers
  2. the valid markers that are still up (some were taken down or replaced).

For the markers that are still up, I filtered for: - reported missing is NA & reported_missing_flag is FALSE & confirmed_missingis NA, and confirmed_replaced is NA

```{r}

 filter(is.na(reported_missing) & reported_missing_flag == FALSE & is.na(confirmed_missing) & is.na(confirmed_replaced)) 

```
# Filter for US markers and clean 
all_valid_markers_df = all_valid_markers_raw_df %>% 
  clean_data_frame() %>% # Remove extra spaces
  mutate(X_text_original = text,
         X_organization_original = organization)  %>% 
  # Clean up original text to remove html
  remove_html(X_text_original)  %>% 
  remove_html(title) %>% 
  mutate(marker_content = glue("title: {title}; text: {X_text_original}"))  %>% 
  mutate(across(where(is.character) & !all_of(c("X_text_original","X_organization_original")), tolower)) %>% 
  filter(country == "united states of america") %>%
  mutate(across(where(is.character), ~replace(., . == "null", NA)))  %>% # replace null with NA
  relocate(text, .after = title) %>% 
  relocate(organization, .after = title) %>% 
  relocate(marker_no, .after = title) %>% 
  mutate(url = paste0("https://www.hmdb.org/m.asp?m=",marker_id), .before = title)  %>%  # create url column for easy access to marker page  
  mutate(year_erected = as.numeric(year_erected))  %>%  # convert year erected to numeric
  mutate(reported_missing_flag = case_when(
    is.na(reported_missing) ~ FALSE,
    TRUE ~ TRUE
  )) %>% 
  mutate(is_missing = case_when(

    !(is.na(reported_missing) & reported_missing_flag == FALSE & is.na(confirmed_missing) & is.na(confirmed_replaced)) ~ TRUE, 
    TRUE ~ FALSE

  ))

all_valid_markers_up_df = all_valid_markers_df %>% 
  filter(is_missing == FALSE)

Make a categories and series data frame that combines the labels with the identifiers.

# Categories
# ==========

# Marker ID with the category ID
category_list_df <- category_list_raw_df %>% 
  clean_data_frame() %>% 
  mutate(across(where(is.character), tolower))

# Category ID with the category label
categories_df <- categories_raw_df %>% 
  clean_data_frame()  %>% 
  mutate(across(where(is.character), tolower))

# Marker ID with the category label
marker_categories_df  = left_join(category_list_df, categories_df, by = "category_id") %>% 
  select(marker_id, category_id, category)



# Series 
# =======

series_list_df = series_list_raw_df %>% 
  clean_data_frame()  %>% 
  mutate(across(where(is.character), tolower))

series_df = series_raw_df  %>% 
  clean_data_frame()  %>% 
  mutate(across(where(is.character), tolower))

marker_series_df = left_join(series_list_df, series_df, by = "series_id") %>% 
  select(marker_id, series_id, name, optional_title, comments)


# Related markers 
# ===============

related_markers_df = related_markers_raw_df %>% 
  clean_data_frame()  %>% 
  mutate(across(where(is.character), tolower))

Export the dataframes to csv

list_of_dfs <- list("all_valid_markers_df"=all_valid_markers_df, 
                    "all_valid_markers_up_df" = all_valid_markers_up_df,
                    "marker_categories_df"= marker_categories_df,
                    "marker_series_df"=marker_series_df)

iwalk(list_of_dfs, ~write.csv(.x, here("data/processed/hmdb-cleaned/",glue("{.y}.csv"))))