#################
# Source codes for A Researcher's Guide to Using Electronic Health Records, 2nd ed
# https://www.routledge.com/A-Researchers-Guide-to-Using-Electronic-Health-Records-From-Planning-to/Goldstein/p/book/9781032169590
# (c) Neal D. Goldstein, 2023, https://www.goldsteinepi.com/books/ehr/index.html
#################


### Code #3.1. Saving the birthwt dataset from the MASS package as a CSV. ###

#load the MASS package
library(MASS)

#load the birthwt dataset into the environment
birthwt=birthwt

#View the dataset in RStudio
View(birthwt)

#save as a CSV
#note that missing values are encoded as empty strings,
#but can be encoded
#in any pattern recognizable by the destination, such as
#"." for SAS
write.csv(birthwt, file="birthwt.csv", na="", row.names=F)


### Code #4.1. Using SQL to extract and subset data from a Microsoft SQL Server database. ###

#connect to SQL Server database using the RODBC package
library(RODBC)

#setup the channel using the ODBC data source
#ImmunizationDB
channel = odbcConnect(dsn="ImmunizationDB", readOnlyOptimize=T)

#issue the query
immunizations = sqlQuery(channel, paste("SELECT [patient id], [patient name], [gender], [address], [antigen], [date administered] FROM [patient demographics] LEFT JOIN [vaccine administered] ON [patient demographics]. [patient id]=[vaccine administered].[patient id] WHERE [county]='Philadelphia County';"), stringsAsFactors=F)
close(channel)

#save the output
save(immunizations, file="research_database.Rdata")


### Code #5.1. Reshaping data from long to wide. ###

#issue the reshape command
reshape(long_format, v.names=c("Antigen","Date"), timevar="Vaccine", idvar="ID", direction="wide")


### Code #5.2. Linkage operation of two datasets with unique identifiers using the integrated merge function. ###

#link the two vaccine variables from the second dataset
#into the first dataset
#join by ID, and keep all observations from the first
#dataset

merge(x=dataset1, y=dataset2[,c("ID","Antigen","Date")], by="ID", all.x=T)


### Code #5.3. Linkage operation of two datasets with unique identifiers using a coded algorithm. ###

#initialize two new vaccine variables in the first dataset
dataset1$Antigen.2 = NA
dataset1$Date.2 = NA

#initialize a variable to keep track of matches
dataset2$Matched = F

#loop through each observation in the first dataset
for (i in 1:nrow(dataset1))
{
  #look for a match by ID
  matched = which(dataset2$ID==dataset1$ID[i])
  
  if (length(matched) > 0)
  {
    #found a match, copy over vaccine variables
    dataset1$Antigen.2[i] = dataset2$Antigen[matched]
    dataset1$Date.2[i] = dataset2$Date[matched]
    
    #indicate a match occurred
    dataset2$Matched[matched] = T
  }
}

#the above code can also be accomplished with the
#integrated merge function by
#specifying the argument all.y=T


### Code #5.4. Fuzzy matching and linkage approach using the Levenshtein distance. ###

library(vwr) #for levenshtein.distance

#create matching variables in each dataset
#a combination of last name, gender, and date of birth
#upper case with non-alphanumeric characters removed
dataset1$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset1$Last_name,dataset1$Gender,dataset1$Date_of_birth)))
dataset2$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset2$Last_name,dataset2$Gender,dataset2$Date_of_birth)))

#initialize two new vaccine variables in the first
#dataset
dataset1$Antigen.2 = NA
dataset1$Date.2 = NA

#initialize a variable to keep track of matches
dataset2$Matched = F

#loop through each observation in the first dataset
for (i in 1:nrow(dataset1))
{
  #get a list of potential matches
  distance = levenshtein.distance(dataset1$NewID[i], dataset2$NewID);
  
  #check for a match using a distance of 3 as the
  #initial criteria
  if (min(distance)<=3)
  {
    #find the match, note there may be more than one
    #match,
    #but we'll just take the first
    matched = which(distance == min(distance))[1]
    
    #and copy the vaccine variables
    dataset1$Antigen.2[i] = dataset2$Antigen[matched]
    dataset1$Date.2[i] = dataset2$Date[matched]
    
    #indicate a match occurred
    dataset2$Matched[matched] = T
  }
}


### Code #5.5. Common data type casting operations. ###

#check the data types
str(dataset)

#cast to numeric
dataset$dose_numeric = as.numeric(dataset$dose)

#cast to date, the format argument specifies the format
#of the date
dataset$antigen_date = as.Date(dataset$date, format="%m/%d/%Y")

#cast to text
dataset$name_text = as.character(dataset$name)

#make categorical
dataset$gender_category = as.factor(dataset$gender)


### Code #5.6. Recoding examples. ###

#scenario 1
dataset$gender_mf = ifelse(dataset$gender=="Male" | dataset$gender=="M", "M", 
                           ifelse(dataset$gender=="Female" | dataset$gender=="F", "F", NA))

#scenario 2
dataset$yesno_10 = ifelse(dataset$yesno_21==2, 1,
                          ifelse(dataset$yesno_21==1, 0, NA))

#scenario 3
dataset$married_category = ifelse(dataset$married=="Married", 0, 
                                  ifelse(dataset$married=="Not Married", 1, NA))

#scenario 4
dataset$weight_kgs = ifelse(is.numeric(dataset$weight_lbs), (dataset$weight_lbs / 2.2), NA)

#scenario 5
dataset$race_category = ifelse(dataset$race_ethnicity=="non-Hispanic Caucasian" | dataset$race_ethnicity=="Hispanic Caucasian", 0, 
                               ifelse(dataset$race_ethnicity=="non-Hispanic African American" | dataset$race_ethnicity=="Hispanic African American", 1,
                                      ifelse(dataset$race_ethnicity=="non-Hispanic Asian" | dataset$race_ethnicity=="Hispanic Asian", 2, NA)))

dataset$ethnicity_category = ifelse(dataset$race_ethnicity=="non-Hispanic Caucasian" | dataset$race_ethnicity=="non-Hispanic African American" | dataset$race_ethnicity=="non-Hispanic Asian", 0,
                                    ifelse(dataset$race_ethnicity=="Hispanic Caucasian" | dataset$race_ethnicity=="Hispanic African American" | dataset$race_ethnicity=="Hispanic Asian", 1, NA))

#scenario 6
dataset$education_category = ifelse(dataset$education== "grade school or below" | dataset$education=="middle school" | dataset$education=="high school no diploma", 0, 
                                    ifelse(dataset$education=="high school diploma or equivalent" | dataset$education=="college without degree", 1, 
                                           ifelse(dataset$education=="associates degree" | dataset$education=="bachelors degree" | dataset$education=="masters degree" | dataset$education=="doctoral degree", 2, NA)))

# scenario 7
dataset$region_category = i felse(dataset$northeast==1, "Northeast", 
                                  ifelse(dataset$south==1, "South",
                                         ifelse(dataset$midwest==1, "Midwest",
                                                ifelse(dataset$west==1, "West", NA))))
#scenario 8
dataset$bmi_calculated = ifelse(is.numeric(dataset$weight_kgs) & is.numeric(dataset$height_m), (dataset$weight_kgs / (dataset$height_m^2)), NA)

#scenario 9
dataset$ibuprofen = ifelse(length(grep("ibuprofen", dataset$medications, ignore.case=T))>0, 1, 0)


### Code #5.7. Splitting one observation into two observations. ###

#create a linking variable placeholder in original
#dataset
dataset$split_record = NA

#copy the observation in position i to a temporary
#dataset to split
split_observations = dataset[i, ]

#duplicate the observation for a total of two new
#records
split_observations = rbind(split_observations, split_observations[1,])

#create a linking variable that is based on position i
#with the suffix of the
#number of new records, separated by a period
split_observations$split_record = paste(i, "2", sep=".")

#perform any other variable manipulations here, such as
#to remove any data
#related to the second offspring from split_
#observations[1, ] and likewise to
#remove any data related to the first offspring from
#split_observations[2, ]
#merge the two new observations back to the original
#dataset
dataset = rbind[dataset, split_observations]

#drop the original observation
dataset = dataset[-i, ]


### Code #5.8. Proposed fuzzy matching approach for duplicate observation detection. ###

library(vwr) #for levenshtein.distance

#create a matching variables in the dataset
#a combination of last name, gender, and date of birth
#upper case with non-alphanumeric characters removed
dataset$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset$Last_name,dataset$Gender,dataset$Date_of_birth)))

#initialize a variable to keep track of potential duplicates
dataset$Duplicate = NA

#loop through each observation in the dataset
for (i in 1:nrow(dataset))
{
  #get a list of potential matches, but will self match
  distance = levenshtein.distance(dataset$NewID[i], dataset$NewID);
  
  #set the self match above the distance threshold
  distance[i] = 99
  
  #check for a match using a distance of 3 as the
  #initial criteria
  if (min(distance)<=3)
  {
    #find the match, note there may be more than one
    #match
    matched = which(distance<=3)
    
    #record the potential duplicate(s)
    dataset$Duplicate[i] = paste(matched, collapse=",")
  }
}

#review list of potential duplicates
dataset[!is.na(dataset$Duplicate), ]


### Code #7.1. Assembling a cross-section from the research database. ###

#subset by inclusion/exclusion criteria
analytic_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1)

#keep only core variables
analytic_dataset = analytic_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")]


### Code #7.2. Assembling a cohort from the research database. ###

#subset by inclusion/exclusion criteria
working_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1)

#keep only core variables
working_dataset = working_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")]

## OPTION1: entire cohort
analytic_dataset = working_dataset

## OPTION2: randomly select unexposed 1:1 with exposed
#sample on exposure, all VLBW<1500g
analytic_dataset = subset(working_dataset, VLBW<1500)

#seed ensures consistency if need to reproduce
set.seed(777)

#randomly select unexposed, without replacement
analytic_dataset = rbind(analytic_dataset, working_dataset[sample(which(working_dataset$VLBW>=1500), nrow(analytic_dataset), replace=F), ])


### Code #7.3. Assembling a case-control sample from the research database. ###

#subset by inclusion/exclusion criteria
working_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1)

#keep only core variables
working_dataset = working_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")]

#create a placeholder for matched case and control
working_dataset$matched = NA

#sample all cases (assume case = 1 and control = 0)
analytic_dataset = s ubset(working_dataset, case==1)

#list of eligible controls by unique identifier
eligible = working_dataset$MRN[working_dataset$case==0]

#seed ensures consistency if need to reproduce
set.seed(777)

#iterate over all cases to select one control per case
for (i in 1:nrow(analytic_dataset))
{
  #sample a control
  potential = sample(eligible, 1)
  
  need_control = TRUE
  while (need_control)
  {
    #check for matching characteristics, if any
    #when match, set need_control to FALSE, otherwise
    #sample a new potential
    #control using: potential = sample(eligible, 1)
  }
  
  #add as a control for this case
  analytic_dataset = rbind(analytic_dataset, working_dataset[which(working_dataset$MRN==potential), ])
  
  #track match using the case ID
  analytic_dataset$matched[i] = analytic_dataset$ID[i]
  analytic_dataset$matched[nrow(analytic_dataset)] = analytic_dataset$ID[i]
  
  #remove from control eligible list
  eligible = eligible[-which(eligible==potential)]
}


### Code #7.4. Transforming the analytic dataset from wide to long format. ###

#issue the reshape command
#Vaccine will define the vaccine number for each
#individual
#drop removes the original identifier to be replaced with
#a new identifier
reshape(analytic_dataset, varying=c("Antigen.1", "Date.1","Antigen.2","Date.2"), direction="long", timevar="Vaccine", drop="ID")


### Code #10.1. Loading the birthwt dataset and creating the follow-up time variable, gestation. ###

#load the birthwt dataset from the MASS package
library(MASS)
analytic_dataset = birthwt

#seed ensures consistency if need to reproduce
set.seed(777)

#simulate a gestational age variable, where 
#smoking increases likelihood of preterm birth
analytic_dataset$gestation = ifelse(analytic_dataset$smoke==0, 	round(rnorm(n=nrow(analytic_dataset), mean=40, sd=1),0), 	round(rnorm(n=nrow(analytic_dataset), mean=37, sd=1),0))


### Code #10.2. A descriptive analysis of the birthwt analytic dataset. ###

library(gmodels) #CrossTable function
library(psych) #describe function

CrossTable(analytic_dataset$low)
describe(analytic_dataset$age)

#dichotomize maternal weight
analytic_dataset$lwt_170 = ifelse(analytic_dataset$lwt<170, 0, 1)
CrossTable(analytic_dataset$lwt_170)
CrossTable(analytic_dataset$race)
CrossTable(analytic_dataset$smoke)

#collapse preterm labors
analytic_dataset$ptl_collapsed = ifelse(analytic_dataset$ptl>1, 1, analytic_dataset$ptl)
CrossTable(analytic_dataset$ptl_collapsed)

CrossTable(analytic_dataset$ht)
CrossTable(analytic_dataset$ui)

#collapse doctor visits
analytic_dataset$ftv_collapsed = ifelse(analytic_dataset$ftv>3, 3, analytic_dataset$ftv)
CrossTable(analytic_dataset$ftv_collapsed)

describe(analytic_dataset$bwt)
describe(analytic_dataset$gestation)


### Code #10.3. Example regression code predicting infant birthweight from maternal weight (two parameterizations). ###

#linear regression, continuous predictor
summary(lm(bwt ~ lwt, data=analytic_dataset))

#linear regression, categorical predictor
summary(lm(bwt ~ as.factor(lwt_170), data=analytic_dataset))

#logistic regression, categorical predictor
summary(glm(low ~ as.factor(lwt_170), data=analytic_dataset, family=binomial(link=logit)))


### Code #11.1. Estimating the relationship of the potential confounders with the exposure and outcome. ###

library(gmodels) #CrossTable function
library(psych) #describe function

#exposure
describeBy(analytic_dataset$age, analytic_dataset$smoke);
t.test(analytic_dataset$age ~ analytic_dataset$smoke)
CrossTable(analytic_dataset$lwt_170, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$race, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ptl_collapsed, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ht, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ui, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ftv_collapsed, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
describeBy(analytic_dataset$gestation, analytic_dataset$smoke); t.test(analytic_dataset$gestation ~ analytic_dataset$smoke)

##outcome
describeBy(analytic_dataset$age, analytic_dataset$low); t.test(analytic_dataset$age ~ analytic_dataset$low)
CrossTable(analytic_dataset$lwt_170, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$race, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ptl_collapsed, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ht, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ui, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
CrossTable(analytic_dataset$ftv_collapsed, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T)
describeBy(analytic_dataset$gestation, analytic_dataset$low); t.test(analytic_dataset$gestation ~ analytic_dataset$low)


### Code #11.2. Estimating the change in effect of the potential confounders with the exposure and outcome. ###

#change in estimates, without confounder
summary(glm(low ~ as.factor(smoke), data=analytic_dataset, family=binomial(link=logit)))

#with confounder
summary(glm(low ~ as.factor(smoke) + age, data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(lwt_170), data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(race), data=analytic_dataset, family=binomial(logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(ptl_collapsed), data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(ht), data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(ui), data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + as.factor(ftv_collapsed), data=analytic_dataset, family=binomial(link=logit)))
summary(glm(low ~ as.factor(smoke) + gestation, data=analytic_dataset, family=binomial(link=logit)))


### Code #11.3. Creating a matched case-control study from the birthwt analytic dataset. ###

#create a unique identifier
analytic_dataset$id = 1:nrow(analytic_dataset)

#create a placeholder for matched case and control
analytic_dataset$matched = NA

#sample all cases
matched_dataset = s ubset(analytic_dataset, low==1)

#list of eligible controls by unique identifier
eligible = analytic_dataset$id[analytic_dataset$low==0]

#seed ensures consistency if need to reproduce
set.seed(776)

#iterate over all cases to select one control per case
for (i in 1:nrow(matched_dataset))
{
  #sample a control
  potential = sample(eligible, 1)
  
  need_control = TRUE
  while (need_control)
  {
    #match on age +/- 2 years
    if (abs(analytic_dataset$age[potential] - matched_ dataset$age[i]) <=2) {
      need_control = FALSE
    } else {
      potential = sample(eligible, 1)
    }
  }
  #add as a control for this case
  matched_dataset = rbind(matched_dataset, analytic_dataset[which(analytic_dataset$id==potential), ])
  
  #track match using the case id
  matched_dataset$matched[i] = matched_dataset$id[i]
  matched_dataset$matched[nrow(matched_dataset)] = matched_dataset$id[i]
  
  #remove from control eligible list
  eligible = eligible[-which(eligible==potential)]
}


### Code #11.4. Example analytic models to estimate the association between smoking and low birthweight in the birthwt analytic dataset assuming various study designs. ###

#unadjusted prevalance odds ratio via logistic regression
summary(glm(low ~ as.factor(smoke), data=analytic_dataset, 	family=binomial(link=logit)))
exp(0.7041)

#unadjusted prevalance rate ratio via log-binomial regression
summary(glm(low ~ as.factor(smoke), data=analytic_dataset, 	family=binomial(link=log)))
exp(0.4748)

#adjusted prevalance odds ratio via logistic regression
summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation, data=analytic_dataset, family=binomial(link=logit)))
exp(0.3667)

#unadjusted cumulative incidence relative risk 
library(epitools)
no_outcome_unexposed = sum(!analytic_dataset$low[analytic_dataset$smoke==0])
outcome_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0])
no_outcome_exposed =  sum(!analytic_dataset$low[analytic_dataset$smoke==1])
outcome_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1])
riskratio(c(no_outcome_unexposed,outcome_unexposed,no_outcome_exposed,outcome_exposed))

#unadjusted incidence rate ratio 
outcome_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0])
outcome_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1])
persontime_unexposed = sum(analytic_dataset$gestation[analytic_dataset$smoke==0])
persontime_exposed = sum(analytic_dataset$gestation[analytic_dataset$smoke==1])
rateratio(c(outcome_unexposed,outcome_exposed,persontime_unexposed,persontime_exposed))

#adjusted cumulative incidence relative risk estimated via logistic regression
summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset, family=binomial(link=logit)))
exp(0.8031)

#adjusted cumulative incidence relative risk estimated via modified poisson regression 
library(sandwich) #robust standard errors
poissonmod = glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset, family=poisson())
poissonmod.cov.matrix = vcovHC(poissonmod, type="HC0")
poissonmod.std.err = sqrt(diag(poissonmod.cov.matrix))
poissonmod.r.est = cbind(Estimate= coef(poissonmod), "Robust SE" = poissonmod.std.err, "Pr(>|z|)" = 2 * pnorm(abs(coef(poissonmod)/poissonmod.std.err), lower.tail=FALSE), LL = coef(poissonmod) - 1.96 * poissonmod.std.err, UL = coef(poissonmod) + 1.96 * poissonmod.std.err)
exp(poissonmod.r.est[, c(1,4,5)])

#adjusted cumulative incidence relative risk estimated via cox proportional hazards regression
library(survival)
summary(coxph(Surv(gestation, low)~as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset))

#adjusted incidence rate ratio estimated via poisson regression 
summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), offset=log(gestation), data=analytic_dataset, family=poisson()))
exp(0.5540)

#unadjusted survival plot
plot(survfit(Surv(gestation, low) ~ as.factor(smoke), data=analytic_dataset), lty=c(1,2), xlim=c(30,45), xlab="Gestation in weeks", ylab="1 - Risk of low birthweight")
legend(31,0.3,lty=c(1,2),c("Nonsmoker","Smoker"))
summary(coxph(Surv(gestation, low) ~ as.factor(smoke), data=analytic_dataset))

#unadjusted odds ratio
control_unexposed = sum(!analytic_dataset$low[analytic_dataset$smoke==0])
case_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0])
control_exposed =  sum(!analytic_dataset$low[analytic_dataset$smoke==1])
case_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1])
oddsratio(c(control_unexposed,case_unexposed,control_exposed,case_exposed))

#adjusted odds ratio estimated via logistic regression
summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation, data=analytic_dataset, family=binomial(link=logit)))
exp(0.3667)

#adjusted odds ratio estimated via conditional logistic regression
summary(clogit(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation + age + strata(matched), data=matched_dataset))