################# # Source codes for A Researcher's Guide to Using Electronic Health Records, 2nd ed # https://www.routledge.com/A-Researchers-Guide-to-Using-Electronic-Health-Records-From-Planning-to/Goldstein/p/book/9781032169590 # (c) Neal D. Goldstein, 2023, https://www.goldsteinepi.com/books/ehr/index.html ################# ### Code #3.1. Saving the birthwt dataset from the MASS package as a CSV. ### #load the MASS package library(MASS) #load the birthwt dataset into the environment birthwt=birthwt #View the dataset in RStudio View(birthwt) #save as a CSV #note that missing values are encoded as empty strings, #but can be encoded #in any pattern recognizable by the destination, such as #"." for SAS write.csv(birthwt, file="birthwt.csv", na="", row.names=F) ### Code #4.1. Using SQL to extract and subset data from a Microsoft SQL Server database. ### #connect to SQL Server database using the RODBC package library(RODBC) #setup the channel using the ODBC data source #ImmunizationDB channel = odbcConnect(dsn="ImmunizationDB", readOnlyOptimize=T) #issue the query immunizations = sqlQuery(channel, paste("SELECT [patient id], [patient name], [gender], [address], [antigen], [date administered] FROM [patient demographics] LEFT JOIN [vaccine administered] ON [patient demographics]. [patient id]=[vaccine administered].[patient id] WHERE [county]='Philadelphia County';"), stringsAsFactors=F) close(channel) #save the output save(immunizations, file="research_database.Rdata") ### Code #5.1. Reshaping data from long to wide. ### #issue the reshape command reshape(long_format, v.names=c("Antigen","Date"), timevar="Vaccine", idvar="ID", direction="wide") ### Code #5.2. Linkage operation of two datasets with unique identifiers using the integrated merge function. ### #link the two vaccine variables from the second dataset #into the first dataset #join by ID, and keep all observations from the first #dataset merge(x=dataset1, y=dataset2[,c("ID","Antigen","Date")], by="ID", all.x=T) ### Code #5.3. Linkage operation of two datasets with unique identifiers using a coded algorithm. ### #initialize two new vaccine variables in the first dataset dataset1$Antigen.2 = NA dataset1$Date.2 = NA #initialize a variable to keep track of matches dataset2$Matched = F #loop through each observation in the first dataset for (i in 1:nrow(dataset1)) { #look for a match by ID matched = which(dataset2$ID==dataset1$ID[i]) if (length(matched) > 0) { #found a match, copy over vaccine variables dataset1$Antigen.2[i] = dataset2$Antigen[matched] dataset1$Date.2[i] = dataset2$Date[matched] #indicate a match occurred dataset2$Matched[matched] = T } } #the above code can also be accomplished with the #integrated merge function by #specifying the argument all.y=T ### Code #5.4. Fuzzy matching and linkage approach using the Levenshtein distance. ### library(vwr) #for levenshtein.distance #create matching variables in each dataset #a combination of last name, gender, and date of birth #upper case with non-alphanumeric characters removed dataset1$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset1$Last_name,dataset1$Gender,dataset1$Date_of_birth))) dataset2$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset2$Last_name,dataset2$Gender,dataset2$Date_of_birth))) #initialize two new vaccine variables in the first #dataset dataset1$Antigen.2 = NA dataset1$Date.2 = NA #initialize a variable to keep track of matches dataset2$Matched = F #loop through each observation in the first dataset for (i in 1:nrow(dataset1)) { #get a list of potential matches distance = levenshtein.distance(dataset1$NewID[i], dataset2$NewID); #check for a match using a distance of 3 as the #initial criteria if (min(distance)<=3) { #find the match, note there may be more than one #match, #but we'll just take the first matched = which(distance == min(distance))[1] #and copy the vaccine variables dataset1$Antigen.2[i] = dataset2$Antigen[matched] dataset1$Date.2[i] = dataset2$Date[matched] #indicate a match occurred dataset2$Matched[matched] = T } } ### Code #5.5. Common data type casting operations. ### #check the data types str(dataset) #cast to numeric dataset$dose_numeric = as.numeric(dataset$dose) #cast to date, the format argument specifies the format #of the date dataset$antigen_date = as.Date(dataset$date, format="%m/%d/%Y") #cast to text dataset$name_text = as.character(dataset$name) #make categorical dataset$gender_category = as.factor(dataset$gender) ### Code #5.6. Recoding examples. ### #scenario 1 dataset$gender_mf = ifelse(dataset$gender=="Male" | dataset$gender=="M", "M", ifelse(dataset$gender=="Female" | dataset$gender=="F", "F", NA)) #scenario 2 dataset$yesno_10 = ifelse(dataset$yesno_21==2, 1, ifelse(dataset$yesno_21==1, 0, NA)) #scenario 3 dataset$married_category = ifelse(dataset$married=="Married", 0, ifelse(dataset$married=="Not Married", 1, NA)) #scenario 4 dataset$weight_kgs = ifelse(is.numeric(dataset$weight_lbs), (dataset$weight_lbs / 2.2), NA) #scenario 5 dataset$race_category = ifelse(dataset$race_ethnicity=="non-Hispanic Caucasian" | dataset$race_ethnicity=="Hispanic Caucasian", 0, ifelse(dataset$race_ethnicity=="non-Hispanic African American" | dataset$race_ethnicity=="Hispanic African American", 1, ifelse(dataset$race_ethnicity=="non-Hispanic Asian" | dataset$race_ethnicity=="Hispanic Asian", 2, NA))) dataset$ethnicity_category = ifelse(dataset$race_ethnicity=="non-Hispanic Caucasian" | dataset$race_ethnicity=="non-Hispanic African American" | dataset$race_ethnicity=="non-Hispanic Asian", 0, ifelse(dataset$race_ethnicity=="Hispanic Caucasian" | dataset$race_ethnicity=="Hispanic African American" | dataset$race_ethnicity=="Hispanic Asian", 1, NA)) #scenario 6 dataset$education_category = ifelse(dataset$education== "grade school or below" | dataset$education=="middle school" | dataset$education=="high school no diploma", 0, ifelse(dataset$education=="high school diploma or equivalent" | dataset$education=="college without degree", 1, ifelse(dataset$education=="associates degree" | dataset$education=="bachelors degree" | dataset$education=="masters degree" | dataset$education=="doctoral degree", 2, NA))) # scenario 7 dataset$region_category = i felse(dataset$northeast==1, "Northeast", ifelse(dataset$south==1, "South", ifelse(dataset$midwest==1, "Midwest", ifelse(dataset$west==1, "West", NA)))) #scenario 8 dataset$bmi_calculated = ifelse(is.numeric(dataset$weight_kgs) & is.numeric(dataset$height_m), (dataset$weight_kgs / (dataset$height_m^2)), NA) #scenario 9 dataset$ibuprofen = ifelse(length(grep("ibuprofen", dataset$medications, ignore.case=T))>0, 1, 0) ### Code #5.7. Splitting one observation into two observations. ### #create a linking variable placeholder in original #dataset dataset$split_record = NA #copy the observation in position i to a temporary #dataset to split split_observations = dataset[i, ] #duplicate the observation for a total of two new #records split_observations = rbind(split_observations, split_observations[1,]) #create a linking variable that is based on position i #with the suffix of the #number of new records, separated by a period split_observations$split_record = paste(i, "2", sep=".") #perform any other variable manipulations here, such as #to remove any data #related to the second offspring from split_ #observations[1, ] and likewise to #remove any data related to the first offspring from #split_observations[2, ] #merge the two new observations back to the original #dataset dataset = rbind[dataset, split_observations] #drop the original observation dataset = dataset[-i, ] ### Code #5.8. Proposed fuzzy matching approach for duplicate observation detection. ### library(vwr) #for levenshtein.distance #create a matching variables in the dataset #a combination of last name, gender, and date of birth #upper case with non-alphanumeric characters removed dataset$NewID = gsub("[^A-Z0-9]","", toupper(paste(dataset$Last_name,dataset$Gender,dataset$Date_of_birth))) #initialize a variable to keep track of potential duplicates dataset$Duplicate = NA #loop through each observation in the dataset for (i in 1:nrow(dataset)) { #get a list of potential matches, but will self match distance = levenshtein.distance(dataset$NewID[i], dataset$NewID); #set the self match above the distance threshold distance[i] = 99 #check for a match using a distance of 3 as the #initial criteria if (min(distance)<=3) { #find the match, note there may be more than one #match matched = which(distance<=3) #record the potential duplicate(s) dataset$Duplicate[i] = paste(matched, collapse=",") } } #review list of potential duplicates dataset[!is.na(dataset$Duplicate), ] ### Code #7.1. Assembling a cross-section from the research database. ### #subset by inclusion/exclusion criteria analytic_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1) #keep only core variables analytic_dataset = analytic_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")] ### Code #7.2. Assembling a cohort from the research database. ### #subset by inclusion/exclusion criteria working_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1) #keep only core variables working_dataset = working_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")] ## OPTION1: entire cohort analytic_dataset = working_dataset ## OPTION2: randomly select unexposed 1:1 with exposed #sample on exposure, all VLBW<1500g analytic_dataset = subset(working_dataset, VLBW<1500) #seed ensures consistency if need to reproduce set.seed(777) #randomly select unexposed, without replacement analytic_dataset = rbind(analytic_dataset, working_dataset[sample(which(working_dataset$VLBW>=1500), nrow(analytic_dataset), replace=F), ]) ### Code #7.3. Assembling a case-control sample from the research database. ### #subset by inclusion/exclusion criteria working_dataset = subset(research_database, year>=2001 & year<=2020 & born_at_hospital==1) #keep only core variables working_dataset = working_dataset[, c("ID", "birth_weight", "staph_infection", "delivery_method", "sex", "race", "ethnicity", "mom_marital_status", "mom_age", "+other variables")] #create a placeholder for matched case and control working_dataset$matched = NA #sample all cases (assume case = 1 and control = 0) analytic_dataset = s ubset(working_dataset, case==1) #list of eligible controls by unique identifier eligible = working_dataset$MRN[working_dataset$case==0] #seed ensures consistency if need to reproduce set.seed(777) #iterate over all cases to select one control per case for (i in 1:nrow(analytic_dataset)) { #sample a control potential = sample(eligible, 1) need_control = TRUE while (need_control) { #check for matching characteristics, if any #when match, set need_control to FALSE, otherwise #sample a new potential #control using: potential = sample(eligible, 1) } #add as a control for this case analytic_dataset = rbind(analytic_dataset, working_dataset[which(working_dataset$MRN==potential), ]) #track match using the case ID analytic_dataset$matched[i] = analytic_dataset$ID[i] analytic_dataset$matched[nrow(analytic_dataset)] = analytic_dataset$ID[i] #remove from control eligible list eligible = eligible[-which(eligible==potential)] } ### Code #7.4. Transforming the analytic dataset from wide to long format. ### #issue the reshape command #Vaccine will define the vaccine number for each #individual #drop removes the original identifier to be replaced with #a new identifier reshape(analytic_dataset, varying=c("Antigen.1", "Date.1","Antigen.2","Date.2"), direction="long", timevar="Vaccine", drop="ID") ### Code #10.1. Loading the birthwt dataset and creating the follow-up time variable, gestation. ### #load the birthwt dataset from the MASS package library(MASS) analytic_dataset = birthwt #seed ensures consistency if need to reproduce set.seed(777) #simulate a gestational age variable, where #smoking increases likelihood of preterm birth analytic_dataset$gestation = ifelse(analytic_dataset$smoke==0, round(rnorm(n=nrow(analytic_dataset), mean=40, sd=1),0), round(rnorm(n=nrow(analytic_dataset), mean=37, sd=1),0)) ### Code #10.2. A descriptive analysis of the birthwt analytic dataset. ### library(gmodels) #CrossTable function library(psych) #describe function CrossTable(analytic_dataset$low) describe(analytic_dataset$age) #dichotomize maternal weight analytic_dataset$lwt_170 = ifelse(analytic_dataset$lwt<170, 0, 1) CrossTable(analytic_dataset$lwt_170) CrossTable(analytic_dataset$race) CrossTable(analytic_dataset$smoke) #collapse preterm labors analytic_dataset$ptl_collapsed = ifelse(analytic_dataset$ptl>1, 1, analytic_dataset$ptl) CrossTable(analytic_dataset$ptl_collapsed) CrossTable(analytic_dataset$ht) CrossTable(analytic_dataset$ui) #collapse doctor visits analytic_dataset$ftv_collapsed = ifelse(analytic_dataset$ftv>3, 3, analytic_dataset$ftv) CrossTable(analytic_dataset$ftv_collapsed) describe(analytic_dataset$bwt) describe(analytic_dataset$gestation) ### Code #10.3. Example regression code predicting infant birthweight from maternal weight (two parameterizations). ### #linear regression, continuous predictor summary(lm(bwt ~ lwt, data=analytic_dataset)) #linear regression, categorical predictor summary(lm(bwt ~ as.factor(lwt_170), data=analytic_dataset)) #logistic regression, categorical predictor summary(glm(low ~ as.factor(lwt_170), data=analytic_dataset, family=binomial(link=logit))) ### Code #11.1. Estimating the relationship of the potential confounders with the exposure and outcome. ### library(gmodels) #CrossTable function library(psych) #describe function #exposure describeBy(analytic_dataset$age, analytic_dataset$smoke); t.test(analytic_dataset$age ~ analytic_dataset$smoke) CrossTable(analytic_dataset$lwt_170, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$race, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ptl_collapsed, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ht, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ui, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ftv_collapsed, analytic_dataset$smoke, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) describeBy(analytic_dataset$gestation, analytic_dataset$smoke); t.test(analytic_dataset$gestation ~ analytic_dataset$smoke) ##outcome describeBy(analytic_dataset$age, analytic_dataset$low); t.test(analytic_dataset$age ~ analytic_dataset$low) CrossTable(analytic_dataset$lwt_170, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$race, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ptl_collapsed, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ht, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ui, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) CrossTable(analytic_dataset$ftv_collapsed, analytic_dataset$low, prop.r=F, prop.t=F, prop.chisq=F, chisq=T) describeBy(analytic_dataset$gestation, analytic_dataset$low); t.test(analytic_dataset$gestation ~ analytic_dataset$low) ### Code #11.2. Estimating the change in effect of the potential confounders with the exposure and outcome. ### #change in estimates, without confounder summary(glm(low ~ as.factor(smoke), data=analytic_dataset, family=binomial(link=logit))) #with confounder summary(glm(low ~ as.factor(smoke) + age, data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + as.factor(lwt_170), data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + as.factor(race), data=analytic_dataset, family=binomial(logit))) summary(glm(low ~ as.factor(smoke) + as.factor(ptl_collapsed), data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + as.factor(ht), data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + as.factor(ui), data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + as.factor(ftv_collapsed), data=analytic_dataset, family=binomial(link=logit))) summary(glm(low ~ as.factor(smoke) + gestation, data=analytic_dataset, family=binomial(link=logit))) ### Code #11.3. Creating a matched case-control study from the birthwt analytic dataset. ### #create a unique identifier analytic_dataset$id = 1:nrow(analytic_dataset) #create a placeholder for matched case and control analytic_dataset$matched = NA #sample all cases matched_dataset = s ubset(analytic_dataset, low==1) #list of eligible controls by unique identifier eligible = analytic_dataset$id[analytic_dataset$low==0] #seed ensures consistency if need to reproduce set.seed(776) #iterate over all cases to select one control per case for (i in 1:nrow(matched_dataset)) { #sample a control potential = sample(eligible, 1) need_control = TRUE while (need_control) { #match on age +/- 2 years if (abs(analytic_dataset$age[potential] - matched_ dataset$age[i]) <=2) { need_control = FALSE } else { potential = sample(eligible, 1) } } #add as a control for this case matched_dataset = rbind(matched_dataset, analytic_dataset[which(analytic_dataset$id==potential), ]) #track match using the case id matched_dataset$matched[i] = matched_dataset$id[i] matched_dataset$matched[nrow(matched_dataset)] = matched_dataset$id[i] #remove from control eligible list eligible = eligible[-which(eligible==potential)] } ### Code #11.4. Example analytic models to estimate the association between smoking and low birthweight in the birthwt analytic dataset assuming various study designs. ### #unadjusted prevalance odds ratio via logistic regression summary(glm(low ~ as.factor(smoke), data=analytic_dataset, family=binomial(link=logit))) exp(0.7041) #unadjusted prevalance rate ratio via log-binomial regression summary(glm(low ~ as.factor(smoke), data=analytic_dataset, family=binomial(link=log))) exp(0.4748) #adjusted prevalance odds ratio via logistic regression summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation, data=analytic_dataset, family=binomial(link=logit))) exp(0.3667) #unadjusted cumulative incidence relative risk library(epitools) no_outcome_unexposed = sum(!analytic_dataset$low[analytic_dataset$smoke==0]) outcome_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0]) no_outcome_exposed = sum(!analytic_dataset$low[analytic_dataset$smoke==1]) outcome_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1]) riskratio(c(no_outcome_unexposed,outcome_unexposed,no_outcome_exposed,outcome_exposed)) #unadjusted incidence rate ratio outcome_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0]) outcome_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1]) persontime_unexposed = sum(analytic_dataset$gestation[analytic_dataset$smoke==0]) persontime_exposed = sum(analytic_dataset$gestation[analytic_dataset$smoke==1]) rateratio(c(outcome_unexposed,outcome_exposed,persontime_unexposed,persontime_exposed)) #adjusted cumulative incidence relative risk estimated via logistic regression summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset, family=binomial(link=logit))) exp(0.8031) #adjusted cumulative incidence relative risk estimated via modified poisson regression library(sandwich) #robust standard errors poissonmod = glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset, family=poisson()) poissonmod.cov.matrix = vcovHC(poissonmod, type="HC0") poissonmod.std.err = sqrt(diag(poissonmod.cov.matrix)) poissonmod.r.est = cbind(Estimate= coef(poissonmod), "Robust SE" = poissonmod.std.err, "Pr(>|z|)" = 2 * pnorm(abs(coef(poissonmod)/poissonmod.std.err), lower.tail=FALSE), LL = coef(poissonmod) - 1.96 * poissonmod.std.err, UL = coef(poissonmod) + 1.96 * poissonmod.std.err) exp(poissonmod.r.est[, c(1,4,5)]) #adjusted cumulative incidence relative risk estimated via cox proportional hazards regression library(survival) summary(coxph(Surv(gestation, low)~as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), data=analytic_dataset)) #adjusted incidence rate ratio estimated via poisson regression summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed), offset=log(gestation), data=analytic_dataset, family=poisson())) exp(0.5540) #unadjusted survival plot plot(survfit(Surv(gestation, low) ~ as.factor(smoke), data=analytic_dataset), lty=c(1,2), xlim=c(30,45), xlab="Gestation in weeks", ylab="1 - Risk of low birthweight") legend(31,0.3,lty=c(1,2),c("Nonsmoker","Smoker")) summary(coxph(Surv(gestation, low) ~ as.factor(smoke), data=analytic_dataset)) #unadjusted odds ratio control_unexposed = sum(!analytic_dataset$low[analytic_dataset$smoke==0]) case_unexposed = sum(analytic_dataset$low[analytic_dataset$smoke==0]) control_exposed = sum(!analytic_dataset$low[analytic_dataset$smoke==1]) case_exposed = sum(analytic_dataset$low[analytic_dataset$smoke==1]) oddsratio(c(control_unexposed,case_unexposed,control_exposed,case_exposed)) #adjusted odds ratio estimated via logistic regression summary(glm(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation, data=analytic_dataset, family=binomial(link=logit))) exp(0.3667) #adjusted odds ratio estimated via conditional logistic regression summary(clogit(low ~ as.factor(smoke) + as.factor(race) + as.factor(ptl_collapsed) + as.factor(ftv_collapsed) + gestation + age + strata(matched), data=matched_dataset))