################################################################################# # Program 15.1 # Estimating the average causal effect within levels of confounders # under the assumption of effect-measure modification by smoking intensity ONLY # Data from NHEFS ################################################################################# #install.packages("readxl") # install package if required library("readxl") nhefs <- read_excel("V:/otago/NHEFS.xls") nhefs$cens <- ifelse(is.na(nhefs$wt82), 1, 0) # regression on covariates, allowing for some effect modification fit <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + as.factor(education) + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71*wt71) + I(qsmk*smokeintensity), data=nhefs) summary(fit) # (step 1) build the contrast matrix with all zeros # this function builds the blank matrix # install.packages("multcomp") # install packages if necessary library("multcomp") makeContrastMatrix <- function(model, nrow, names) { m <- matrix(0, nrow = nrow, ncol = length(coef(model))) colnames(m) <- names(coef(model)) rownames(m) <- names return(m) } K1 <- makeContrastMatrix(fit, 2, c('Effect of Quitting Smoking at Smokeintensity of 5', 'Effect of Quitting Smoking at Smokeintensity of 40')) # (step 2) fill in the relevant non-zero elements K1[1:2, 'qsmk'] <- 1 K1[1:2, 'I(qsmk * smokeintensity)'] <- c(5, 40) # (step 3) check the contrast matrix K1 # (step 4) estimate the contrasts, get tests and confidence intervals for them estimates1 <- glht(fit, K1) summary(estimates1) confint(estimates1) # regression on covariates, not allowing for effect modification fit2 <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + as.factor(education) + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71*wt71), data=nhefs) summary(fit2) ################################################## # Program 15.2 # Estimating and plotting the propensity score # Data from NHEFS ################################################## fit3 <- glm(qsmk ~ sex + race + age + I(age*age) + as.factor(education) + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71*wt71), data=nhefs, family=binomial()) summary(fit3) nhefs$ps <- predict(fit3, nhefs, type="response") summary(nhefs$ps[nhefs$qsmk==0]) summary(nhefs$ps[nhefs$qsmk==1]) # # plotting the estimated propensity score # install.packages("ggplot2") # install packages if necessary # install.packages("dplyr") library("ggplot2") library("dplyr") ggplot(nhefs, aes(x = ps, fill = qsmk)) + geom_density(alpha = 0.2) + xlab('Probability of Quitting Smoking During Follow-up') + ggtitle('Propensity Score Distribution by Treatment Group') + scale_fill_discrete('') + theme(legend.position = 'bottom', legend.direction = 'vertical') # alternative plot with histograms nhefs <- nhefs %>% mutate(qsmklabel = ifelse(qsmk == 1, yes = 'Quit Smoking 1971-1982', no = 'Did Not Quit Smoking 1971-1982')) ggplot(nhefs, aes(x = ps, fill = as.factor(qsmk), color = as.factor(qsmk))) + geom_histogram(alpha = 0.3, position = 'identity', bins=15) + facet_grid(as.factor(qsmk) ~ .) + xlab('Probability of Quitting Smoking During Follow-up') + ggtitle('Propensity Score Distribution by Treatment Group') + scale_fill_discrete('') + scale_color_discrete('') + theme(legend.position = 'bottom', legend.direction = 'vertical') # attempt to reproduce plot from the book nhefs %>% mutate(ps.grp = round(ps/0.05) * 0.05) %>% group_by(qsmk, ps.grp) %>% summarize(n = n()) %>% ungroup() %>% mutate(n2 = ifelse(qsmk == 0, yes = n, no = -1*n)) %>% ggplot(aes(x = ps.grp, y = n2, fill = as.factor(qsmk))) + geom_bar(stat = 'identity', position = 'identity') + geom_text(aes(label = n, x = ps.grp, y = n2 + ifelse(qsmk == 0, 8, -8))) + xlab('Probability of Quitting Smoking During Follow-up') + ylab('N') + ggtitle('Propensity Score Distribution by Treatment Group') + scale_fill_discrete('') + scale_x_continuous(breaks = seq(0, 1, 0.05)) + theme(legend.position = 'bottom', legend.direction = 'vertical', axis.ticks.y = element_blank(), axis.text.y = element_blank()) ############################################ # Program 15.3 # Stratification on the propensity score # Data from NHEFS ############################################ # calculation of deciles nhefs$ps.dec <- cut(nhefs$ps, breaks=c(quantile(nhefs$ps, probs=seq(0,1,0.1))), labels=seq(1:10), include.lowest=TRUE) #install.packages("psych") # install package if required library("psych") describeBy(nhefs$ps, list(nhefs$ps.dec, nhefs$qsmk)) # function to create deciles easily decile <- function(x) { return(factor(quantcut(x, seq(0, 1, 0.1), labels = FALSE))) } # regression on PS deciles, allowing for effect modification for (deciles in c(1:10)) { print(t.test(wt82_71~qsmk, data=nhefs[which(nhefs$ps.dec==deciles),])) } # regression on PS deciles, not allowing for effect modification fit.psdec <- glm(wt82_71 ~ qsmk + as.factor(ps.dec), data = nhefs) summary(fit.psdec) confint.lm(fit.psdec) ####################################################################################### # Program 15.4 #### # Standardization using the propensity score #### # Data from NHEFS #### ####################################################################################### #install.packages("boot") # install package if required library("boot") # standardization by propensity score, agnostic regarding effect modification std.ps <- function(data, indices) { d <- data[indices,] # 1st copy: equal to original one` # calculating propensity scores ps.fit <- glm(qsmk ~ sex + race + age + I(age*age) + as.factor(education) + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71*wt71), data=d, family=binomial()) d$pscore <- predict(ps.fit, d, type="response") # create a dataset with 3 copies of each subject d$interv <- -1 # 1st copy: equal to original one` d0 <- d # 2nd copy: treatment set to 0, outcome to missing d0$interv <- 0 d0$qsmk <- 0 d0$wt82_71 <- NA d1 <- d # 3rd copy: treatment set to 1, outcome to missing d1$interv <- 1 d1$qsmk <- 1 d1$wt82_71 <- NA d.onesample <- rbind(d, d0, d1) # combining datasets std.fit <- glm(wt82_71 ~ qsmk + pscore + I(qsmk*pscore), data=d.onesample) d.onesample$predicted_meanY <- predict(std.fit, d.onesample) # estimate mean outcome in each of the groups interv=-1, interv=0, and interv=1 return(c(mean(d.onesample$predicted_meanY[d.onesample$interv==-1]), mean(d.onesample$predicted_meanY[d.onesample$interv==0]), mean(d.onesample$predicted_meanY[d.onesample$interv==1]), mean(d.onesample$predicted_meanY[d.onesample$interv==1])- mean(d.onesample$predicted_meanY[d.onesample$interv==0]))) } # bootstrap results <- boot(data=nhefs, statistic=std.ps, R=5) # generating confidence intervals se <- c(sd(results$t[,1]), sd(results$t[,2]), sd(results$t[,3]), sd(results$t[,4])) mean <- results$t0 ll <- mean - qnorm(0.975)*se ul <- mean + qnorm(0.975)*se bootstrap <- data.frame(cbind(c("Observed", "No Treatment", "Treatment", "Treatment - No Treatment"), mean, se, ll, ul)) bootstrap ##################### # regression on the propensity score (linear term) model6 <- glm(wt82_71 ~ qsmk + p.qsmk, data = nhefs) summary(model6) # standarization on the propensity score # (step 1) create two new datasets, one with all treated and one with all untreated treated <- nhefs treated$qsmk <- 1 untreated <- nhefs untreated$qsmk <- 0 # (step 2) predict values for everyone in each new dataset based on above model treated$pred.y <- predict(model6, treated) untreated$pred.y <- predict(model6, untreated) # (step 3) compare mean weight loss had all been treated vs. that had all been untreated mean1 <- mean(treated$pred.y, na.rm = TRUE) mean0 <- mean(untreated$pred.y, na.rm = TRUE) mean1 mean0 mean1 - mean0 # (step 4) bootstrap a confidence interval # number of bootstraps nboot <- 100 # set up a matrix to store results boots <- data.frame(i = 1:nboot, mean1 = NA, mean0 = NA, difference = NA) # loop to perform the bootstrapping nhefs <- subset(nhefs, !is.na(p.qsmk) & !is.na(wt82_71)) for(i in 1:nboot) { # sample with replacement sampl <- nhefs[sample(1:nrow(nhefs), nrow(nhefs), replace = TRUE), ] # fit the model in the bootstrap sample bootmod <- glm(wt82_71 ~ qsmk + p.qsmk, data = sampl) # create new datasets sampl.treated <- sampl %>% mutate(qsmk = 1) sampl.untreated <- sampl %>% mutate(qsmk = 0) # predict values sampl.treated$pred.y <- predict(bootmod, sampl.treated) sampl.untreated$pred.y <- predict(bootmod, sampl.untreated) # output results boots[i, 'mean1'] <- mean(sampl.treated$pred.y, na.rm = TRUE) boots[i, 'mean0'] <- mean(sampl.untreated$pred.y, na.rm = TRUE) boots[i, 'difference'] <- boots[i, 'mean1'] - boots[i, 'mean0'] # once loop is done, print the results if(i == nboot) { cat('95% CI for the causal mean difference\n') cat(mean(boots$difference) - 1.96*sd(boots$difference), ',', mean(boots$difference) + 1.96*sd(boots$difference)) } } # a more flexible and elegant way to do this is to write a function # to perform the model fitting, prediction, bootstrapping, and reporting all at once # view the code contained in the file mstandardize.R to learn more # load the code for the mstandardize() function # (you may need to change the filepath) source('chapter15_mstandardize.R') # performt the standardization mstandardize(formula = wt82_71 ~ qsmk + decile(p.qsmk), family = 'gaussian', trt = 'qsmk', nboot = 100, data = nhefs)