# Author contact: Justin Bohn (jmb619@mail.harvard.edu)
# Last revision : 2015-08-04

#######################################################################################
#### Program 15.1                                                                  ####
#### Estimating the average causal effect within levels of confounders             ####
#### under the assumption of effect-measure modification by smoking intensity ONLY ####
#### Data from NHEFS                                                               ####
#######################################################################################

# load external packages (install if necessary)
library(dplyr)      # easy, readable data manipulation
library(multcomp)   # estimation of linear contrasts from GLMs
library(ggplot2)    # graphics
library(gtools)     # easy decile creation

# clean up workspace
rm(list = ls())

# establish working directory (the path may be different on your system)
setwd('~/Downloads/R_part2/')

# read in NHEFS data and preprocess
nhefs <- read.csv('nhefs.csv') %>%
  mutate(
    # define censoring variable cens
    cens = ifelse(is.na(wt82_71), yes = 1, no = 0),
    # categorize the school variable
    education = cut(school, breaks = c(0, 8, 11, 12, 15, 20),
                    include.lowest = TRUE, 
                    labels = c('1. 8th Grage or Less',
                               '2. HS Dropout',
                               '3. HS',
                               '4. College Dropout',
                               '5. College or More')),
    # establish active as a factor variable
    active = factor(active),
    # establish exercise as a factor variable
    exercise = factor(exercise),
    # create a treatment label variable
    qsmklabel = ifelse(qsmk == 1,
                       yes = 'Quit Smoking 1971-1982',
                       no = 'Did Not Quit Smoking 1971-1982')
  ) %>%
  # provisionally ignore those with missing values on some variables
  filter(!is.na(education))

# check the categorizations of education to be sure
xtabs(~ school + education, data = nhefs)


# model 1: regression on covariates, allowing for some effect modification
# notes: (1) poly(x, 2) adds an orthogonal polynomial of degree 2, add the argument raw = TRUE 
#            if you want it to produce the same coefficients as would x + x^2
#        (2) x1*x2 enters the main effects of x1 and x2 and their product term
#            x1:x2 enters just the product term (necessary here for smokeintensity because
#            we want smokeintensity treated linearly in the interaction but quadratically in
#            the main effect and thus a linear term for smokeintensity is not estimable)
#        (3) observations with missing values are automatically deleted
model1 <- glm(wt82_71 ~ qsmk + sex + race + poly(age, 2, raw = TRUE) + education +
              poly(smokeintensity, 2, raw = TRUE) + poly(smokeyrs, 2, raw = TRUE) +
              + exercise + active + poly(wt71, 2, raw = TRUE) + qsmk:smokeintensity,
              data = nhefs)
summary(model1) 
confint.lm(model1)

# estimate the effect of quitting smoking at various smoking intensities
# see help(glht) for details on estimating linear contrasts

# (step 1) build the contrast matrix with all zeros
# this function builds the blank matrix 
makeContrastMatrix <- function(model, nrow, names) {
  m <- matrix(0, nrow = nrow, ncol = length(coef(model)))
  colnames(m) <- names(coef(model))
  rownames(m) <- names
  return(m)
}
K1 <- makeContrastMatrix(model1, 2, c('Effect of Quitting Smoking at Smokeintensity of 5',
                                      'Effect of Quitting Smoking at Smokeintensity of 40'))
# (step 2) fill in the relevant non-zero elements 
K1[1:2, 'qsmk'] <- 1
K1[1:2, 'qsmk:smokeintensity'] <- c(5, 40)

# (step 3) check the contrast matrix
K1 

# (step 4) estimate the contrasts, get tests and confidence intervals for them
estimates1 <- glht(model1, K1)
  summary(estimates1)
  confint(estimates1)

# model 2, regression on covariates, not allowing for effect modification
model2 <- glm(wt82_71 ~ qsmk + sex + race + poly(age, 2, raw = TRUE) + education +
                poly(smokeintensity, 2, raw = TRUE) + poly(smokeyrs, 2, raw = TRUE) +
                + exercise + active + poly(wt71, 2, raw = TRUE),
              data = nhefs)
summary(model2)
confint.lm(model2)

#######################################################################################
#### Program 15.2                                                                  ####
#### Estimating and plotting the propensity score                                  ####
#### Data from NHEFS                                                               ####
#######################################################################################

# model to estimate propensity score
model3 <- glm(qsmk ~ sex + race + poly(age, 2, raw = TRUE) + education +
              poly(smokeintensity, 2, raw = TRUE) + poly(smokeyrs, 2, raw = TRUE) +
              + exercise + active + poly(wt71, 2, raw = TRUE),
              data = nhefs, family = 'binomial')

# summary(model3) to see the results...

# predict PS values and assign to variable in dataset
nhefs$p.qsmk <- predict(model3, nhefs, 'response')

# view summary statistics for PS by qsmk (base R method)
with(nhefs, by(p.qsmk, qsmk, summary))

# view summary statistics for PS by qsmk (dplyr method)
nhefs %>%
  filter(!is.na(p.qsmk)) %>%
  group_by(qsmk) %>%
  summarize(min = min(p.qsmk),
            p25 = quantile(p.qsmk, 0.25),
            med = median(p.qsmk),
            p75 = quantile(p.qsmk, 0.75),
            max = max(p.qsmk))

# plot PS distribution by qsmk status (much more helpful)
ggplot(nhefs, aes(x = p.qsmk, fill = qsmklabel)) + geom_density(alpha = 0.2) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  scale_fill_discrete('') +
  theme(legend.position = 'bottom', legend.direction = 'vertical')

# alternative plot with histograms  
ggplot(nhefs, aes(x = p.qsmk, fill = qsmklabel, color = qsmklabel)) +
  geom_histogram(alpha = 0.3, position = 'identity') +
  facet_grid(qsmklabel ~ .) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  scale_fill_discrete('') +
  scale_color_discrete('') +
  theme(legend.position = 'bottom', legend.direction = 'vertical')

# attempt to reproduce plot from the book
nhefs %>%
  mutate(ps.grp = round(p.qsmk/0.05) * 0.05) %>%
  group_by(qsmk, qsmklabel, ps.grp) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  mutate(n2 = ifelse(qsmk == 0, yes = n, no =  -1*n)) %>%
  ggplot(aes(x = ps.grp, y = n2, fill = qsmklabel)) +
  geom_bar(stat = 'identity', position = 'identity') +
  geom_text(aes(label = n, x = ps.grp, y = n2 + ifelse(qsmk == 0, 8, -8))) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ylab('N') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  scale_fill_discrete('') +
  scale_x_continuous(breaks = seq(0, 1, 0.05)) +
  theme(legend.position = 'bottom', legend.direction = 'vertical',
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

#######################################################################################
#### Program 15.3                                                                  ####
#### Stratification on the propensity score                                        ####
#### Data from NHEFS                                                               ####
#######################################################################################

library(gtools)

# function to create deciles easily
decile <- function(x) {
  return(factor(quantcut(x, seq(0, 1, 0.1), labels = FALSE)))
}

# regression on PS deciles, not allowing for effect modification
model4 <- glm(wt82_71 ~ qsmk + decile(p.qsmk), data = nhefs)
summary(model4)
confint.lm(model4)

# regression on PS deciles, allowing for effect modification
model5 <- glm(wt82_71 ~ qsmk*decile(p.qsmk), data = nhefs)
summary(model5)

# estimate contrasts to see effect in each decile
# (step 1) set up contrast matrix
K5 <- makeContrastMatrix(model5, 10, paste('Effect of Quitting Smoking in PS Decile', 1:10))
# (step 2) fill in contrast matrix
K5[1:10, 'qsmk'] <- 1
K5[2, 'qsmk:decile(p.qsmk)2'] <- 1
K5[3, 'qsmk:decile(p.qsmk)3'] <- 1
K5[4, 'qsmk:decile(p.qsmk)4'] <- 1
K5[5, 'qsmk:decile(p.qsmk)5'] <- 1
K5[6, 'qsmk:decile(p.qsmk)6'] <- 1
K5[7, 'qsmk:decile(p.qsmk)7'] <- 1
K5[8, 'qsmk:decile(p.qsmk)8'] <- 1
K5[9, 'qsmk:decile(p.qsmk)9'] <- 1
K5[10, 'qsmk:decile(p.qsmk)10'] <- 1

# step(3) view the contrast matrix
K5

# estimate contrasts and view tests and CIs
estimates5 <- glht(model5, K5)
summary(estimates5)
confint(estimates5)

#######################################################################################
#### Program 15.4                                                                  ####
#### Standardization using the propensity score                                    ####
#### Data from NHEFS                                                               ####
#######################################################################################

# regression on the propensity score (linear term)
model6 <- glm(wt82_71 ~ qsmk + p.qsmk, data = nhefs)
summary(model6)

# standarization on the propensity score
# (step 1) create two new datasets, one with all treated and one with all untreated
treated <- nhefs
  treated$qsmk <- 1

untreated <- nhefs
  untreated$qsmk <- 0

# (step 2) predict values for everyone in each new dataset based on above model
treated$pred.y <- predict(model6, treated)
untreated$pred.y <- predict(model6, untreated)

# (step 3) compare mean weight loss had all been treated vs. that had all been untreated
mean1 <- mean(treated$pred.y, na.rm = TRUE)
mean0 <- mean(untreated$pred.y, na.rm = TRUE)
mean1
mean0
mean1 - mean0

# (step 4) bootstrap a confidence interval
# number of bootstraps
nboot <- 100
# set up a matrix to store results
boots <- data.frame(i = 1:nboot,
                    mean1 = NA,
                    mean0 = NA,
                    difference = NA)
# loop to perform the bootstrapping
nhefs <- subset(nhefs, !is.na(p.qsmk) & !is.na(wt82_71))
for(i in 1:nboot) {
  # sample with replacement
  sampl <- nhefs[sample(1:nrow(nhefs), nrow(nhefs), replace = TRUE), ]
  
  # fit the model in the bootstrap sample
  bootmod <- glm(wt82_71 ~ qsmk + p.qsmk, data = sampl)
  
  # create new datasets
  sampl.treated <- sampl %>%
    mutate(qsmk = 1)
  
  sampl.untreated <- sampl %>%
    mutate(qsmk = 0)
  
  # predict values
  sampl.treated$pred.y <- predict(bootmod, sampl.treated)
  sampl.untreated$pred.y <- predict(bootmod, sampl.untreated)
  
  # output results 
  boots[i, 'mean1'] <- mean(sampl.treated$pred.y, na.rm = TRUE)
  boots[i, 'mean0'] <- mean(sampl.untreated$pred.y, na.rm = TRUE)
  boots[i, 'difference'] <- boots[i, 'mean1'] - boots[i, 'mean0']
  
  # once loop is done, print the results
  if(i == nboot) {
    cat('95% CI for the causal mean difference\n')
    cat(mean(boots$difference) - 1.96*sd(boots$difference), 
        ',',
        mean(boots$difference) + 1.96*sd(boots$difference))
  }
}

# a more flexible and elegant way to do this is to write a function 
# to perform the model fitting, prediction, bootstrapping, and reporting all at once
# view the code contained in the file mstandardize.R to learn more

# load the code for the mstandardize() function 
# (you may need to change the filepath)
source('chapter15_mstandardize.R') 

# performt the standardization
mstandardize(formula = wt82_71 ~ qsmk + decile(p.qsmk), 
             family = 'gaussian',
             trt = 'qsmk', 
             nboot = 100, 
             data = nhefs)