################################################################
# PROGRAM 13.1
# Estimating the mean outcome within levels of treatment 
# and confounders: Data from NHEFS
################################################################

#install.packages("readxl") # install package if required
library("readxl")
nhefs <- read_excel("V:/otago/NHEFS.xls")


# some preprocessing of the data
nhefs$cens <- ifelse(is.na(nhefs$wt82), 1, 0)

fit <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + as.factor(education)
           + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs
           + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active)
           + wt71 + I(wt71*wt71) + qsmk*smokeintensity, data=nhefs)
summary(fit)
nhefs$predicted.meanY <- predict(fit, nhefs)  

nhefs[which(nhefs$seqn==24770), c("predicted.meanY", "qsmk", "sex", 
                                 "race", "age", "education", 
                                 "smokeintensity", "smokeyrs", "exercise",
                                 "active", "wt71")]

summary(nhefs$predicted.meanY[nhefs$cens==0])
summary(nhefs$wt82_71[nhefs$cens==0])


################################################################
# PROGRAM 13.2
# Standardizing the mean outcome to the baseline confounders
# Data from Table 2.2
################################################################

id <- c("Rheia", "Kronos", "Demeter", "Hades", "Hestia", "Poseidon", 
  "Hera", "Zeus", "Artemis", "Apollo", "Leto", "Ares", "Athena", 
  "Hephaestus", "Aphrodite", "Cyclope", "Persephone", "Hermes", 
  "Hebe", "Dionysus")
N <- length(id)
L <- c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
A <- c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
Y <- c(0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0)
interv <- rep(-1, N)
observed <- cbind(L, A, Y, interv)
untreated <- cbind(L, rep(0, N), rep(NA, N), rep(0, N))
treated <- cbind(L, rep(1, N), rep(NA, N), rep(1, N))
table22 <- as.data.frame(rbind(observed, untreated, treated))
table22$id <- rep(id, 3)

glm.obj <- glm(Y~ A*L, data = table22)
summary(glm.obj)
table22$predicted.meanY <- predict(glm.obj, table22)

mean(table22$predicted.meanY[table22$interv==-1])
mean(table22$predicted.meanY[table22$interv==0])
mean(table22$predicted.meanY[table22$interv==1])


################################################################
# PROGRAM 13.3
# Standardizing the mean outcome to the baseline confounders:
# Data from NHEFS
################################################################

# create a dataset with 3 copies of each subject
nhefs$interv <- -1 # 1st copy: equal to original one

interv0 <- nhefs # 2nd copy: treatment set to 0, outcome to missing
interv0$interv <- 0
interv0$qsmk <- 0
interv0$wt82_71 <- NA

interv1 <- nhefs # 3rd copy: treatment set to 1, outcome to missing
interv1$interv <- 1
interv1$qsmk <- 1
interv1$wt82_71 <- NA

onesample <- rbind(nhefs, interv0, interv1) # combining datasets

# linear model to estimate mean outcome conditional on treatment and confounders
# parameters are estimated using original observations only (nhefs)
# parameter estimates are used to predict mean outcome for observations with 
# treatment set to 0 (interv=0) and to 1 (interv=1)

std <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age)
           + as.factor(education) + smokeintensity 
           + I(smokeintensity*smokeintensity) + smokeyrs 
           + I(smokeyrs*smokeyrs) + as.factor(exercise) 
           + as.factor(active) + wt71 + I(wt71*wt71) + I(qsmk*smokeintensity), 
           data=onesample)
summary(std)   
onesample$predicted_meanY <- predict(std, onesample)

# estimate mean outcome in each of the groups interv=0, and interv=1
# this mean outcome is a weighted average of the mean outcomes in each combination 
# of values of treatment and confounders, that is, the standardized outcome
mean(onesample[which(onesample$interv==-1),]$predicted_meanY)
mean(onesample[which(onesample$interv==0),]$predicted_meanY)
mean(onesample[which(onesample$interv==1),]$predicted_meanY)


################################################################
# PROGRAM 13.4
# Computing the 95% confidence interval of the standardized means 
# and their difference: Data from NHEFS
################################################################

#install.packages("boot") # install package if required
library(boot)

# function to calculate difference in means
standardization <- function(data, indices) {
  # create a dataset with 3 copies of each subject
  d <- data[indices,] # 1st copy: equal to original one`
  d$interv <- -1
  d0 <- d # 2nd copy: treatment set to 0, outcome to missing
  d0$interv <- 0
  d0$qsmk <- 0
  d0$wt82_71 <- NA
  d1 <- d # 3rd copy: treatment set to 1, outcome to missing
  d1$interv <- 1
  d1$qsmk <- 1
  d1$wt82_71 <- NA
  d.onesample <- rbind(d, d0, d1) # combining datasets
  
  # linear model to estimate mean outcome conditional on treatment and confounders
  # parameters are estimated using original observations only (interv= -1)
  # parameter estimates are used to predict mean outcome for observations with set 
  # treatment (interv=0 and interv=1)
  fit <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + 
               as.factor(education) + smokeintensity + 
               I(smokeintensity*smokeintensity) + smokeyrs + I(smokeyrs*smokeyrs) +
               as.factor(exercise) + as.factor(active) + wt71 + I(wt71*wt71), 
             data=d.onesample)
  
  d.onesample$predicted_meanY <- predict(fit, d.onesample)
  
  # estimate mean outcome in each of the groups interv=-1, interv=0, and interv=1
  return(c(mean(d.onesample$predicted_meanY[d.onesample$interv==-1]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==0]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==1]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==1])-
             mean(d.onesample$predicted_meanY[d.onesample$interv==0])))
}

# bootstrap
results <- boot(data=nhefs, statistic=standardization, R=5)

# generating confidence intervals
se <- c(sd(results$t[,1]), sd(results$t[,2]), 
        sd(results$t[,3]), sd(results$t[,4]))
mean <- results$t0
ll <- mean - qnorm(0.975)*se
ul <- mean + qnorm(0.975)*se

bootstrap <- data.frame(cbind(c("Observed", "No Treatment", "Treatment", 
                                "Treatment - No Treatment"), mean, se, ll, ul))
bootstrap