# Feel free to report errors/suggestions to ehsan@stat.ubc.ca

################################################################
# PROGRAM 13.1
# Estimating the mean outcome within levels of treatment 
# and confounders: Data from NHEFS
################################################################

# rm(list=ls(all=TRUE))
# setwd("C:\\Users\\Ehsan\\Dropbox\\causal") # contains SAS/CSV datafile
# reading external data file
# install.packages("sas7bdat")
library(sas7bdat)
nhefs <- read.sas7bdat("nhefs_book.sas7bdat")
# or if CSV file is available:
# nhefs <- read.csv("nhefs.csv") 
nhefs <- as.data.frame(nhefs)
dim(nhefs)

# some preprocessing of the data

nhefs$cens <- as.numeric(is.na(nhefs$wt82))
nhefs$older <- as.numeric(nhefs$age > 50 & !is.na(nhefs$age))
# install.packages("car")
library(car)
nhefs$education.code <- recode(nhefs$school, " 0:8 = 1; 9:11 = 2; 12 = 3; 
                         13:15 = 4; 16:hi = 5; NA = 6 ")
nhefs$education <- recode(nhefs$education.code, " 1 = '1. 8th grade or less';
                    2 = '2. HS dropout';
                    3 = '3. HS';
                    4 = '4. College dropout';
                    5 = '5. College or more';
                    6 = 'Unknown' ")

# Analysis restricted to N=1566 
# with non-missing values in the following covariates
nhefs.original <- nhefs # Original data saved for later use
nhefs$id <- 1:nrow(nhefs)
nhefs2 <- nhefs[c("id", "qsmk", "sex", "race", "age", "school", 
                  "smokeintensity", "smokeyrs", "exercise", "active", 
                  "wt71", "wt82")]
dim(nhefs2)

# restricting data for non-missing
nhefs2 <- as.data.frame(na.omit(nhefs2))
dim(nhefs2)
nhefs.id.matched <- subset(nhefs, id %in% nhefs2$id)
dim(nhefs)

# restricting data for uncensored, for comparison with observed outcome
nhefs0 <- subset(nhefs.id.matched, cens == 0)
dim(nhefs0)

table(nhefs0$qsmk) # untreated vs treated

# Estimates
glm.obj <- glm(wt82_71~ as.factor(qsmk) + as.factor(sex) + as.factor(race) + 
  age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
  smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + 
  I(wt71^2) + I(qsmk*smokeintensity), data = nhefs0)
summary(glm.obj)

nhefs0$meanY <- predict(glm.obj, type = "response")

# print variable values corresponding to subject with unique identifier 24770
nhefs0[nhefs0$seqn ==24770,c("meanY", "qsmk", "sex", "race", "age", "education",
                             "smokeintensity", "smokeyrs", "exercise", "active", 
                             "wt71")]

summary(nhefs0$meanY)
summary(nhefs0$wt82_71)

################################################################
# PROGRAM 13.2
# Standardizing the mean outcome to the baseline confounders
# Data from Table 2.2
################################################################

id <- c("Rheia", "Kronos", "Demeter", "Hades", "Hestia", "Poseidon", 
  "Hera", "Zeus", "Artemis", "Apollo", "Leto", "Ares", "Athena", 
  "Hephaestus", "Aphrodite", "Cyclope", "Persephone", "Hermes", 
  "Hebe", "Dionysus")
N <- length(id)
L <- c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
A <- c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
Y <- c(0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0)
interv <- rep(-1, N)
observed <- cbind(L, A, Y, interv)
untreated <- cbind(L, rep(0, N), rep(NA, N), rep(0, N))
treated <- cbind(L, rep(1, N), rep(NA, N), rep(1, N))
data <- as.data.frame(rbind(observed, untreated, treated))
data$id <- rep(id, 3)

# Estimates
glm.obj <- glm(Y~ A*L, data = data)
summary(glm.obj)

# data$meanY <- c(predict(glm.obj, data[data$interv == -1,], type = "response"),
##                predict(glm.obj, data[data$interv == 0,], type = "response"), 
#                predict(glm.obj, data[data$interv == 1,], type = "response"))

data$meanY <- c(predict(glm.obj, data, type = "response"))

#index <- sort(unique(data$interv))
#ia <- mean(data$meanY[data$interv == index[1]])
#ib <- mean(data$meanY[data$interv == index[2]])
#ic <- mean(data$meanY[data$interv == index[3]])
#predicted.values <- c(ia, ib, ic)
#cbind(index, predicted.values)

with(data, tapply(meanY, interv, mean))

################################################################
# PROGRAM 13.3
# Standardizing the mean outcome to the baseline confounders:
# Data from NHEFS
################################################################

# 1st copy: equal to original one
names(nhefs)
nhefs0$interv <- rep(-1, nrow(nhefs0))
# nhefs0$qsmk
# nhefs0$wt82_71
dim(nhefs0)
# 2nd copy: treatment set to 0, outcome to missing
nhefs.untr <- nhefs0
nhefs.untr$interv <- rep(0, nrow(nhefs.untr))
nhefs.untr$qsmk <- rep(0, nrow(nhefs.untr))
nhefs.untr$wt82_71 <- rep(NA, nrow(nhefs.untr))
dim(nhefs.untr)
# 3rd copy: treatment set to 1, outcome to missing
nhefs.tr <- nhefs0
nhefs.tr$interv <- rep(1, nrow(nhefs.tr))
nhefs.tr$qsmk <- rep(1, nrow(nhefs.tr))
nhefs.tr$wt82_71 <- rep(NA, nrow(nhefs.tr))
dim(nhefs.tr)
# create a dataset with 3 copies of each subject
onesample <- as.data.frame(rbind(nhefs0, nhefs.untr, nhefs.tr))
#onesample$qsmk
dim(onesample)

# Estimates
# linear model to estimate mean outcome conditional on treatment & confounders,
# parameters are estimated using original observations only (interv= -1),
# parameter estimates are used to predict mean outcome for observations 
# with treatment set to 0 (interv=0) and to 1 (innterv=1);

glm.obj <- glm(wt82_71~ as.factor(qsmk) + as.factor(sex) + as.factor(race) + 
  age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
  smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + 
  I(wt71^2), data = onesample)
summary(glm.obj)

#onesample$meanY <- c(predict(glm.obj, nhefs0, type = "response"),
#                     predict(glm.obj, nhefs.untr, type = "response"), 
#                     predict(glm.obj, nhefs.tr, type = "response"))

onesample$meanY <- predict(glm.obj, onesample, type = "response")

# estimate mean outcome in each of the groups interv=0, and interv=1;
# this mean outcome is a weighted average of the mean outcomes 
# in each combination of values of treatment and confounders, 
# that is, the standardized outcome;

#index <- sort(unique(onesample$interv))
#ia <- mean(onesample$meanY[onesample$interv == index[1]])
#ib <- mean(onesample$meanY[onesample$interv == index[2]])
#ic <- mean(onesample$meanY[onesample$interv == index[3]])
#predicted.values <- c(ia, ib, ic)
#cbind(index, predicted.values)

with(onesample, tapply(meanY, list(interv), mean))

################################################################
# PROGRAM 13.4
# Computing the 95% confidence interval of the standardized means 
# and their difference: Data from NHEFS
################################################################

# install.packages("boot")
require(boot)
# Compute basic bootstrap confidence interval.
# Below is the function to estimate mean outcome in each of the groups 
# interv=-1, interv=0 & interv=1 conditional on treatment and confounders,
# also include the mean outcome of interv = 1 vs. interv = 0
boot.fun <- function(dat, index, type){
  sampled.data <- dat[index,]
  fit <- glm(formula = wt82_71~ as.factor(qsmk) + as.factor(sex) + as.factor(race) + 
    age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
    smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + 
    I(wt71^2), data = sampled.data)
  sampled.data$meanY <- predict(fit, sampled.data, type = "response")
  std.mean <- with(sampled.data, tapply(meanY, list(interv), mean))
  mean.list <- as.numeric(unlist(c(std.mean, diff(std.mean[c(2,3)]))))
  return(mean.list)
    }

# Parametric g-formula: Bootstrap results using nboot samples
nboot <- 100
# number of bootstrap samples to run
# creation of bootstarp samples using fixed seed to reproduce results
set.seed(1232)
bootres <- boot(onesample, boot.fun, R = nboot)
bootres
nrow(bootres$t)
boot.stat <- matrix(NA, 4, 4)
for (i in 1:4){
boot.stat[i,] <- c(bootres$t0[i], sd(bootres$t[,i]), 
                  bootres$t0[i] - 1.96 * sd(bootres$t[,i]), 
                  bootres$t0[i] + 1.96 * sd(bootres$t[,i]))
}
dimnames(boot.stat) <- list(c("observed", "No treatment", 
                              "Treatment", "Difference"), 
                            c("mean", "sd", "LCL", "UCL"))
boot.stat