# Adapted from programs found here https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/

################################################################
# PROGRAM 13.1
# Estimating the mean outcome within levels of treatment 
# and confounders: Data from NHEFS
################################################################

# rm(list=ls(all=TRUE))
nhefs <- read.csv("V:\\sismid2020\\nhefs.csv") 
nhefs <- as.data.frame(nhefs)

# restricting data for uncensored, for comparison with observed outcome
nhefs$cens <- as.numeric(is.na(nhefs$wt82))
nhefs0 <- subset(nhefs, cens == 0)
dim(nhefs0)

table(nhefs0$qsmk) # untreated vs treated

summary(nhefs0$wt82_71)


################################################################
# PROGRAM 13.3
# Standardizing the mean outcome to the baseline confounders:
# Data from NHEFS
################################################################

# 1st copy: equal to original one
nhefs0$interv <- rep(-1, nrow(nhefs0))

# 2nd copy: treatment set to 0, outcome to missing
nhefs.untr <- nhefs0
nhefs.untr$interv <- rep(0, nrow(nhefs.untr))
nhefs.untr$qsmk <- rep(0, nrow(nhefs.untr))
nhefs.untr$wt82_71 <- rep(NA, nrow(nhefs.untr))

# 3rd copy: treatment set to 1, outcome to missing
nhefs.tr <- nhefs0
nhefs.tr$interv <- rep(1, nrow(nhefs.tr))
nhefs.tr$qsmk <- rep(1, nrow(nhefs.tr))
nhefs.tr$wt82_71 <- rep(NA, nrow(nhefs.tr))

# create a dataset with 3 copies of each subject
onesample <- as.data.frame(rbind(nhefs0, nhefs.untr, nhefs.tr))
#onesample$qsmk
dim(onesample)

# Estimates
# linear model to estimate mean outcome conditional on treatment & confounders,
# parameters are estimated using original observations only (interv= -1),
# parameter estimates are used to predict mean outcome for observations 
# with treatment set to 0 (interv=0) and to 1 (innterv=1);

glm.obj <- glm(wt82_71~ as.factor(qsmk) + as.factor(sex) + as.factor(race) + 
  age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
  smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + 
  I(wt71^2), data = onesample)
summary(glm.obj)

#onesample$meanY <- c(predict(glm.obj, nhefs0, type = "response"),
#                     predict(glm.obj, nhefs.untr, type = "response"), 
#                     predict(glm.obj, nhefs.tr, type = "response"))

onesample$meanY <- predict(glm.obj, onesample, type = "response")

# estimate mean outcome in each of the groups interv=0, and interv=1;
# this mean outcome is a weighted average of the mean outcomes 
# in each combination of values of treatment and confounders, 
# that is, the standardized outcome;

#index <- sort(unique(onesample$interv))
#ia <- mean(onesample$meanY[onesample$interv == index[1]])
#ib <- mean(onesample$meanY[onesample$interv == index[2]])
#ic <- mean(onesample$meanY[onesample$interv == index[3]])
#predicted.values <- c(ia, ib, ic)
#cbind(index, predicted.values)

with(onesample, tapply(meanY, list(interv), mean))