/***************************************************************
PROGRAM 12.1
Descriptive statistics from NHEFS data (Table 12.1)
***************************************************************/

clear
use "P:\CIBook\NHEFSdata\nhefs.dta"

gen byte cens = (wt82 == .)
gen byte older = (age > 50 & age < .)
label var older "50+ years"

/* provisionally ignore subjects with missing values*/
*Analysis restricted to N=1566 with no missing values*/
foreach var of varlist school wt82 {
drop if `var'==.
}

/* mean weight change in those with and without smoking cessation*/
label define qsmk 0 "No smoking cessation" 1 "Smoking cessation"
label values qsmk qsmk
label var wt82_71 "weight change in kg"
bysort qsmk: sum wt82_71 if cens == 0
regress wt82_71 qsmk if cens == 0, noheader
by qsmk, sort: egen years = mean(age) if age < . & cens == 0
label var years "Age, years"
by qsmk, sort: egen cigs = mean(smokeintensity) if smokeintensity < . & cens== 0
label var cigs "Cigarettes/day"
by qsmk, sort: egen kg = mean(wt71) if wt71 < . & cens == 0
label var kg "Weight, kg"
by qsmk, sort: egen white = mean(100 * (race==0)) if race < . & cens == 0
label var white "White, %"
by qsmk, sort: egen male = mean(100 * (sex==0)) if sex < . & cens == 0
label var male "Men, %"
by qsmk, sort: egen inactive = mean(100 * (active==2)) if active < . & cens ==0
label var inactive "Inactive daily life"
by qsmk, sort: egen noexer = mean(100 * (exercise == 2)) if exercise < . & cens == 0
label var noexer "Little/no exercise"
by qsmk, sort: egen university = mean(100 * (education == 5)) if education <. & cens == 0
label var university "University education"
by qsmk, sort: egen syrs = mean(smokeyrs) if smokeyrs < . & cens== 0
label var syrs "Years of smoking"
by qsmk, sort: egen fiftyplus = mean(100 * (older==1)) if sex < . & cens == 0
label var fiftyplus "50+ years, %"

/*Table*/
foreach var of varlist years male white university kg cigs noexer inactive syrs fiftyplus {
tabdisp qsmk, cell(`var') format(%3.1f)
}



/***************************************************************
PROGRAM 12.2
Estimating IP weights
Data from NHEFS
***************************************************************/

* estimation of ip weights via a logistic model 
logit qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 
predict p_qsmk, pr

gen w=.
replace w=1/p_qsmk if qsmk==1
replace w=1/(1-p_qsmk) if qsmk==0
summarize w

regress wt82_71 qsmk [pweight=w], cluster(seqn) 


/* no association between sex and qsmk in pseudo-population*/
tab sex qsmk [aw=w]

/* "check" for positivity*/ 
tab  age qsmk if race==0 & sex==1 & wt82!=.



/***************************************************************
PROGRAM 12.3
Estimating stabilized IP weights
Data from NHEFS
***************************************************************/

/* estimation of denominator of ip weights*/ 
logit qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 

predict pd_qsmk, pr

/* estimation of numerator of ip weights*/ 
logit qsmk 
predict pn_qsmk, pr

gen sw_a=.
replace sw_a=pn_qsmk/pd_qsmk if qsmk==1
replace sw_a=(1-pn_qsmk)/(1-pd_qsmk) if qsmk==0
summarize sw_a

regress wt82_71 qsmk [pweight=sw_a], cluster(seqn) 

/* no association between sex and qsmk in pseudo-population*/
tab sex qsmk [aw=sw_a]



/***************************************************************
PROGRAM 12.4
Estimating the parameters of a marginal structural mean model
with a continuous treatment Data from NHEFS
***************************************************************/

clear
use "P:\CIBook\NHEFSdata\nhefs.dta"


/* provisionally ignore subjects with missing values*/ 
foreach var of varlist qsmk sex race age school smokeintensity smokeyrs exercise active wt71 wt82 {
drop if `var'==.
}

/* Analysis restricted to subjects reporting <=25 cig/day at baseline*/
keep if  smokeintensity <=25


/* estimation of denominator of ip weights*/

regress smkintensity82_71 sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71

quietly predict p_den
gen dens_den = normalden(smkintensity82_71, p_den, e(rmse))
*NOTE: The regress command in STATA saves the root mean squared error for the immediate regression as e(rmse), thus there is no need to calculate it again. 

/* estimation of numerator of ip weights*/

quietly regress smkintensity82_71
quietly predict p_num

gen dens_num = normalden( smkintensity82_71, p_num, e(rmse))

/* estimation of Stabilized weights*/
gen sw_a=dens_num/dens_den
summarize sw_a
regress wt82_71  c.smkintensity82_71##c. smkintensity82_71 [pweight=sw_a], cluster(seqn)



/***************************************************************
PROGRAM 12.5
Estimating the parameters of a marginal structural logistic model
Data from NHEFS
***************************************************************/
clear
use "P:\CIBook\NHEFSdata\nhefs.dta"


* provisionally ignore subjects with missing values 
foreach var of varlist qsmk sex race age school smokeintensity smokeyrs exercise active wt71 wt82 {
drop if `var'==.
}

* First, estimation of stabilized weights sw_a (same as in PROGRAM 12.3)
logit qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 

predict pd_qsmk, pr

* estimation of numerator of ip weights 
logit qsmk 
predict pn_qsmk, pr

gen sw_a=.
replace sw_a=pn_qsmk/pd_qsmk if qsmk==1
replace sw_a=(1-pn_qsmk)/(1-pd_qsmk) if qsmk==0

* Second, fit logistic model below
logistic death qsmk [pweight=sw_a], cluster(seqn) 

/*NOTE:Stata has two commands for logistic regression, logit and logistic. 
The former displays the coefficients while the latter displays the odds ratios. 
You can also obtain the odds ratios by using the logit command with the or option/




***************************************************************
*PROGRAM 12.6
*Assessing effect modification by sex using a marginal structural mean model
*Data from NHEFS
***************************************************************/
clear
use "P:\CIBook\NHEFSdata\nhefs.dta"

* provisionally ignore subjects with missing values 
foreach var of varlist qsmk sex race age school smokeintensity smokeyrs exercise active wt71 wt82 {
drop if `var'==.
}


tab sex

/* estimation of denominator of ip weights */
logit qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 

predict pd_qsmk, pr

/* estimation of numerator of ip weights */
logit qsmk sex
predict pn_qsmk, pr

gen sw_a=.
replace sw_a=pn_qsmk/pd_qsmk if qsmk==1
replace sw_a=(1-pn_qsmk)/(1-pd_qsmk) if qsmk==0

summarize sw_a
regress wt82_71 qsmk##sex [pw=sw_a], cluster(seqn)



/***************************************************************
PROGRAM 12.7
Estimating IP weights to adjust for selection bias due to censoring
Data from NHEFS
***************************************************************/

clear
use "P:\CIBook\NHEFSdata\nhefs.dta"

*Analysis restricted to N=1629. Missing values of wt82 are retained while the others are dropped 

foreach var of varlist qsmk sex  race school active exercise age  wt71  smokeintensity smokeyrs {
drop if `var'==.
}


gen byte cens = (wt82 == .)

tab cens qsmk, column
bys cens: summarize wt71

/*Estimation of the denominator of ip weight for A*/
logit qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 

predict pd_qsmk, pr

/* estimation of numerator of ip weights for A */
logit qsmk
predict pn_qsmk, pr


/* estimation of denominator of ip weights for C */
logit cens qsmk sex race c.age##c.age ib(last).education c.smokeintensity##c.smokeintensity ///
c.smokeyrs##c.smokeyrs ib(last).exercise ib(last).active c.wt71##c.wt71 

predict pd_cens, pr

/* estimation of numerator of ip weights for C */
logit cens qsmk

predict pn_cens, pr

/*Estimation of stabilized weight for A (sw_a)*/

gen sw_a=.
replace sw_a=pn_qsmk/pd_qsmk if qsmk==1
replace sw_a=(1-pn_qsmk)/(1-pd_qsmk) if qsmk==0

/*Estimation of stabilized weight for C (sw_c)*/

/*NOTE: STATA, by default gives the probability of an event being 1. However, we are interested in Pr[C=0|A,L]
Observations with cens=1 only contribute to censoring models*/

gen sw_c=.
replace sw_c=(1-pn_cens)/(1-pd_cens) if cens==0


/*Estimation of Stabilized Censoring weight (sw)*/
gen sw=sw_a*sw_c
summarize sw
regress wt82_71 qsmk [pw=sw], cluster(seqn)