#############################################################################################
# PROGRAM:      pse.R
# AUTHOR:       Michael Hudgens         
# EMAIL:        mhudgens@bios.unc.edu
# WEB:          http://www.bios.unc.edu/~mhudgens/
# DATE:         15 Nov 2007
#
# DESCRIPTION:  R code to accompany Gilbert and Hudgens Biometrics paper (2008)
#		"Evaluating Candidate Principal Surrogate Endpoints".
#		Point estimates, CIs, and p-values associated with
#		CEP, PAE, and AS are computed under A4-NP as described
#	 	in the paper. The Breslow-Day type test is also computed.
#
# DISCLAIMER:
# 
#       THIS INFORMATION IS PROVIDED PROVIDED "AS IS".  THERE ARE NO
#       WARRANTIES, EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR
#       FITNESS FOR A PARTICULAR PURPOSE REGARDING THE ACCURACY OF
#       THE MATERIALS OR CODE CONTAINED HEREIN.
#
# TO EXECUTE:
#	1. create a dataframe called "mydata" with variables 
#			mydata$s		observed surrogate in treatment arm 1, i.e., S=S(1)
#			mydata$w		baseline predictor 
#			mydata$y		binary outcome: 0 or 1
#			mydata$delta		indicator whether w sampled
#			mydata$rx		treatment arm: 0 or 1
#	
#	mydata$s and mydata$w should be discrete variables having four levels (i.e., factor variables binned into quartiles).
#	It is assumed there is no missing data, other than for s and w.  
#       Missing values should equal NA.
#       Note for treatment arm 1 subjects, delta=1 implies both w and s are measured;
#       for treatment arm 0 subjects, delta=1 implies only w is measured. For both
#       treatment arms delta = 0 implies neither w nor s are measured.
#
#	2. specify number of boots and link function below
#
#	3. source this file i.e. > source("pse.R")
#
#############################################################################################

#############################################################################################
#
# Example simulated dataset
#
#############################################################################################

# M1/M2- number of vaccine/placebo subjects
        M1 <- 250
        M2 <- 250
        M <- M1+M2

# Induce a correlation between w and s- the bigger tau the bigger the correlation
# (tau = 0 equals no correlation)
        tau <- .25

# Category probabilities for S (4 levels)
        p1 <- .25
        p2 <- .25
        p3 <- .25
        p4 <- .25

        rnds <- runif(M)
        rnds2 <- runif(M)
        rnds3 <- runif(M)
        w <- ifelse(rnds < p1,1,ifelse(rnds < (p1+p2),2,ifelse(rnds < (p1+p2+p3),3,4)))
        s <- ifelse(rnds2 < tau,w,ifelse(rnds3 < p1,1,ifelse(rnds3 < (p1+p2),2,ifelse(rnds3 < (p1+p2+p3),3,4)))) 

# p0- infection rate in placebo group with lowest category for S
# trueRR- the relative risk (vaccine/placebo) for subjects with lowest category for S
# beta1- the regr coeff for S in the risk model for category 2 vs 1
# beta2- the regr coeff for S in the risk model for category 3 vs 1
# beta3- the regr coeff for S in the risk model for category 4 vs 1

# beta1=beta2=beta3=0 corresponds to no surrogate value case
# At least one of beta1, beta2, beta3 not = 0 corresponds to some surrogate value

       p0 <- .5
       trueRR <- .5
       beta1 <- 0
       beta2 <- 0
       beta3 <- 0

       beta0 <- qnorm(p0)
       beta10 <- qnorm(trueRR*p0) - qnorm(p0)


       s1 <- ifelse(s==2,1,0)
       s2 <- ifelse(s==3,1,0)
       s3 <- ifelse(s==4,1,0)

       s1vacc <- s1[1:M1]
       s2vacc <- s2[1:M1]
       s3vacc <- s3[1:M1]

       s1plac <- s1[(M1+1):M]
       s2plac <- s2[(M1+1):M]
       s3plac <- s3[(M1+1):M]

       yvacc <- rbinom(M1,1,pnorm(beta0+beta10+beta1*s1vacc+beta2*s2vacc+beta3*s3vacc))
       yplac <- rbinom(M2,1,pnorm(beta0+beta1*s1plac+beta2*s2plac+beta3*s3plac))

       y <- c(yvacc,yplac)
       rx <- c(rep(1,M1),rep(0,M2))

# Sample all cases and 50% of controls (f = fraction of controls sampled)
       f <- .5
       delta <- rep(1,M) 
       delta[y==0] <- ifelse(runif(length(y[y==0]))<.5,1,0)

# Remove s for placebo recipients:
       s[(M1+1):M] <- NA

       mydata <- data.frame(s,w,y,delta,rx)

# specify number of bootstrap iterations
        boots <- 5

# link function (h(x,y) in paper)
   	link <- "probit"		# options "1-x/y", "log(x/y)", "x-y", "probit"
                                        # The probit link is h(x,y) = Phi^{-1}(x) - Phi^{-1}(y),
                                        # where Phi is the  stdandard normal cdf

	print("")
	print("pse.R being executed")
	print(paste("Bootstraps:",boots))
	print(paste("Link:",link));print("")

#############################################################################################
# helper functions, variables, etc.
#############################################################################################


mycep <- function(mles,cep="log(x/y)"){
        eps <- 1e-06
        mles <- mles + eps      # avoid division by zero
        if (cep=="1-x/y")	ans <- 1 - mles[(quants+1):(quants*2)]/mles[1:quants]
        if (cep=="log(x/y)")	ans <- log(mles[(quants+1):(quants*2)]/mles[1:quants])
        if (cep=="x-y")		ans <-  mles[(quants+1):(quants*2)] - mles[1:quants]
        if (cep=="probit")	ans <-  qnorm(mles[(quants+1):(quants*2)]) - qnorm(mles[1:quants])
        ans
        }

mypae <- function(mles,marg,cep="log(x/y)",wts="none"){

        # cep(x,y)=log(x/y)
 
        if (wts=="none")        weights <- rep(1,quants-1)
        if (wts=="linear")      weights <- (2:quants)
        if (wts=="extreme")     weights <- c(rep(0,quants-2),1)

        p.s1 <- sum(marg[2:quants]*weights)

        eps <- 1e-06
        mles <- mles + eps      # avoid division by zero

        if (cep=="1-x/y"){
                eae <- sum( (1 - mles[(quants+2):(quants*2)]/mles[2:quants]) * marg[2:quants] * weights)/p.s1
                ede <- 1 - mles[(quants+1)]/mles[1]
                }

        if (cep=="log(x/y)"){
                eae <- sum( log(mles[(quants+2):(quants*2)]/mles[2:quants]) * marg[2:quants] * weights)/p.s1
                ede <- log(mles[(quants+1)]/mles[1])
                }

        if (cep=="x-y"){
                eae <- sum( (mles[(quants+2):(quants*2)] - mles[2:quants]) * marg[2:quants] * weights)/p.s1
                ede <- mles[(quants+1)] - mles[1]
                }

        if (cep=="probit"){
                eae <- sum( (qnorm(mles[(quants+2):(quants*2)]) - qnorm(mles[2:quants])) * marg[2:quants] * weights)/p.s1
                ede <- qnorm(mles[(quants+1)]) - qnorm(mles[1])
                }

        pae <- abs(eae) / ( abs(ede) + abs(eae) )
        list(eae=eae,ede=ede,pae=pae)
        }

bd <- function(mles){
        mu0 <- mean(mles[1:quants])
        mu1 <- mean(mles[1:quants+quants])
        t <- 0
        for (jj in 2:quants)
                t <- t + (jj-1) * (mles[jj] - (mles[jj]+mles[jj+quants]) * (mu0/(mu0+mu1)) )
        t
        }


#############################################################################################
# wrapper for constrained optimizer
#############################################################################################
	
	
myopt <- function(){

        index <- is.na(mydata$s)
        coarse.s <- sort(unique(mydata$s[!index]))

        index <- is.na(mydata$w)
        coarse.w <- sort(unique(mydata$w[!index]))

	# calculate infections rates to be used below

	nvac <- sum(mydata$rx==1)
	npla <- sum(mydata$rx==0)

	infvac <- sum(mydata$rx==1 & mydata$y==1)
	infpla <- sum(mydata$rx==0 & mydata$y==1)

	# consistent estimates of joint distn of (s,w), marginal of s
	
	joint <- matrix(0,quants,quants)
	den <- sum(!is.na(mydata$s) & mydata$y==0)
	den1 <- sum(!is.na(mydata$s) & mydata$y==1)
	for (ss in 1:quants)
	 	for (ww in 1:quants){
			num <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==0)
			num1 <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==1)
			joint[ss,ww] <- num/den*(1-infvac/nvac) + num1/den1*infvac/nvac
			}

	marg.s <- matrix(0,1,quants)
	for (ss in 1:quants)
		marg.s[ss] <- sum(joint[ss,])/sum(joint)

	# choose initial values

	beta0 <- c( rep(infpla/npla,quants), rep(infvac/nvac,quants) )
	gamma0 <- matrix(0.1,1,quants-1)	# effect of w in nonparametric model

	# all possible observed data combinations

	delta0 <- matrix(0,2,1)
	for (zz in 0:1)
			delta0[zz+1,1] <- sum(mydata$delta==0 & mydata$rx==zz)			

	delta1z0 <- matrix(0,2,4)
	for (yy in 0:1)
		for (ww in 1:4)
			delta1z0[yy+1,ww] <- sum(mydata$delta==1 & mydata$rx==0 & mydata$y==yy & mydata$w==coarse.w[ww])			

	delta1z1 <- array(0,c(2,4,4))
	for (yy in 0:1)
		for (ss in 1:4)
			for (ww in 1:4)
				delta1z1[yy+1,ss,ww] <- sum(mydata$delta==1 & mydata$rx==1 & mydata$y==yy & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww])			

	# likelihood

	like <- function(theta){
		beta <- theta[1:8]; gamma <- c(theta[9:11],0)
		ans <- 0

		# delta=0
		# z=0,y=1
			obg0 <- outer(beta[1:4],gamma,"+")*joint
			s0 <- sum(obg0)
			ans <- ans + log((1-s0)^delta0[1,1])

		# z=1,y=1
			obg1 <- outer(beta[1:4+quants],gamma,"+")*joint
			s1 <- sum(obg1)
			ans <- ans + log((1-s1)^delta0[2,1])

		# delta=1, z=0
			m1 <- matrix(1,1,4)
			arg1 <- m1 %*% obg0
			ans <- ans + m1 %*% t(log(arg1^delta1z0[2,]))

			sj <- m1 %*% joint
			arg0 <- sj-arg1
			ans <- ans + m1 %*% t(log(arg0^delta1z0[1,]))


		# delta=1, z=1
			ans <- ans + sum(log((joint-obg1)^delta1z1[1,,]))
			ans <- ans + sum(log(obg1^delta1z1[2,,]))

		-ans		# constrOptim is minimizing
		}
			
	# constraints
	ui <- matrix(0,43,11)
	ui[1:11,1:11] <- diag(11)
	counter <- 12
	for (ii in 1:8)
		for (jj in 1:4){
			ui[counter,ii] <- -1
			if (jj<4) ui[counter,8+jj] <- -1
			counter <- counter + 1
			}
	ci <- c(rep(1e-8,11),rep(1e-8-1,32))

	itheta <- c(beta0,gamma0)

	beta1 <- constrOptim(itheta,like,NULL,ui=ui,ci=ci,control=list(maxit=100),outer.iterations=100)$par

	ans <- list(mle=beta1[1:8],marg=marg.s,gamma=beta1[9:11])

	}


#############################################################################################
# call optimizer, bootstrap, compute results, etc.
#############################################################################################

	quants <- 4
	parms <- quants*2

	# call optimizer
  	opt.out <- myopt()
	print("Optimization complete")

	opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4	# average risk estimate

	# compute estimates, test statistics, etc
        cep <- mycep(opt.out$mle,cep=link)

	pae <- matrix(0,3,1)
	pae[1] <- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="none")$pae
	pae[2] <- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="linear")$pae
	ext    <- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="extreme")
	pae[3] <- ext$pae

	as  <- abs(ext$eae)-abs(ext$ede)

	bd.mle <- bd(opt.out$mle)


	# bootstrap
	print(paste("Begin",boots,"boostrap iterations"))
	n <- dim(mydata)[1]

	bd.boot <- matrix(0,boots,1)
        cep.boot <- matrix(0,boots,quants)
	pae.boot <- matrix(0,boots,3)
	as.boot <- matrix(0,boots,1)

        cep.ci.boot <- matrix(0,quants,2)
        cep.sd.boot <- matrix(0,quants,1)
        cep.mn.boot <- matrix(0,quants,1)

        bd.sd.boot <- 0
        bd.mn.boot <- 0

        as.ci.boot <- 0
        as.sd.boot <- 0
        as.mn.boot <- 0

        pae.ci.boot <- matrix(0,3,2)
        pae.sd.boot <- matrix(0,3,1)
        pae.mn.boot <- matrix(0,3,1)

	o.data <- mydata
	for (ii in 1:boots){

		samp <- sample(c(1:n),size=n,replace=T)
		mydata <- o.data[samp,]

		opt.out <- myopt()
		opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4	# average risk estimate

                cep.boot[ii,] <- mycep(opt.out$mle,cep=link)

		pae.boot[ii,1] 	<- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="none")$pae
		pae.boot[ii,2] 	<- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="linear")$pae
		ext 		<- mypae(opt.out$mle,opt.out$marg,cep=link,	wts="extreme")
		pae.boot[ii,3] 	<- ext$pae
		as.boot[ii] 	<- abs(ext$eae)-abs(ext$ede)

		bd.boot[ii] <- bd(opt.out$mle)
		}
	mydata <- o.data
	print(paste("End",boots,"boostrap iterations"))
	print("")


        for (ii in 1:quants){
 		cep.sd.boot[ii] <- sd(cep.boot[,ii])
 		cep.mn.boot[ii] <- mean(cep.boot[,ii])
		cep.ci.boot[ii,] <- quantile(cep.boot[,ii],c(.025,.975))
		}

	bd.sd.boot <- sd(bd.boot)
	bd.mn.boot <- mean(bd.boot)

	for (ii in 1:3)	pae.sd.boot[ii] <- sd(pae.boot[,ii])
	for (ii in 1:3)	pae.mn.boot[ii] <- mean(pae.boot[,ii])
	for (ii in 1:3)	pae.ci.boot[ii,] <- quantile(pae.boot[,ii],c(.025,.975))

	as.ci.boot <- quantile(as.boot,c(.025,.975))
	as.sd.boot <- sd(as.boot)
	as.mn.boot <- mean(as.boot)

	# finish up, printout results, etc


	print("CEP estimates, 95% confidence intervals, and two-sided p-values")
	for (ii in 1:quants){
		p <- 2*pnorm(-abs(cep[ii]/cep.sd.boot[ii]))
		print(paste("CEP(",ii,",1)=",round(cep[ii],3), " (",round(cep.ci.boot[ii,1],3),", ",round(cep.ci.boot[ii,2],3),"), p=",round(p,5),sep=""))
		}

	print("");print("PAE estimates, 95% confidence intervals, and one-sided p-values")
	for (ii in 1:3){
		p <- 1-pnorm((pae[ii]-.5)/pae.sd.boot[ii])
		print(paste("PAE(w",ii,")=",round(pae[ii],3), " (",round(pae.ci.boot[ii,1],3),", ",round(pae.ci.boot[ii,2],3),"), p=",round(p,5),sep=""))
		}

	print("");print("AS estimate, 95% confidence interval, and one-sided p-value")
		p <- 1-pnorm(as/as.sd.boot)               
		print(paste("AS=",round(as,3), " (",round(as.ci.boot[1],3),", ",round(as.ci.boot[2],3),"), p=",round(p,5),sep=""))

	print("");print("BD test statistic, estimated standard error, and one-sided p-value")
	   bd.p <- 1-pnorm(bd.mle/bd.sd.boot)
		print(paste("BD=",round(bd.mle,3),", SE=",round(bd.sd.boot,3),", p=",round(bd.p,5),sep=""))