############################################################################################# # PROGRAM: pse.R # AUTHOR: Michael Hudgens # EMAIL: mhudgens@bios.unc.edu # WEB: http://www.bios.unc.edu/~mhudgens/ # DATE: 15 Nov 2007 # # DESCRIPTION: R code to accompany Gilbert and Hudgens Biometrics paper (2008) # "Evaluating Candidate Principal Surrogate Endpoints". # Point estimates, CIs, and p-values associated with # CEP, PAE, and AS are computed under A4-NP as described # in the paper. The Breslow-Day type test is also computed. # # DISCLAIMER: # # THIS INFORMATION IS PROVIDED PROVIDED "AS IS". THERE ARE NO # WARRANTIES, EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR # FITNESS FOR A PARTICULAR PURPOSE REGARDING THE ACCURACY OF # THE MATERIALS OR CODE CONTAINED HEREIN. # # TO EXECUTE: # 1. create a dataframe called "mydata" with variables # mydata$s observed surrogate in treatment arm 1, i.e., S=S(1) # mydata$w baseline predictor # mydata$y binary outcome: 0 or 1 # mydata$delta indicator whether w sampled # mydata$rx treatment arm: 0 or 1 # # mydata$s and mydata$w should be discrete variables having four levels (i.e., factor variables binned into quartiles). # It is assumed there is no missing data, other than for s and w. # Missing values should equal NA. # Note for treatment arm 1 subjects, delta=1 implies both w and s are measured; # for treatment arm 0 subjects, delta=1 implies only w is measured. For both # treatment arms delta = 0 implies neither w nor s are measured. # # 2. specify number of boots and link function below # # 3. source this file i.e. > source("pse.R") # ############################################################################################# ############################################################################################# # # Example simulated dataset # ############################################################################################# # M1/M2- number of vaccine/placebo subjects M1 <- 250 M2 <- 250 M <- M1+M2 # Induce a correlation between w and s- the bigger tau the bigger the correlation # (tau = 0 equals no correlation) tau <- .25 # Category probabilities for S (4 levels) p1 <- .25 p2 <- .25 p3 <- .25 p4 <- .25 rnds <- runif(M) rnds2 <- runif(M) rnds3 <- runif(M) w <- ifelse(rnds < p1,1,ifelse(rnds < (p1+p2),2,ifelse(rnds < (p1+p2+p3),3,4))) s <- ifelse(rnds2 < tau,w,ifelse(rnds3 < p1,1,ifelse(rnds3 < (p1+p2),2,ifelse(rnds3 < (p1+p2+p3),3,4)))) # p0- infection rate in placebo group with lowest category for S # trueRR- the relative risk (vaccine/placebo) for subjects with lowest category for S # beta1- the regr coeff for S in the risk model for category 2 vs 1 # beta2- the regr coeff for S in the risk model for category 3 vs 1 # beta3- the regr coeff for S in the risk model for category 4 vs 1 # beta1=beta2=beta3=0 corresponds to no surrogate value case # At least one of beta1, beta2, beta3 not = 0 corresponds to some surrogate value p0 <- .5 trueRR <- .5 beta1 <- 0 beta2 <- 0 beta3 <- 0 beta0 <- qnorm(p0) beta10 <- qnorm(trueRR*p0) - qnorm(p0) s1 <- ifelse(s==2,1,0) s2 <- ifelse(s==3,1,0) s3 <- ifelse(s==4,1,0) s1vacc <- s1[1:M1] s2vacc <- s2[1:M1] s3vacc <- s3[1:M1] s1plac <- s1[(M1+1):M] s2plac <- s2[(M1+1):M] s3plac <- s3[(M1+1):M] yvacc <- rbinom(M1,1,pnorm(beta0+beta10+beta1*s1vacc+beta2*s2vacc+beta3*s3vacc)) yplac <- rbinom(M2,1,pnorm(beta0+beta1*s1plac+beta2*s2plac+beta3*s3plac)) y <- c(yvacc,yplac) rx <- c(rep(1,M1),rep(0,M2)) # Sample all cases and 50% of controls (f = fraction of controls sampled) f <- .5 delta <- rep(1,M) delta[y==0] <- ifelse(runif(length(y[y==0]))<.5,1,0) # Remove s for placebo recipients: s[(M1+1):M] <- NA mydata <- data.frame(s,w,y,delta,rx) # specify number of bootstrap iterations boots <- 5 # link function (h(x,y) in paper) link <- "probit" # options "1-x/y", "log(x/y)", "x-y", "probit" # The probit link is h(x,y) = Phi^{-1}(x) - Phi^{-1}(y), # where Phi is the stdandard normal cdf print("") print("pse.R being executed") print(paste("Bootstraps:",boots)) print(paste("Link:",link));print("") ############################################################################################# # helper functions, variables, etc. ############################################################################################# mycep <- function(mles,cep="log(x/y)"){ eps <- 1e-06 mles <- mles + eps # avoid division by zero if (cep=="1-x/y") ans <- 1 - mles[(quants+1):(quants*2)]/mles[1:quants] if (cep=="log(x/y)") ans <- log(mles[(quants+1):(quants*2)]/mles[1:quants]) if (cep=="x-y") ans <- mles[(quants+1):(quants*2)] - mles[1:quants] if (cep=="probit") ans <- qnorm(mles[(quants+1):(quants*2)]) - qnorm(mles[1:quants]) ans } mypae <- function(mles,marg,cep="log(x/y)",wts="none"){ # cep(x,y)=log(x/y) if (wts=="none") weights <- rep(1,quants-1) if (wts=="linear") weights <- (2:quants) if (wts=="extreme") weights <- c(rep(0,quants-2),1) p.s1 <- sum(marg[2:quants]*weights) eps <- 1e-06 mles <- mles + eps # avoid division by zero if (cep=="1-x/y"){ eae <- sum( (1 - mles[(quants+2):(quants*2)]/mles[2:quants]) * marg[2:quants] * weights)/p.s1 ede <- 1 - mles[(quants+1)]/mles[1] } if (cep=="log(x/y)"){ eae <- sum( log(mles[(quants+2):(quants*2)]/mles[2:quants]) * marg[2:quants] * weights)/p.s1 ede <- log(mles[(quants+1)]/mles[1]) } if (cep=="x-y"){ eae <- sum( (mles[(quants+2):(quants*2)] - mles[2:quants]) * marg[2:quants] * weights)/p.s1 ede <- mles[(quants+1)] - mles[1] } if (cep=="probit"){ eae <- sum( (qnorm(mles[(quants+2):(quants*2)]) - qnorm(mles[2:quants])) * marg[2:quants] * weights)/p.s1 ede <- qnorm(mles[(quants+1)]) - qnorm(mles[1]) } pae <- abs(eae) / ( abs(ede) + abs(eae) ) list(eae=eae,ede=ede,pae=pae) } bd <- function(mles){ mu0 <- mean(mles[1:quants]) mu1 <- mean(mles[1:quants+quants]) t <- 0 for (jj in 2:quants) t <- t + (jj-1) * (mles[jj] - (mles[jj]+mles[jj+quants]) * (mu0/(mu0+mu1)) ) t } ############################################################################################# # wrapper for constrained optimizer ############################################################################################# myopt <- function(){ index <- is.na(mydata$s) coarse.s <- sort(unique(mydata$s[!index])) index <- is.na(mydata$w) coarse.w <- sort(unique(mydata$w[!index])) # calculate infections rates to be used below nvac <- sum(mydata$rx==1) npla <- sum(mydata$rx==0) infvac <- sum(mydata$rx==1 & mydata$y==1) infpla <- sum(mydata$rx==0 & mydata$y==1) # consistent estimates of joint distn of (s,w), marginal of s joint <- matrix(0,quants,quants) den <- sum(!is.na(mydata$s) & mydata$y==0) den1 <- sum(!is.na(mydata$s) & mydata$y==1) for (ss in 1:quants) for (ww in 1:quants){ num <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==0) num1 <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==1) joint[ss,ww] <- num/den*(1-infvac/nvac) + num1/den1*infvac/nvac } marg.s <- matrix(0,1,quants) for (ss in 1:quants) marg.s[ss] <- sum(joint[ss,])/sum(joint) # choose initial values beta0 <- c( rep(infpla/npla,quants), rep(infvac/nvac,quants) ) gamma0 <- matrix(0.1,1,quants-1) # effect of w in nonparametric model # all possible observed data combinations delta0 <- matrix(0,2,1) for (zz in 0:1) delta0[zz+1,1] <- sum(mydata$delta==0 & mydata$rx==zz) delta1z0 <- matrix(0,2,4) for (yy in 0:1) for (ww in 1:4) delta1z0[yy+1,ww] <- sum(mydata$delta==1 & mydata$rx==0 & mydata$y==yy & mydata$w==coarse.w[ww]) delta1z1 <- array(0,c(2,4,4)) for (yy in 0:1) for (ss in 1:4) for (ww in 1:4) delta1z1[yy+1,ss,ww] <- sum(mydata$delta==1 & mydata$rx==1 & mydata$y==yy & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww]) # likelihood like <- function(theta){ beta <- theta[1:8]; gamma <- c(theta[9:11],0) ans <- 0 # delta=0 # z=0,y=1 obg0 <- outer(beta[1:4],gamma,"+")*joint s0 <- sum(obg0) ans <- ans + log((1-s0)^delta0[1,1]) # z=1,y=1 obg1 <- outer(beta[1:4+quants],gamma,"+")*joint s1 <- sum(obg1) ans <- ans + log((1-s1)^delta0[2,1]) # delta=1, z=0 m1 <- matrix(1,1,4) arg1 <- m1 %*% obg0 ans <- ans + m1 %*% t(log(arg1^delta1z0[2,])) sj <- m1 %*% joint arg0 <- sj-arg1 ans <- ans + m1 %*% t(log(arg0^delta1z0[1,])) # delta=1, z=1 ans <- ans + sum(log((joint-obg1)^delta1z1[1,,])) ans <- ans + sum(log(obg1^delta1z1[2,,])) -ans # constrOptim is minimizing } # constraints ui <- matrix(0,43,11) ui[1:11,1:11] <- diag(11) counter <- 12 for (ii in 1:8) for (jj in 1:4){ ui[counter,ii] <- -1 if (jj<4) ui[counter,8+jj] <- -1 counter <- counter + 1 } ci <- c(rep(1e-8,11),rep(1e-8-1,32)) itheta <- c(beta0,gamma0) beta1 <- constrOptim(itheta,like,NULL,ui=ui,ci=ci,control=list(maxit=100),outer.iterations=100)$par ans <- list(mle=beta1[1:8],marg=marg.s,gamma=beta1[9:11]) } ############################################################################################# # call optimizer, bootstrap, compute results, etc. ############################################################################################# quants <- 4 parms <- quants*2 # call optimizer opt.out <- myopt() print("Optimization complete") opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4 # average risk estimate # compute estimates, test statistics, etc cep <- mycep(opt.out$mle,cep=link) pae <- matrix(0,3,1) pae[1] <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="none")$pae pae[2] <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="linear")$pae ext <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="extreme") pae[3] <- ext$pae as <- abs(ext$eae)-abs(ext$ede) bd.mle <- bd(opt.out$mle) # bootstrap print(paste("Begin",boots,"boostrap iterations")) n <- dim(mydata)[1] bd.boot <- matrix(0,boots,1) cep.boot <- matrix(0,boots,quants) pae.boot <- matrix(0,boots,3) as.boot <- matrix(0,boots,1) cep.ci.boot <- matrix(0,quants,2) cep.sd.boot <- matrix(0,quants,1) cep.mn.boot <- matrix(0,quants,1) bd.sd.boot <- 0 bd.mn.boot <- 0 as.ci.boot <- 0 as.sd.boot <- 0 as.mn.boot <- 0 pae.ci.boot <- matrix(0,3,2) pae.sd.boot <- matrix(0,3,1) pae.mn.boot <- matrix(0,3,1) o.data <- mydata for (ii in 1:boots){ samp <- sample(c(1:n),size=n,replace=T) mydata <- o.data[samp,] opt.out <- myopt() opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4 # average risk estimate cep.boot[ii,] <- mycep(opt.out$mle,cep=link) pae.boot[ii,1] <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="none")$pae pae.boot[ii,2] <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="linear")$pae ext <- mypae(opt.out$mle,opt.out$marg,cep=link, wts="extreme") pae.boot[ii,3] <- ext$pae as.boot[ii] <- abs(ext$eae)-abs(ext$ede) bd.boot[ii] <- bd(opt.out$mle) } mydata <- o.data print(paste("End",boots,"boostrap iterations")) print("") for (ii in 1:quants){ cep.sd.boot[ii] <- sd(cep.boot[,ii]) cep.mn.boot[ii] <- mean(cep.boot[,ii]) cep.ci.boot[ii,] <- quantile(cep.boot[,ii],c(.025,.975)) } bd.sd.boot <- sd(bd.boot) bd.mn.boot <- mean(bd.boot) for (ii in 1:3) pae.sd.boot[ii] <- sd(pae.boot[,ii]) for (ii in 1:3) pae.mn.boot[ii] <- mean(pae.boot[,ii]) for (ii in 1:3) pae.ci.boot[ii,] <- quantile(pae.boot[,ii],c(.025,.975)) as.ci.boot <- quantile(as.boot,c(.025,.975)) as.sd.boot <- sd(as.boot) as.mn.boot <- mean(as.boot) # finish up, printout results, etc print("CEP estimates, 95% confidence intervals, and two-sided p-values") for (ii in 1:quants){ p <- 2*pnorm(-abs(cep[ii]/cep.sd.boot[ii])) print(paste("CEP(",ii,",1)=",round(cep[ii],3), " (",round(cep.ci.boot[ii,1],3),", ",round(cep.ci.boot[ii,2],3),"), p=",round(p,5),sep="")) } print("");print("PAE estimates, 95% confidence intervals, and one-sided p-values") for (ii in 1:3){ p <- 1-pnorm((pae[ii]-.5)/pae.sd.boot[ii]) print(paste("PAE(w",ii,")=",round(pae[ii],3), " (",round(pae.ci.boot[ii,1],3),", ",round(pae.ci.boot[ii,2],3),"), p=",round(p,5),sep="")) } print("");print("AS estimate, 95% confidence interval, and one-sided p-value") p <- 1-pnorm(as/as.sd.boot) print(paste("AS=",round(as,3), " (",round(as.ci.boot[1],3),", ",round(as.ci.boot[2],3),"), p=",round(p,5),sep="")) print("");print("BD test statistic, estimated standard error, and one-sided p-value") bd.p <- 1-pnorm(bd.mle/bd.sd.boot) print(paste("BD=",round(bd.mle,3),", SE=",round(bd.sd.boot,3),", p=",round(bd.p,5),sep=""))