/*------------------------------------------------------------------------------------------------*
| PROGRAM: 	cr_npmle_8.sas
| AUTHOR: 	Michael Hudgens		
| EMAIL: 	mhudgens@bios.unc.edu
| WEB:		http://www.bios.unc.edu/~mhudgens/soft.html		
| DATE: 	1 April 2004
|
| 	This is a Proc IML macro to compute nonparametric maximum likelhood estimator
| 	for interval censored competing risks survival data based on
|	Hudgens, Satten and Longini (Biometrics, March 2001). 
|	Contrary to the paper's EM algorithm, maximization is done via an IML constrained
|	optimization routine based on Quasi-Newton Methods. Confidence intervals
|	are computed via bootstrap as described in Hudgens, Satten and Longini.
|
|	Note: 	1. present version does not allow for truncation 
|		2. as in the Biometrics paper, intervals are assumed to be of the form [l, r].
|		if intervals of the form (l, r] or [l, r) are desired, it is suggested that a small
|		number be added or subtracted to either the left or right endpoint, respectively, of all intervals.
|	
| INPUT PARAMETERS, DATA and OUTPUT DATA:
|
|       &data_in should be a matrix with each row corresponding to a 
|	subject and the 3 columns containing variables l r type, which are the left and right interval endpoints
|	and the failure type. Failures with unknown type (including right censored observations)
|	should have failure type equal to 0. Other failure types should be numbered 1, 2, ...
|	up to the total number of failure types present in the data. For right censored observations,
|	the right endpoint of the interval should be set to a large number exceeding the right endpoint
|	of all observations that are not right censored. An example data set:
|
|		data zido;
|		  input l r type;
|		  cards;
|		  1 	112 	1
|		  30 	33 	1
|		  5 	255 	2
|		  320 	400 	2
|		  50	10000	0
|	
|       &times_in is a set of exact time points at which estimates and confidence
|	intervals are returned
|
|       &boots  is the number of bootstrap samples drawn for CI's. 
|	--> Let &boots=0 if no bootstrap samples are desired.
|
|       &est_out is an output data set containing the variables:
|		FTYPE    Q_DATA    P_DATA      MASS        CIF
|		where FTYPE is the failure type,
|		[Q_DATA, P_DATA] is an interval potentially containing mass,
|		MASS is the amount of mass contained in the interval, and
|		CIF is the cumulative incidence function at the right endpoint.
|
|       &times_out is an output data set containing the variables:
|		FTYPE    TIMES      LEST       UEST       L_CI       U_CI 	SE
|		where FTYPE is the failure type,
|		TIMES is one of the specific time points given in &times_in,
|		LEST is the lower bound on the cumulative incidence function at time point,
|		UEST is upper bound on time point,
|		L_CI is the lower 95% confidence interval based on bootstrap iterations, 
|		U_CI is the upper 95% confidence interval based on bootstrap iterations, and		
|		SE is the estimate of the standard error based on bootstrap iterations.
|
|       &boot_out contains each bootstrap iteration's estimate of time points specified
|	in &times_in. These values are returned primarily for use in the accompanying 
|	macro: cr_net_2.sas. Note there are two values returned for each time point - a lower
|	and upper bound - see Biometrics paper for details.	
|	--> If the user choses, &boot_out can be left blank to conserve memory.
|
|	An example call to the macro:
|
|		%cr_npmle(	data_in=zido, 	times_in  = {48 102 187 366 568 730}  , boots=0,
|				est_out=zest,	times_out=ztest,			boot_out=)
|
|
| Comment:	If largest observation has unknown failure type the NPMLE is non-unique beyond the largest observation time
|		In this scenario, assign all mass to first cumulative incidence function
|			 
|
| DISCLAIMER:
| 
| 	THIS INFORMATION IS PROVIDED PROVIDED "AS IS".  THERE ARE NO
| 	WARRANTIES, EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR
| 	FITNESS FOR A PARTICULAR PURPOSE REGARDING THE ACCURACY OF
| 	THE MATERIALS OR CODE CONTAINED HEREIN.
|
|
*------------------------------------------------------------------------------------------------*/


%macro 	cr_npmle(data_in=,times_in=,boots=, est_out=,times_out=,boot_out=);
        
      use &data_in;
      read all var{l r type};
      nobs= nrow(l);
      J=max(type); 
 
       /********************************************************
         module to get support (maximal cliques)
       ********************************************************/
      start mclique(l,r,type) global(_x, nobs, nparm, J, p, q, m);	
   
            
   	   ppp= unique(r); nppp= ncol(ppp);
   	   q= j(J,nppp, .);
   	   m= j(J,1, .);
   	   p= j(J,nppp, .);

	* is largest observation right censored? if yes, flag=1;
	* check (1) by looking at largest r from observations with known type;
	* and  (2) by looking at largest l from right censored observations;
	flag=1;		
	
	ind=type>0;
	rmx=max(r[loc(ind=1)]);
	lmx=max(l);
	if (rmx>lmx) then flag=0;

	ind=(l=lmx & type>0);
	if (sum(ind)>0) then flag=0;

	if flag=1 then do;
		print("Largest observation has unknown failure type for dataset: &data_in.");
		print("All mass assigned to cumulative incidence function for type 1 failure.");
		print("Subsequent warnings concerning uniqueness are regarding earlier time points.");
		end;

  
      do tt = 1 to J;
	      
	      * get unique r values for type tt or 0 events;
	      ind=(type=tt | type=0);
	      if (flag=1 & tt>1) then ind=((type=tt | type=0) & l<rmx);
	
	      pp=unique(r[loc(ind=1)]); npp= ncol(pp);	
	      qq=unique(l[loc(ind=1)]); nqq= ncol(qq);
        	 	
	    	do;
	         do i= 1 to npp;
	            do jj= 1 to nqq;
		               if ( qq[jj] <= pp[i] ) then q[tt,i]= qq[jj];
	            end;
	            if q[tt,i] = qq[nqq] then goto lab1;
	         end;
	         lab1:
		end;
		
	      if i > npp then nq= npp;
	      else            nq= i;
	      
	      m[tt]=ncol(unique(q[tt,1:nq]));
      	      q[tt,1:m[tt]] = unique(q[tt,1:nq]);
      	    
      	        do i= 1 to m[tt];
                do jj= npp to 1 by -1;
                  if ( pp[jj] >= q[tt,i] ) then p[tt,i]= pp[jj];
                end;
	      end;	
	
		
	end;	/* tt = 1 to J */      
	nparm=sum(m);
       
       finish mclique;
	  	
      /********************************************************
         module to assign _x matrix for likelihood
       ********************************************************/
       start assign_x(l,r,type) global(_x, nobs, nparm, J, m, q, p);	
        	
     		 _x= j(nobs, nparm, 0);
    
      		counter=1;
		do tt = 1 to J;
	     	  do jj= 1 to m[tt];
         		_x[,counter]= choose(	(l <= q[tt,jj]  & p[tt,jj] <= r) & 
         					(tt=type | type=0), 1, 0);
      			counter=counter+1;
      		  end;
      		end;
        finish assign_x;
      
      
      /********************************************************
        log-likelihood function (LL)
      ********************************************************/
      start LL(theta) global(_x,nparm);
	  f=-1000000;
         if (min (_x*theta`) >0) then do;  
         	xlt= log(_x * theta`);
         	f= xlt[+];
          	end;
             return(f);
      finish LL;
      
    
      /********************************************************
        gradient vector (GRAD)
      *******************************************************/
      start GRAD(theta) global(_x,nparm);
         g= j(1,nparm,1000);
         if (min (_x*theta`) >0) then do;  
        	 tmp= _x # (1 /  (_x * theta`) );
        	 g= tmp[+,];
         end;	 
         return(g);
      finish GRAD;

     /********************************************************
        constraints and initial vector module
      *******************************************************/
      start init;        
        con= j(3, nparm + 2, .);
        con[1, 1:nparm]= 1.e-8;		/* lower bound on parameter ranges */
        con[2:3, 1:nparm]= 1;		/* upper bound on parameter ranges + part of linear constraint */
        con[3,nparm + 1]=0;		/* linear constraint is an equality constraint */
        con[3,nparm + 2]=1;		/* linear constraint equals 1 */
        x0= j(1, nparm, 1/nparm);				* initial estimates ;
       finish init;

      /*************************************************************
        estimate the masses using quasi-newton technique
      *************************************************************/
      optn= {1 0};			/* options */
      tc={400 1000};
        
      call mclique(l,r,type);				* get maximal cliques matrix ;
      call assign_x(l,r,type);				* assign x matrix ;
      call init;
      call nlpqn(rc,rx,"LL",x0,optn,con,tc,,,"GRAD"); 	* call the optimization routine ;
     
     /*************************************************************
       check uniqueness of npmle via projected hessian 
       cf. hudgens (emory dissertation, 2000), gentleman and geyer (1994, biometrika), fletcher (1987)
       lack of uniqueness can only arise from right censored observations
      ************************************************************/
       start unique_pg;
	
		* get columns of _x that correspond to possible paths consistent with data;
		i=loc(rx>(con[1,1]*100));
		proj_x=_x[,i];	
		call svd(u,svdq,v,proj_x);
		rank_proj_x=nrow(svdq[loc(svdq>1E-14)]);
		if (rank_proj_x<ncol(proj_x)) then do;
    			print "Warning: projected hessian is not negative definite.";
    			print "NPMLE not unique for dataset: &data_in.";
    		end;
    		
     	finish UNIQUE_PG;
    	
    	call unique_pg;	

      /*************************************************************
       Cumulative incidence function estimates
      ************************************************************/
	start ciest;
   	  mm=cusum(m);
          sdf= j(J,max(m), .);
    	  sdf[1,1:m[1]]=cusum(rx[1:m[1]])`;
    	  do tt = 2 to J;
	    sdf[tt,1:(mm[tt]-mm[tt-1])] = cusum(rx[(mm[tt-1]+1):mm[tt]])`;
	  end;
	finish ciest; 
	
	call ciest;
	   

       /*************************************************************
       write estimates of masses to &est_out
      *************************************************************/
  		
	out_mat=j(nparm,5,.);	* failure type, left endpoint, right endpoint, mass, cumulative incidence function;
	out_mat[1:m[1],1]=1;
	out_mat[1:m[1],2]=q[1,1:m[1]]`;
	out_mat[1:m[1],3]=p[1,1:m[1]]`;
	out_mat[1:m[1],4]=rx[1:m[1]];
	out_mat[1:m[1],5]=sdf[1,1:m[1]]`;
	
	do tt = 2 to J;
	    out_mat[(mm[tt-1]+1):mm[tt],1] = tt;
	    out_mat[(mm[tt-1]+1):mm[tt],2] = q[tt,1:m[tt]]`;
	    out_mat[(mm[tt-1]+1):mm[tt],3] = p[tt,1:m[tt]]`;
	    out_mat[(mm[tt-1]+1):mm[tt],4] = rx[(mm[tt-1]+1):mm[tt]];
	    out_mat[(mm[tt-1]+1):mm[tt],5] = sdf[tt,1:m[tt]]`;
	end;
	
	ftype=out_mat[,1];	q_data=out_mat[,2];	p_data=out_mat[,3];
	mass=out_mat[,4];	cif=out_mat[,5];
	
	create &est_out var {ftype q_data p_data mass cif};	
	    append;
	close &est_out;
    
    	
    /**************************************************************
     cumulative incidence function estimate and confidence intervals
     for specific time points &times_in 
    *************************************************************/   
      time=&times_in;
      ntimes= ncol(time);

      lower=j(ntimes*J,1+&boots,0);	* get upper and lower bounds for cif at relevant time point ;
      upper=j(ntimes*J,1+&boots,0);
      times=j(ntimes*J,1,0);
      ftype=j(ntimes*J,1,0);
      iterate=1;
      
      start lowup;

      do tt=1 to ntimes;
       do jj=1 to J;   
       	
       	counter=tt+ (jj-1)*ntimes;
        
        do ss=1 to m[jj]-1;
     
       		if (q[jj,ss]<time[1,tt] & time[1,tt]<p[jj,ss]) then do;
       			if (ss>1) then lower[counter,iterate]=sdf[jj,ss-1];
       			upper[counter,iterate]=sdf[jj,ss];
       			end;

       		if (p[jj,ss]<=time[1,tt] & time[1,tt]<=q[jj,ss+1]) then do;
       			lower[counter,iterate]=sdf[jj,ss];
       			upper[counter,iterate]=sdf[jj,ss];
       			end;
       	   		
       		end;	

       	if (q[jj,m[jj]]<time[1,tt] & time[1,tt]<p[jj,m[jj]]) then do;
       		lower[counter,iterate]=sdf[jj,m[jj]-1];
       		upper[counter,iterate]=sdf[jj,m[jj]];
       		end;
       	
       	if (p[jj,m[jj]]<=time[1,tt]) then do;
       		lower[counter,iterate]=sdf[jj,m[jj]];
       		upper[counter,iterate]=sdf[jj,m[jj]];
       		end;
       			
       			
	if (iterate=1) then times[counter,1]=time[1,tt];
	if (iterate=1) then ftype[counter,1]=jj;
 
         end;
       end;
      finish lowup;
    
      call lowup;
     
     /**************************************************************
     	bootstrap  and quantiles
    *************************************************************/   
	l_ci=j(ntimes*J,1,0);
        u_ci=j(ntimes*J,1,0);
	se=j(ntimes*J,1,0);
	  
    	if &boots>0 then do;
     	do h=1 to &boots;
    		index=j(nobs,1,0);
	  	do i=1 to nobs;
   	 		index[i,1]=int(uniform(1)*nobs+1);
		end;
    	
		ltemp=l[index];
		rtemp=r[index];
		typetemp=type[index];
	  	
	       	
	 	call mclique(ltemp,rtemp,typetemp);		* get maximal cliques matrix ;
      		call assign_x(ltemp,rtemp,typetemp);		* assign x matrix ;
     		call init;      	
     		call nlpqn(rc,rx,"LL",x0,optn,con,tc,,,"GRAD"); 	* call the optimization routine ;
     		call ciest;
      		iterate=h+1;
		call lowup;  	
		
		end;
	
	start QUANT(ans,arg,q);
		temp=arg;
		temp[1,rank(arg)]=arg;
		nq=ncol(arg);
		qindex=int(q*nq);
		qmod=mod(q*nq,1);
		
		ans=min(temp);
		if (qindex>0 & qmod=0) then ans=temp[1,qindex];
		if (qindex>0 & qmod>0) then ans=(temp[1,qindex]+temp[1,qindex+1])/2;
	finish QUANT;
	
	   start STE(ans,arg);
	      nq=ncol(arg);
	      mean=sum(arg)/nq; 
	      smean=arg-repeat(mean,1,nq);            
	      ss=sum(smean#smean) ;                 
	      ans=sqrt(ss/(nq-1));         
	   finish STE;
	   
	   
	 do tt=1 to ntimes;
	   do jj=1 to J;  
       
      		counter=tt+ (jj-1)*ntimes;    

		 call QUANT(temp,lower[counter,2:(&boots+1)],.025); 	l_ci[counter,1]=temp;
		 call QUANT(temp,upper[counter,2:(&boots+1)],.975); 	u_ci[counter,1]=temp;
	 	 call STE(temp,upper[counter,2:(&boots+1)]); 	se[counter,1]=temp;
	 
	  end;
	end;        
	
	 bounds=j(ntimes*J,&boots*2,0);
         bounds[,1:&boots]=lower[,2:(&boots+1)];
         bounds[,(&boots+1):(&boots*2)]=upper[,2:(&boots+1)];
     
   	%if "&boot_out" ne "" %then %do;
   	create &boot_out from bounds;	
		   append from bounds;
	 close &boot_out;
	 %end;

	end;
    	
     /**************************************************************
     	write estimates and bootstrap confidence intervals for timepoints of interest
    *************************************************************/

    lest=lower[,1]; uest=upper[,1];	
    create &times_out var {ftype times lest uest l_ci u_ci se};	
     append;
    close &times_out;
        

 %mend  cr_npmle;