/*
 *  lmEQTL.c
 *
 *  Created by Wei Sun on 10/08/2007.
 *
 */

#include <stdio.h>
#include <stddef.h>
#include <time.h>
#include <string.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_multifit.h>
#include <R.h>
#include "utility.h"

int lmEQTL_slr (int* dims, double* ve, double* vm, int* ve_na, 
  int* vm_na, char** output, double* RP_cut, int* cis_only, 
  int* cis_distance, int* eChr, int* ePos, int* mChr, int* mPos, 
  double* Rtol, double* Rnna_percent, int* succeed)
{

  int i, j, k, c, pos_diff;
  double chisq0, t_stat;

  /* p-values of slope */
  double b_p;
  /* rank of model matrix */
  size_t rank;
  
  /* 
   * p-value frequencies
   * freqs[100] = #{ pval < P_cut }
   * i = 0:99
   * freqs[i]   = #{ pval < [i/100, (i+1)/100) }
   */
   
  double tmp;
  unsigned long freqs[101];
  for(i=0; i<=100; i++){
     freqs[i] = 0;
  }
  
  const int nrow_e = dims[0];
  const int nrow_m = dims[1];
  const int ncol   = dims[2];
  const double P_cut = RP_cut[0];
  const double tol = Rtol[0];
  const double nna_percent = Rnna_percent[0];
  
  Rprintf("\n------------------------------------------------------\n");
  Rprintf("dims=(%d, %d, %d)\t", nrow_e, nrow_m, ncol);
  Rprintf("P_cut=%e\n", P_cut);

  gsl_matrix * me = gsl_matrix_alloc (nrow_e, ncol); // gene expression matrix
  gsl_matrix * mm = gsl_matrix_alloc (nrow_m, ncol); // marker genotype matrix
  gsl_matrix_int * ie = gsl_matrix_int_alloc (nrow_e, ncol); // missing index
  gsl_matrix_int * im = gsl_matrix_int_alloc (nrow_m, ncol); // missing index
  
  gsl_vector * gd = gsl_vector_alloc (ncol); // gene expression data
  gsl_vector * md = gsl_vector_alloc (ncol); // marker genotype data
  gsl_vector_int * gi = gsl_vector_int_alloc (ncol); // missing value index
  gsl_vector_int * mi = gsl_vector_int_alloc (ncol); // missing value index
  
  // intercept to be used in model matrix
  gsl_vector * intercept = gsl_vector_alloc (ncol); 
  gsl_vector_set_all(intercept, 1.0);

  // the model matrix
  gsl_matrix * X = gsl_matrix_alloc (ncol, 2);
  gsl_matrix_view Xa;

  // reorganize the data into gsl_vector/matrix structure
  for(j=0; j<ncol; j++){
    for(i=0; i<nrow_e; i++){
      gsl_matrix_set(me, i, j, ve[j*nrow_e+i]);
      gsl_matrix_int_set(ie, i, j, ve_na[j*nrow_e+i]);
    }
  }
  
  for(j=0; j<ncol; j++){
    for(i=0; i<nrow_m; i++){
      gsl_matrix_set(mm, i, j, vm[j*nrow_m+i]);
      gsl_matrix_int_set(im, i, j, vm_na[j*nrow_m+i]);
    }
  }

  /**
   * model: g ~ add
   **/

  // coefficients different models
  gsl_vector * c0 = gsl_vector_alloc (2);
  gsl_vector * c0_p = gsl_vector_alloc (2);
  gsl_matrix * c0_cov = gsl_matrix_alloc (2, 2);

  // work space for linear model fitting
  gsl_multifit_linear_workspace * work;

  // output file handles
  FILE * fo, *ff;
    
  // time records
  time_t sec_s;
  time_t sec_e;

  sec_s  = time(NULL);
  
  /*
  Rprintf("matrix of gene expression[1, 1:8]\n");
  Rprint_matrix(me, 0, 0, 0, 7, 1);
  Rprintf("matrix of markers[1, 1:8]\n");
  Rprint_matrix_i(mm, 0, 0, 0, 7, 1);
  */
    
  fo  = fopen (output[0], "w");
  /**
   * model 0: g ~ add
   */

  fprintf(fo, "Gene_ID\tMarker_ID\t");
  fprintf(fo, "a\ta_p\tb\tb_p\tN\n");

  for(i=0; i<nrow_e; i++){ // the (i+1)-th gene expression profile
  
    if(i%1000 == 0){ Rprintf("i=%d\n", i); }
    gsl_vector_view gd_v = gsl_matrix_row (me, i);
    gsl_matrix_int_get_row (gi, ie, i);
    
    // check whether there is enough observations for gi
    c = 0;
    for(k = 0; k<ncol; k++){
      if(gsl_vector_int_get(gi,k) == 1){ c++; }
    }
    if(c < ncol * nna_percent){ continue; }
    
    for(j=0; j<nrow_m; j++){ // the (j+1)-th marker genotype profile
      if(*cis_only){
        if(eChr[i] != mChr[j]){
          continue;
        }
        pos_diff = ePos[i] - mPos[j];
        if(pos_diff > *cis_distance || pos_diff < - *cis_distance){
          continue;
        }
      }
      
      gsl_vector_view md_v = gsl_matrix_row (mm, j);
      gsl_matrix_int_get_row (mi, im, j);
            
      // after multiplication, mi is the index whether there is 
      // missing value, either gene expression or genotype
      gsl_vector_int_mul (mi, gi);
      
      c = 0;
      for(k = 0; k<ncol; k++){
        if(gsl_vector_int_get(mi,k) == 1){
          gsl_vector_set(gd, c, gsl_vector_get(&gd_v.vector,k));
          gsl_vector_set(md, c, gsl_vector_get(&md_v.vector,k));
          c++;
        }
      }

      // if there is not enough observations left, exit
      if(c < ncol * nna_percent){ continue; }
                
      // generate the model matrix
      // g = intercept + m.add
      gsl_matrix_set_col(X, 0, intercept);
      gsl_matrix_set_col(X, 1, md);

      // Rprint_matrix_i(X, 0, ncol-1, 0, 1, 0);

      // vector include non-missing values
      gsl_vector_view gd_a = gsl_vector_subvector(gd, 0, c);

      // model 0: g ~ add
      Xa  = gsl_matrix_submatrix(X, 0, 0, c, 2);
      work = gsl_multifit_linear_alloc (c,2);
      gsl_multifit_linear_svd (&Xa.matrix, &gd_a.vector, tol, 
                &rank, c0, c0_cov, &chisq0, work);
      gsl_multifit_linear_free(work);

      for(k=0; k<2; k++){
        t_stat = gsl_vector_get(c0,k)/sqrt(gsl_matrix_get(c0_cov,k,k));
        gsl_vector_set(c0_p,k,2*gsl_cdf_tdist_Q(fabs(t_stat),c-2));
      }
      
      b_p = gsl_vector_get(c0_p, 1);
      k = (int) (b_p / 0.01);
      freqs[k] += 1;
            
      if(b_p < P_cut){
        freqs[100] += 1;
        fprintf(fo, "%d\t%d\t", i+1, j+1);
        for(k=0; k<2; k++){
           fprintf(fo, "%.3f\t", gsl_vector_get(c0, k));
           fprintf(fo, "%.2e\t", gsl_vector_get(c0_p,k));
        }
        fprintf(fo, "%d\n", c);
      }
    }
  }
  
  fclose(fo);
  
  // print out the frequencies
  ff  = fopen(output[1], "w");
  tmp = 0.0;
  
  fprintf(ff, "<%.2e", P_cut);
  for(i=0; i<100; i++){
    fprintf(ff, "\t%.2f-%.2f", tmp, tmp+0.01);
    tmp += 0.01;
  }
  fprintf(ff, "\n");
  
  fprintf(ff, "%d", freqs[100]);
  for(i=0; i<100; i++){
    fprintf(ff, "\t%d", freqs[i]);
  }
  fprintf(ff, "\n");  
  fclose(ff);

  // finish up
  sec_e  = time(NULL);
  Rprintf("total time spent is %ld secs\n", sec_e-sec_s);
  Rprintf("------------------------------------------------------\n");

  gsl_matrix_free(me);
  gsl_matrix_free(mm);
  gsl_matrix_int_free(ie);
  gsl_matrix_int_free(im);
  gsl_vector_free(gd);
  gsl_vector_free(md);
  gsl_vector_int_free(gi);
  gsl_vector_int_free(mi);
  gsl_vector_free(c0);
  gsl_vector_free(c0_p);
  gsl_matrix_free(c0_cov);
  gsl_vector_free(intercept);
  gsl_matrix_free(X);
    
  *succeed = 1;
  return(1);
}

void lmEQTL_slr_max1 (int* dims, gsl_matrix * me, gsl_matrix * mm, 
  gsl_matrix_int * ie, gsl_matrix_int * im, double P_cut, int* best_m, 
  double* pval, int* cis_only, int* cis_distance, int* eChr, int* ePos, 
  int* mChr, int* mPos, double tol, double nna_percent)
{

  int i, j, k, c, pos_diff;
  double chisq0, t_stat;

  /* p-values of slope */
  double b_p, b_p_min;
  /* rank of model matrix */
  size_t rank;

  const int nrow_e = dims[0];
  const int nrow_m = dims[1];
  const int ncol   = dims[2];
  
  gsl_vector * gd = gsl_vector_alloc (ncol); // gene expression data
  gsl_vector * md = gsl_vector_alloc (ncol); // marker genotype data
  gsl_vector_int * gi = gsl_vector_int_alloc (ncol); // missing value index
  gsl_vector_int * mi = gsl_vector_int_alloc (ncol); // missing value index
  
  // intercept to be used in model matrix
  gsl_vector * intercept = gsl_vector_alloc (ncol); 
  gsl_vector_set_all(intercept, 1.0);

  // the model matrix
  gsl_matrix * X = gsl_matrix_alloc (ncol, 2);
  gsl_matrix_view Xa;

  /**
   * model: g ~ add
   **/

  // coefficients different models
  gsl_vector * c0 = gsl_vector_alloc (2);
  gsl_vector * c0_p = gsl_vector_alloc (2);
  gsl_matrix * c0_cov = gsl_matrix_alloc (2, 2);

  // work space for linear model fitting
  gsl_multifit_linear_workspace * work;
  
  for(i=0; i<nrow_e; i++){ // the (i+1)-th gene expression profile
    gsl_vector_view gd_v = gsl_matrix_row (me, i);
    gsl_matrix_int_get_row (gi, ie, i);
    
    // check whether there is enough observations for gi
    c = 0;
    for(k = 0; k<ncol; k++){
      if(gsl_vector_int_get(gi,k) == 1){ c++; }
    }

    pval[i] = 1.0;
    best_m[i] = -1;

    if(c < ncol * nna_percent){
      continue;
    }
    
    b_p_min = 1.0;
    for(j=0; j<nrow_m; j++){ // the (j+1)-th marker genotype profile
      if(*cis_only){
        if(eChr[i] != mChr[j]){
          continue;
        }
        pos_diff = ePos[i] - mPos[j];
        if(pos_diff > *cis_distance || pos_diff < - *cis_distance){
          continue;
        }
      }
      
      gsl_vector_view md_v = gsl_matrix_row (mm, j);
      gsl_matrix_int_get_row (mi, im, j);
            
      // after multiplication, mi is the index whether there is 
      // missing value, either gene expression or genotype
      gsl_vector_int_mul (mi, gi);
      
      c = 0;
      for(k = 0; k<ncol; k++){
        if(gsl_vector_int_get(mi,k) == 1){
          gsl_vector_set(gd, c, gsl_vector_get(&gd_v.vector,k));
          gsl_vector_set(md, c, gsl_vector_get(&md_v.vector,k));
          c++;
        }
      }

      // if there is not enough observations left, exit
      if(c < ncol * nna_percent){ continue; }
                
      // generate the model matrix
      // g = intercept + m.add
      gsl_matrix_set_col(X, 0, intercept);
      gsl_matrix_set_col(X, 1, md);

      // Rprint_matrix_i(X, 0, ncol-1, 0, 1, 0);

      // vector include non-missing values
      gsl_vector_view gd_a = gsl_vector_subvector(gd, 0, c);

      // model 0: g ~ add
      Xa  = gsl_matrix_submatrix(X, 0, 0, c, 2);
      work = gsl_multifit_linear_alloc (c,2);
      gsl_multifit_linear_svd (&Xa.matrix, &gd_a.vector, tol, 
                &rank, c0, c0_cov, &chisq0, work);
      gsl_multifit_linear_free(work);

      for(k=0; k<2; k++){
        t_stat = gsl_vector_get(c0,k)/sqrt(gsl_matrix_get(c0_cov,k,k));
        gsl_vector_set(c0_p,k,2*gsl_cdf_tdist_Q(fabs(t_stat),c-2));
      }
      
      b_p = gsl_vector_get(c0_p, 1);
      
      if(b_p < b_p_min){
        b_p_min = b_p;
        best_m[i] = j;
      }
    }
    
    pval[i] = b_p_min;
  }
  
  gsl_vector_free(gd);
  gsl_vector_free(md);
  gsl_vector_int_free(gi);
  gsl_vector_int_free(mi);
  gsl_vector_free(c0);
  gsl_vector_free(c0_p);
  gsl_matrix_free(c0_cov);
  gsl_vector_free(intercept);
  gsl_matrix_free(X);
    
}

int lmEQTL_slr_permute (int* dims, double* ve, double* vm, int* ve_na, 
  int* vm_na, double* RP_cut, int* best_m, double* pval, double* p_pval,
  int* cis_only, int* cis_distance, int* eChr, int* ePos, int* mChr, 
  int* mPos, int* trace, double* Rtol, double* Rnna_percent, int* Ridum, 
  int* Rgrp, int* Rngrp, int* Rgrpn, int* Rgrps, int* succeed)
{
  int i, j, p, q, *best_m0, *perm1;
  double *pval0;
  time_t timer;
  long idum = (long) *Ridum;
  int grp  = Rgrp[0];
  int ngrp = Rngrp[0];
  if(ngrp==0){ ngrp = 1; }

  const int nrow_e = dims[0];
  const int nrow_m = dims[1];
  const int ncol   = dims[2];
  const int npermute = dims[3];
  perm1   = (int *)R_alloc(ncol, sizeof(int));
  best_m0 = (int *)R_alloc(nrow_e, sizeof(int));
  pval0   = (double *)R_alloc(nrow_e, sizeof(double));
 
  const double P_cut = RP_cut[0];
  const double tol = Rtol[0];
  const double nna_percent = Rnna_percent[0];
  
  Rprintf("\n------------------------------------------------------\n");
  Rprintf("dims=(%d, %d, %d, %d)\n", nrow_e, nrow_m, ncol, npermute);
  //Rprintf("P_cut=%e\n", P_cut);

  int **grps;
  grps = (int**) R_alloc(ngrp, sizeof(int*));
  grps[0] = (int*) R_alloc(ngrp*ncol, sizeof(int));
  for(i=1; i<ngrp; i++){
  	grps[i] = grps[0] + i*ncol;
  }
    
  if(grp){
    Rprintf("permutation by grpup, ngrp=%d\n", ngrp);

    // reorganize the data into gsl_vector/matrix structure
    for(j=0; j<ncol; j++){
      for(i=0; i<ngrp; i++){
        grps[i][j] = Rgrps[j*ngrp+i];
      }
    }
    /*
    for(i=0; i<ngrp; i++){
    	for(j=0; j<ncol; j++){
    		Rprintf("%d ", grps[i][j]);
    	}
    	Rprintf("\n");
    }
    exit(99);
    */
  }
  
      
  gsl_matrix * me = gsl_matrix_alloc (nrow_e, ncol); // gene expression matrix
  gsl_matrix * mm = gsl_matrix_alloc (nrow_m, ncol); // marker genotype matrix
  gsl_matrix_int * ie = gsl_matrix_int_alloc (nrow_e, ncol); // missing index
  gsl_matrix_int * im = gsl_matrix_int_alloc (nrow_m, ncol); // missing index
  
  // permuted data
  gsl_matrix * pme = gsl_matrix_alloc (nrow_e, ncol); // gene expression matrix
  gsl_matrix_int * pie = gsl_matrix_int_alloc (nrow_e, ncol); // missing index

  // reorganize the data into gsl_vector/matrix structure
  for(j=0; j<ncol; j++){
    for(i=0; i<nrow_e; i++){
      gsl_matrix_set(me, i, j, ve[j*nrow_e+i]);
      gsl_matrix_int_set(ie, i, j, ve_na[j*nrow_e+i]);
    }
  }
  
  for(j=0; j<ncol; j++){
    for(i=0; i<nrow_m; i++){
      gsl_matrix_set(mm, i, j, vm[j*nrow_m+i]);
      gsl_matrix_int_set(im, i, j, vm_na[j*nrow_m+i]);
    }
  }
  
  timer=time(NULL);
  Rprintf("unpermuted data %s\n",asctime(localtime(&timer)));

  lmEQTL_slr_max1 (dims, me, mm, ie, im, P_cut, best_m, pval, cis_only, 
    cis_distance, eChr, ePos, mChr, mPos, tol, nna_percent);
  
  for(p=1; p<=npermute; p++){
    timer=time(NULL);
    
    if(*trace > 0){
      if(p % *trace == 0){
        Rprintf("%d-th permutation %s\n",p,asctime(localtime(&timer)));
      }
    }
    
    if(grp){
    	getPermuteGrp(perm1, ncol, &idum, grps, Rgrpn, ngrp);    	
    }else{
      getPermute(perm1, ncol, &idum);
    }
    /*
    for(q=0; q<ncol; q++){
      Rprintf("%d ", perm1[q]);
    }
    Rprintf("\n");
    */
    permute(me, pme, ie, pie, perm1, nrow_e, ncol);

    lmEQTL_slr_max1 (dims, pme, mm, pie, im, P_cut, best_m0, pval0, cis_only, 
      cis_distance, eChr, ePos, mChr, mPos, tol, nna_percent);
      
    for(q=0; q<nrow_e; q++){
      if(pval0[q] <= pval[q]){ p_pval[q] += 1.0; }
    }
  }
  
  for(q=0; q<nrow_e; q++){
    p_pval[q] /= npermute;
  }
  
  gsl_matrix_free(me);
  gsl_matrix_free(pme);
  gsl_matrix_free(mm);
  gsl_matrix_int_free(ie);
  gsl_matrix_int_free(pie);
  gsl_matrix_int_free(im);

  *succeed = 1;
  return(1);
}
