Measuring performance of Matrix eQTL and other eQTL software

For BLAS performance testing see this page.

This page contains details on how we measured the performance of Matrix eQTL and other eQTL methods. We consider it very important that all details are clear, so that any one can generate the same dataset, run the same scripts, and observe the same results.

The cystic fibrosis data set used for testing has not been made public at this moment, so we provide an artificial data set with the same number of samples and variables:

7zip logo icon  Big sample data set (7zip archive external link).

The test data set was generated to have 2,201 genes and 57,333 SNPs over 840 samples, thus having about 10% of genes, 10% SNPS, and full set of samples of the cystic fibrosis dataset. The dataset was generated with a fixed seed of random number generator. The following Matlab code generates the data and saves it in all required formats.

Matlab code for generating the test dataset.

The analysis time for the whole dataset is then estimated as 100 times the time to analyze the test set. It took Matrix eQTL only 9 to 12 seconds to analyze the test dataset, which is too small to make precise prediction about the full dataset. Thus, I ran Matrix eQTL on the full dataset as well.

Where possible, the analysis time and the time requred to load the data were measured separately. In other instances, the 'load' time and the 'load+analysis' time were measured and the time of eQTL analysis was estimated as the difference between those.

The following computer was used for the tests: CPU: Intel Xeon X3430 (2.4 ghz, 4 cores, 38.4 gflop); RAM: 16 GB DDR3; OS: Windows 7 (64 bit);

The following software was used in the testing:

It is known that file read time depend greatly on whether the file is in the system file cache. To reduce this randomness, I ran the following scripts before each respective performance tests. Each script was saved as 1_cache_data.bat file in the respective folder.

Matrix eQTL copy Testing_cvrt.txt nul
copy Testing_genes.txt nul
copy Testing_snps.txt nul
Merlin copy Testing.dat nul
copy Testing.map nul
copy Testing.ped nul
Plink copy Testing.map nul
copy Testing.ped nul
copy TestingCvrt.txt nul
copy TestingPheno.txt nul
R/qtl copy Testing_cvrt.txt nul
copy Testing_genes.csv nul
copy Testing_snps.csv nul
snpMatrix copy ..\Plink\Testing.map nul
copy ..\Plink\Testing.ped nul
copy ..\Plink\TestingCvrt.txt nul
copy ..\Plink\TestingPheno.txt nul

Each tool, except FastMap, was ran in a script mode. The scripts are as follows.

Matrix eQTL, Matlab

Batch file:

cls
cd Matrix_eQTL

call 1_cache_data.bat
matlab -nodesktop -nojvm -wait -r "covariates_file_name='Testing_cvrt.txt';timeto='..\1_Matrix_eQTL_Matlab_cvrt_with.txt';Matrix_eQTL_run;quit()"

call 1_cache_data.bat
matlab -nodesktop -nojvm -wait -r "covariates_file_name=[];timeto='..\1_Matrix_eQTL_Matlab_cvrt_no.txt';Matrix_eQTL_run;quit()"

cd ..

Matlab code (Matrix_eQTL_run.m):

% Matrix eQTL by Andrey A. Shabalin
% http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
%

% clear;clc;
% Load the library
addpath C:\AllWorkFiles\aaa\Matrix_eQTL\ALL_Work\Matrix_eQTL_Matlab\

%% Settings

% Linear model to use, modelANOVA or modelLINEAR
useModel = modelLINEAR; % modelANOVA or modelLINEAR

% Genotype file name
SNP_file_name = 'Testing_snps.txt';

% Gene expression file name
expression_file_name = 'Testing_genes.txt';

% Covariates file name
% Set to [] for no covariates
% covariates_file_name = [];
% covariates_file_name = 'Testing_cvrt.txt';

% Output file name
output_file_name = 'eQTL_results_M_cvrt.txt';

% Only associations significant at this level will be saved
pvOutputThreshold = 1e-5;

% Error covariance matrix
% Set to [] for identity.
errorCovariance = [];


%% Load genotype data
tic_load = tic;
snps = SlicedData;
snps.fileDelimiter = char(9); % the TAB character
snps.fileOmitCharacters = 'NA'; % denote missing values;
snps.fileSkipRows = 1; % one row of column labels
snps.fileSkipColumns = 1; % one column of row labels
snps.fileSliceSize = 10000; % read file in pieces of 10,000 rows
snps.LoadFile( SNP_file_name );

%% Load gene expression data

gene = SlicedData;
gene.fileDelimiter = char(9); % the TAB character
gene.fileOmitCharacters = 'NA'; % denote missing values;
gene.fileSkipRows = 1; % one row of column labels
gene.fileSkipColumns = 1; % one column of row labels
gene.fileSliceSize = 10000; % read file in pieces of 10,000 rows
gene.LoadFile( expression_file_name );

%% Load covariates

cvrt = SlicedData;
cvrt.fileDelimiter = char(9); % the TAB character
cvrt.fileOmitCharacters = 'NA'; % denote missing values;
cvrt.fileSkipRows = 1; % one row of column labels
cvrt.fileSkipColumns = 1; % one column of row labels
cvrt.fileSliceSize = snps.nCols + 1; % read file in one piece
if(~isempty(covariates_file_name))
    cvrt.LoadFile( covariates_file_name );
end;

tLoad = toc(tic_load);

%% Run the analysis
tStart = tic;
Matrix_eQTL_engine( snps, ...
gene, ...
cvrt, ...
output_file_name, ...
pvOutputThreshold, ...
useModel, ...
errorCovariance, ...
true);
tEQTL = toc(tStart);
tt = [tLoad; tEQTL];
save(timeto, 'tt', '-ascii');

Matrix eQTL, R (all versions of R)

Batch file:

cls
cd Matrix_eQTL

call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla Matrix_eQTL_local_cvrt_with.r > ..\1_Matrix_eQTL_R_cvrt_with_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.14.1_BLAS\bin\x64\Rscript.exe --vanilla Matrix_eQTL_local_cvrt_with.r > ..\1_Matrix_eQTL_R_cvrt_with_R_Japan.txt

call 1_cache_data.bat
..\..\R-2.14.1\bin\x64\Rscript.exe --vanilla Matrix_eQTL_local_cvrt_with.r > ..\1_Matrix_eQTL_R_cvrt_with_R_regular.txt

call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla Matrix_eQTL_local_cvrt_no.r > ..\1_Matrix_eQTL_R_cvrt_no_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.14.1_BLAS\bin\x64\Rscript.exe --vanilla Matrix_eQTL_local_cvrt_no.r > ..\1_Matrix_eQTL_R_cvrt_no_R_Japan.txt

call 1_cache_data.bat
..\..\R-2.14.1\bin\x64\Rscript.exe --vanilla Matrix_eQTL_local_cvrt_no.r > ..\1_Matrix_eQTL_R_cvrt_no_R_regular.txt

cd ..

Code file for no covariates (Matrix_eQTL_local_cvrt_no.r):

# Matrix eQTL by Andrey A. Shabalin
# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
#
# Be sure to use an up to date version of R.
#
setwd('C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/Matrix_eQTL/');
library(methods)
library(MatrixEQTL)

## Settings

# Linear model to use, modelANOVA or modelLINEAR
useModel = modelLINEAR; # modelANOVA or modelLINEAR

# Genotype file name
SNP_file_name = 'Testing_snps.txt';

# Gene expression file name
expression_file_name = 'Testing_genes.txt';

# Covariates file name
# Set to character() for no covariates
# covariates_file_name = character();
covariates_file_name = character();

# Output file name
output_file_name = 'eQTL_results_R.txt';

# Only associations significant at this level will be output
pvOutputThreshold = 1e-5;

# Error covariance matrix
# Set to character() for identity.
errorCovariance = numeric();
# errorCovariance = read.table("Sample_Data/errorCovariance.txt");


## Load genotype data
tic_load = proc.time()[3];

snps = SlicedData$new();
snps$fileDelimiter = "\t"; # the TAB character
snps$fileOmitCharacters = 'Nh' ;# denote missing values;
snps$fileSkipRows = 1; # one row of column labels
snps$fileSkipColumns = 1; # one column of row labels
snps$fileSliceSize = 10000; # read file in pieces of 10,000 rows
snps$LoadFile(SNP_file_name);

## Load gene expression data

gene = SlicedData$new();
gene$fileDelimiter = '\t'; # the TAB character
gene$fileOmitCharacters = 'NA'; # denote missing values;
gene$fileSkipRows = 1; # one row of column labels
gene$fileSkipColumns = 1; # one column of row labels
gene$fileSliceSize = 10000; # read file in pieces of 10,000 rows
gene$LoadFile(expression_file_name);

## Load covariates

cvrt = SlicedData$new();
cvrt$fileDelimiter = '\t'; # the TAB character
cvrt$fileOmitCharacters = 'NA'; # denote missing values;
cvrt$fileSkipRows = 1; # one row of column labels
cvrt$fileSkipColumns = 1; # one column of row labels
cvrt$fileSliceSize = snps$nCols()+1; # read file in one piece
if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name);
}

toc_load = proc.time()[3];
#cat('eQTL time: ', toc_load-tic_load, ' sec\n');

## Run the analysis
{
    tic_eqtl = proc.time()[3];
    Matrix_eQTL_engine(snps, gene,     cvrt,output_file_name,pvOutputThreshold,useModel, errorCovariance, verbose=TRUE);
    toc_eqtl = proc.time()[3];
}
# cat('eQTL time: ', toc_eqtl-tic_eqtl, ' sec\n');
cat('\n\n');
show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

Code file for model with 10 covariates (Matrix_eQTL_local_cvrt_with.r):

# Matrix eQTL by Andrey A. Shabalin
# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
#
# Be sure to use an up to date version of R.
#
setwd('C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/Matrix_eQTL/');
library(methods)
library(MatrixEQTL)


## Settings

# Linear model to use, modelANOVA or modelLINEAR
useModel = modelLINEAR; # modelANOVA or modelLINEAR

# Genotype file name
SNP_file_name = 'Testing_snps.txt';

# Gene expression file name
expression_file_name = 'Testing_genes.txt';

# Covariates file name
# Set to character() for no covariates
# covariates_file_name = character();
covariates_file_name = 'Testing_cvrt.txt';

# Output file name
output_file_name = 'eQTL_results_R_cvrt.txt';

# Only associations significant at this level will be output
pvOutputThreshold = 1e-5;

# Error covariance matrix
# Set to character() for identity.
errorCovariance = numeric();
# errorCovariance = read.table("Sample_Data/errorCovariance.txt");


## Load genotype data
tic_load = proc.time()[3];

snps = SlicedData$new();
snps$fileDelimiter = "\t"; # the TAB character
snps$fileOmitCharacters = 'Nh' ;# denote missing values;
snps$fileSkipRows = 1; # one row of column labels
snps$fileSkipColumns = 1; # one column of row labels
snps$fileSliceSize = 10000; # read file in pieces of 10,000 rows
snps$LoadFile(SNP_file_name);

## Load gene expression data

gene = SlicedData$new();
gene$fileDelimiter = '\t'; # the TAB character
gene$fileOmitCharacters = 'NA'; # denote missing values;
gene$fileSkipRows = 1; # one row of column labels
gene$fileSkipColumns = 1; # one column of row labels
gene$fileSliceSize = 10000; # read file in pieces of 10,000 rows
gene$LoadFile(expression_file_name);

## Load covariates

cvrt = SlicedData$new();
cvrt$fileDelimiter = '\t'; # the TAB character
cvrt$fileOmitCharacters = 'NA'; # denote missing values;
cvrt$fileSkipRows = 1; # one row of column labels
cvrt$fileSkipColumns = 1; # one column of row labels
cvrt$fileSliceSize = snps$nCols()+1; # read file in one piece
if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name);
}

toc_load = proc.time()[3];
#cat('eQTL time: ', toc_load-tic_load, ' sec\n');

## Run the analysis
{
    tic_eqtl = proc.time()[3];
    Matrix_eQTL_engine(snps, gene,     cvrt,output_file_name,pvOutputThreshold,useModel, errorCovariance, verbose=TRUE);
    toc_eqtl = proc.time()[3];
}
# cat('eQTL time: ', toc_eqtl-tic_eqtl, ' sec\n');
cat('\n\n');
show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

R/qtl

Batch file:

cls
cd Rqtl

call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla test_cvrt_no.r > ..\1_Rqtl_cvrt_no_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.14.1_BLAS\bin\x64\Rscript.exe --vanilla test_cvrt_no.r > ..\1_Rqtl_cvrt_no_Japan.txt

call 1_cache_data.bat
..\..\R-2.14.1\bin\x64\Rscript.exe --vanilla test_cvrt_no.r > ..\1_Rqtl_cvrt_no_R_Regular.txt


call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla test_cvrt_with.r > ..\1_Rqtl_cvrt_with_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.14.1_BLAS\bin\x64\Rscript.exe --vanilla test_cvrt_with.r > ..\1_Rqtl_cvrt_with_Japan.txt

call 1_cache_data.bat
..\..\R-2.14.1\bin\x64\Rscript.exe --vanilla test_cvrt_with.r > ..\1_Rqtl_cvrt_with_R_Regular.txt

cd ..

Code for model with no covariates (test_cvrt_no.r):

memory.limit(100000)
library(methods)
library(qtl)
ph = "C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/Rqtl/";
setwd(ph)

tic_load = proc.time()[3];
listeria.a3 <- read.cross(format = "csvsr",ph,"Testing_snps.csv", "Testing_genes.csv");
# cvrt = read.table('Testing_cvrt.txt');
listeria.a3 <- calc.genoprob(listeria.a3, step=0)
toc_load = proc.time()[3];
t1 = toc_load-tic_load;
# cat("Load data time: ",toc-tic," sec\n");


tic_eqtl = proc.time()[3];
z = scanone(listeria.a3, model="normal", method="hk", pheno.col=(1:(dim(listeria.a3$pheno)[2]-1)));
toc_eqtl = proc.time()[3];
t2 = toc_eqtl-tic_eqtl;
# cat("eQTL time: ",t2," sec\n");

show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

Code for model with 10 covariates (test_cvrt_with.r):

memory.limit(100000)
library(methods)
library(qtl)
ph = "C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/Rqtl/";
setwd(ph)

tic_load = proc.time()[3];
listeria.a3 <- read.cross(format = "csvsr",ph,"Testing_snps.csv", "Testing_genes.csv");
cvrt = read.table('Testing_cvrt.txt');
listeria.a3 <- calc.genoprob(listeria.a3, step=0)
toc_load = proc.time()[3];
t1 = toc_load-tic_load;
# cat("Load data time: ",toc-tic," sec\n");


tic_eqtl = proc.time()[3];
z = scanone(listeria.a3, model="normal", method="hk", addcovar=cvrt ,pheno.col=(1:(dim(listeria.a3$pheno)[2]-1)));
toc_eqtl = proc.time()[3];
t2 = toc_eqtl-tic_eqtl;
# cat("eQTL time: ",t2," sec\n");

show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

Plink

Batch file:

cls
cd Plink

call 1_cache_data.bat
timethis "plink.exe --file Testing --noweb                                                                                      --out C:\AllWorkFiles\local\Plink_out\Load.txt   >out\1_load_plink.txt"      >..\1_Plink_load_timing.txt

call 1_cache_data.bat
timethis "plink.exe --file Testing --pheno TestingPheno.txt --all-pheno --assoc                          --pfilter 1e-5 --noweb --out C:\AllWorkFiles\local\Plink_out\Assoc.txt  >out\1_cvrt_no_plink.txt"   >..\1_Plink_cvrt_no_timing.txt

call 1_cache_data.bat
timethis "plink.exe --file Testing --pheno TestingPheno.txt --all-pheno --linear --covar TestingCvrt.txt --pfilter 1e-5 --noweb --out C:\AllWorkFiles\local\Plink_out\Linear.txt >out\1_cvrt_with_plink.txt" >..\1_Plink_cvrt_with_timing.txt

cd ..

snpMatrix

Batch file:

cls
cd snpMatrix

call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla test_cvrt_no.r > ..\1_snpMatrix_cvrt_no_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.12.0_BLAS\bin\x64\Rscript.exe --vanilla test_cvrt_no.r > ..\1_snpMatrix_cvrt_no_Japan.txt

call 1_cache_data.bat
..\..\R-2.12.0\bin\x64\Rscript.exe      --vanilla test_cvrt_no.r > ..\1_snpMatrix_cvrt_no_R_Regular.txt


call 1_cache_data.bat
"C:\Revolution\R-Enterprise-4.3\R-2.12.2\bin\x64\Rscript.exe" --vanilla test_cvrt_with.r > ..\1_snpMatrix_cvrt_with_Rev_R.txt

call 1_cache_data.bat
..\..\R-2.12.0_BLAS\bin\x64\Rscript.exe --vanilla test_cvrt_with.r > ..\1_snpMatrix_cvrt_with_Japan.txt

call 1_cache_data.bat
..\..\R-2.12.0\bin\x64\Rscript.exe      --vanilla test_cvrt_with.r > ..\1_snpMatrix_cvrt_with_R_Regular.txt

cd ..

Code for model with no covariates (test_cvrt_no.r):

setwd("C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/snpMatrix/");

source("http://bioconductor.org/biocLite.R")
biocLite("snpMatrix")
library(methods);
library(snpMatrix);

tic_load = proc.time()[3];
snpmat = read.snps.pedfile("../Plink/Testing.ped");
pheno = read.table("../Plink/TestingPheno.txt");
pheno = as.matrix(pheno);
cvrt = read.table("../Plink/TestingCvrt.txt");
cvrt = as.matrix(cvrt );
cvrt = cvrt[ , 3:ncol(cvrt)];
toc_load = proc.time()[3];

snps1 = snpmat$snp.data;

tic_eqtl = proc.time()[3];
for( i in 1:(ncol(pheno)-2)) {
    d = pheno[ ,i+2];
    res1 = snpMatrix::single.snp.tests(phenotype = d, snp.data = snps1 )
}
toc_eqtl = proc.time()[3];
show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

Code for model with 10 covariates (test_cvrt_with.r):

setwd("C:/AllWorkFiles/aaa/Matrix_eQTL/The_formal_testing/MeQTL/snpMatrix/");

source("http://bioconductor.org/biocLite.R")
biocLite("snpMatrix")

library(methods);
library(snpMatrix);

tic_load = proc.time()[3];
snpmat = read.snps.pedfile("../Plink/Testing.ped");
pheno = read.table("../Plink/TestingPheno.txt");
pheno = as.matrix(pheno);
cvrt = read.table("../Plink/TestingCvrt.txt");
cvrt = as.matrix(cvrt );
cvrt = cvrt[ , 3:ncol(cvrt)];
toc_load = proc.time()[3];

snps1 = snpmat$snp.data;

tic_eqtl = proc.time()[3];
for( i in 1:(ncol(pheno)-2)) {
    d = pheno[ ,i+2];
    res3 = snpMatrix::snp.rhs.tests(d~cvrt, family="Gaussian", snp.data = snps1 )
}
toc_eqtl = proc.time()[3];
show(data.frame(load = toc_load-tic_load, eQTL = toc_eqtl-tic_eqtl))

Merlin

Batch file:

cls
cd Merlin

call 1_cache_data.bat
timethis "merlin -d Testing.dat -p Testing.ped -m Testing.map                                           >0_Merlin_load.txt"      >..\1_Merlin_load_timing.txt

call 1_cache_data.bat
timethis "merlin -d Testing.dat -p Testing.ped -m Testing.map --fastAssoc --filter 1e-5                 >0_Merlin_cvrt_no.txt"   >..\1_Merlin_cvrt_no_timing.txt

call 1_cache_data.bat
timethis "merlin -d Testing.dat -p Testing.ped -m Testing.map --fastAssoc --filter 1e-5 --useCovariates >0_Merlin_cvrt_with.txt" >..\1_Merlin_cvrt_with_timing.txt

cd ..


Back to Matrix eQTL page.

By Andrey Shabalin