After applying Matrix eQTL and recording the results for GTEx data we get 10 files.
Nine files named like eQTL_results_AL_Adipose_cis.txt
contain Matrix eQTL output for each respective tissue. The last file df.txt
keeps information about tissue names and the number of degrees of freedom for each tissue, such as:
Adipose | 73 |
---|---|
Artery | 91 |
Blood | 135 |
Heart | 62 |
Lung | 98 |
Muscle | 117 |
Nerve | 67 |
Skin | 75 |
Thyroid | 84 |
The code below takes these files and creates a big matrix of z-scores with each row containing informaiton for a particular gene-SNP pair and each column for a particular tissue. It prodices 3 files:
z-score.matrix.Rdata | — | R data file with the big matrix of z-scores |
---|---|---|
z-score.matrix.genes.txt | — | text file with gene names for the rows of the big matrix |
z-score.matrix.snps.txt | — | text file with SNP names for the rows of the big matrix |
### Read df.txt for the list of tissues
### and degrees of freedom of linear models
df = read.table("df.txt", stringsAsFactors=FALSE);
names(df) = c("tissue", "df");
show(df)
### List vector for storing Matrix eQTL results
big.list = vector("list", nrow(df));
### Store gene and SNP names from the first tissue
### for matching with other tissues
genes = NULL;
snps = NULL;
### colClasses for faster reading of Matrix eQTL output
cc.file = NA;
### Loop over tissues
for(t1 in 1:nrow(df) ) {
### Get tissue name
tissue = df$tissue[t1];
### Load Matrix eQTL output for the given tissue
start.time = proc.time()[3];
tbl = read.table(paste0("eQTL_results_AL_",tissue,"_cis.txt"),
header = T, stringsAsFactors=FALSE, colClasses=cc.file);
end.time = proc.time()[3];
cat(tissue, "loaded in", end.time - start.time, "sec.", nrow(tbl), "gene-SNP pairs.", "\n");
### set colClasses for faster loading of other results
if(any(is.na(cc.file))) {
cc.file = sapply(tbl, class);
}
### Set gene and SNP names for matching
if(is.null(snps))
snps = unique(tbl$SNP);
if(is.null(genes))
genes = unique(tbl$gene);
### Match gene and SNP names from Matrix eQTL output
### to "snps" and "genes"
gpos = match(tbl$gene, genes, nomatch=0L);
spos = match(tbl$SNP, snps, nomatch=0L);
### Assign each gene-SNP pair a unique id for later matching with other tissues
id = gpos + 2*spos*length(genes);
### Transform t-statistics into correlations
r = tbl$t.stat / sqrt(df$df[t1] + tbl$t.stat^2);
### Record id's and correlations
big.list[[t1]] = list(id = id, r = r);
### A bit of clean up to reduce memory requirements
rm(tbl, gpos, spos, r, id, tissue, start.time, end.time);
gc();
}
rm(t1, cc.file);
### Find the set of gene-SNP pairs
### present in results for all tissues
keep = rep(TRUE, length(big.list[[1]]$id));
for(t1 in 2:nrow(df)) {
mch = match(big.list[[1]]$id, big.list[[t1]]$id, nomatch=0L);
keep[ mch == 0] = FALSE;
cat(df$tissue[t1], ", overlap size", sum(keep), "\n");
}
final.ids = big.list[[1]]$id[keep];
rm(keep, mch, t1);
### Create and fill in the matrix of z-scores
### Z-scores are calculated from correlations
big.matrix = matrix(NA_real_, nrow=length(final.ids), ncol=nrow(df));
fisher.transform = function(r){0.5*log((1+r)/(1-r))};
for(t1 in 1:nrow(df)) {
mch = match(final.ids, big.list[[t1]]$id);
big.matrix[,t1] = fisher.transform(big.list[[t1]]$r[mch]) * sqrt(df$df[t1]-1);
cat(t1,"\n");
}
stopifnot( !any(is.na(big.matrix)) )
rm(t1, mch);
### Save the big matrix
save(list="big.matrix", file="z-score.matrix.Rdata", compress=FALSE);
### Save gene names and SNP names for rows of big matrix
writeLines(text=genes[final.ids %% (length(genes)*2)], con="z-score.matrix.genes.txt")
writeLines(text=snps[final.ids %/% (length(genes)*2)], con="z-score.matrix.snps.txt")