% \VignetteIndexEntry{rxSeq manual}
% \VignetteDepends{rxSeq}
% \VignetteKeywords{Expression Analysis of the reciprocal cross in RNA-seq data}
% \VignettePackage{rxSeq}

\documentclass[11pt, a4paper]{article}
 \setlength{\topmargin}{-0.2in}
 \setlength{\oddsidemargin}{0.05 in}
 \setlength{\textwidth}{6in}
 \setlength{\textheight}{9in}
 \headsep=0in
 \oddsidemargin=0in \evensidemargin=0in


\title{Reciprocal Cross in RNA-seq}

\author{Vasyl Zhabotynsky \thanks{vasyl@unc.edu} \and Wei Sun \and Fei Zou}


\begin{document}

\maketitle 

\section{Overview}
\label{sec:over}
This vignette describes how to use \texttt{R/rxSeq} to perform an analysis on 
RNA-seq data 
from F1 reciprocal crosses.
<<initialize, results=hide>>=
library(rxSeq)
@
<<echo=FALSE, results=hide>>=
options(width = 80)
@


\section{Introduction}
\label{sec:intro}
RNA sequencing (RNA-seq) not only measures total gene expression but may also measure allele-specific gene expression in 
diploid individuals. RNA-seq data collected from F1 reciprocal crosses in mouse can powerfully dissect strain and 
parent-of-origin effects on allelic imbalance of gene expression. This R package, rxSeq, implements a novel 
statistical approach for RNA-seq data from F1 and inbred strains. Zou {\em et al.} (2013) \cite{Zou13}

\section{Citing \texttt{R/rxSeq}}
When using the results from the \texttt{R/rxSeq} package, please cite:
\begin{quote}
Zou F {\em et al.} (2013) `RNA-seq analysis for F1 reciprocal crosses', {\em submitted}.
\end{quote}
The article describes the methodological framework behind the \texttt{R/rxSeq} package.


\section{rxSeq implementation and output}
\label{sec:imp}
\subsection{Fitting the data}
\subsubsection{Joint model (TReCASE model) for total read counts (TReC) and allele specific expression (ASE) counts}
%The package \texttt{R/rxSeq} was developed for analyzing RNA-seq data from F1 reciprocal mouse crosses (2013)\cite{Crowley13}). 
The TReC and ASE can be produced using the R package \texttt{R/asSeq}\cite{asSeq} developed by our group. A detailed pipeline 
of producing gene-level (or transcript-level) allele specific counts can be found in the asSeq document.


~\\
For autosomal genes, the TReCASE model requires the following input data:

\begin{itemize}
\item(1) a vector \textbf{index}, classifying each mouse into a cross type: for one sex or female mice  AB=1,BA=2,AA=3,BB=4, and for male AB=5,BA=6,AA=7,BB=8
\item(2) matrix of total counts \textbf{y} with columns representing mice, and rows - genes, F1s in first columns
\item(3) matrix of allele specific counts for both alleles \textbf{n} (note, that the columns for these mice should match columns of F1 mice for total read counts)
\item(4) matrix of allele specific counts for allele allele B \textbf{n0B}
\item(5) a vector of log(total read counts for each mouse) - \textbf{kappas}. If not provided, the given set of total read counts \textbf{y} will be used to estimate it.
\item(6) a vector of gene IDs \textbf{geneid}, if not calculated, row names are used, if they are NULL, 1:nrow(\textbf{y}) will be substituted.
\item(7) \textbf{hessian} - a logical value requesting to calculate a Hessian matrix. The default value is FALSE. It is not needed for the basic analysis, however, if a subset of genes of special interest is identified, this option can be switched to TRUE, and in addition to the regular output an extra item will be added: a list of Hessian matrices for each gene.
\end{itemize}


<<>>=
#fit trecase autosome genes:
trecase.A.out<-proc.trecase.A(index=data.A$index,kappas=data.A$kappas,
               y=data.A$y[1:2,],n=data.A$n[1:2,],n0B=data.A$n0B[1:2,],
               geneids=data.A$geneids[1:2])
@



~\\
The following command runs the TReCASE model for Chromsome X genes, which requires two additional parameters for dealing with X-chromosome inactivations:
\begin{itemize}
\item(8) a vector of \textbf{tausB} - \textit{Xce} effect for a given cross (can be estimated using overall allele specific counts imbalance for an AB cross, or literature values could be used.)
\item(9) a vector \textbf{genes.switch} of geneids for which \textit{Xce} should be switched to $1-\textbf{tausB}$, for example \textit{Xist}. The default value is an Ensembl ID for a known gene with switched \textit{Xce} effect - \textit{Xist}: ENSMUSG00000086503. 
\end{itemize}

<<>>=
#fit trecase X chromosome genes:
trecase.X.out<-proc.trecase.X(index=data.X$index,kappas=data.X$kappas,
               tausB=data.X$tausB,y=data.X$y[1:2,],n=data.X$n[1:2,],
               n0B=data.X$n0B[1:2,],geneids=data.X$geneids[1:2])
@

These functions return the following outputs: parameter estimates from the full models and 
associated p-values, and all reduced short models, followed by the list of errors:
<<>>=
names(trecase.A.out)
trecase.A.out$pval[,1:2]
names(trecase.X.out)
trecase.X.out$pval[,1:2]
@



%For TReC only counts model the similar input and output are utilized.

\subsubsection{TReC model for TReC only}
The package also allows for fitting the data with only TReC when for a given gene, there is no enough SNP or indel information 
for estimating ASE. 

~\\
~\\
The following function fits the TReC model for autosomal genes:
<<>>=
#fit trec autosome genes
trec.A.out<-proc.trec.A(index=data.A$index,kappas=data.A$kappas,
            y=data.A$y[1:2,],geneids=data.A$geneids[1:2])
names(trec.A.out)
trec.A.out$pval[,1:2]
@

   ~\\
The following function fits the TReC model for Chromosome X genes:
<<>>=
#fit trec X chromosome genes
trec.X.out<-proc.trec.X(index=data.X$index,kappas=data.X$kappas,
            tausB=data.X$tausB,y=data.X$y[1:2,],
            geneids=data.X$geneids[1:2])
names(trec.X.out)
trec.X.out$pval[,1:2]
@

\subsection{Estimating \textit{Xce} effect for X chromosome}
Both proc.trecase.X and proc.trec.X require an estimate of the \textit{Xce} effect.
In the above examples, we used an estimated value from the data in Crowley (2013) \cite{Crowley13}. 


~\\
The following function estimates the $Xce$ effect for any given data:

<<>>=
get.tausB(n=data.X$n,n0B=data.X$n0B,geneids=data.X$geneids,
          Xist.ID="ENSMUSG00000086503")
@

For genes that are known to escape X inactivation or have different \textit{Xce} control effects, adjusted analysis can be done 
provided their ids are given. A default gene - \textit{Xist} which is known to have an opposite inactivation 
pattern with the other X chromosome genes, we set its estimate to $1-\textit{Xce}$.
We may also exclude genes with too low ASE (which is set to 50 by default) and/or with too low proportion of one of the alleles. 
The default value for the latter is set to 0.05 to avoid fully imprinted genes.
%Note, that even though we used only 8 randomly chosen genes from the set, the resulting \textbf{tausB} are very similar to the full 
%set estimate:

<<>>=
data.X$tausB
@

The first row of the \textbf{get.tausB} output provides a medain estimate of the \textit{Xce} effect 
and the second row provides an average estimate of \textit{Xce} effect. The
two estimates are expected to be close, though median would be more stable.

<<>>=
get.tausB(n=data.X$n,n0B=data.X$n0B,geneids=data.X$geneids,Xist.ID = "")
@

\section{Simulations}
\label{sec:sim}
RNA-seq data can be simulated using function \textbf{simRX} which requires the following input variables:
\begin{itemize}
\item(1)\textbf{b0f} - a female additive strain effect
\item(2)\textbf{b0m} - a male additive strain effect 
\item(3)\textbf{b1f} - a female parent of origin effect
\item(4)\textbf{b1m} - a male parent of origin effect 
\item(5)\textbf{beta\_sex} - a sex effect
\item(6)\textbf{beta\_dom} - a dominance effect
\item(7)\textbf{beta\_k} - an effect associated with the library size kappas
\item(8)\textbf{phi} - a Negative-Binomial overdispersion, default value is 1
\item(9)\textbf{theta} - a Beta-Binomial overdispersion, default value is 1
\item(10)\textbf{n} - a vector defining number of mice in each cross, default value is 6
\item(11)\textbf{mean.base.cnt} - a target expected number of counts for the base group (with no effects), default value is 50
\item(12)\textbf{range.base.cnt} - a range in which the expected number of counts  for the base group will vary, default value is 60
\item(13)\textbf{perc.ase} - percent of TReC that are allele-specific, default value is 35\%
\item(14)\textbf{n.simu} - a number of simulations, default value is 1E4
\item(15)\textbf{is.X} - a flag for X chromosome genes (TRUE), default value is FALSE
\item(16)\textbf{tauB} - a value describing allelic imbalance - \textit{Xce} effect, default value is NULL, i.e. 0.5.
\item(17)\textbf{seed} - a random seed, not set by default.
\end{itemize}

It produces three data matrices:
\begin{itemize}
  \item(1)\textbf{y} - TReC
  \item(2)\textbf{n} - total ASE
  \item(3)\textbf{n0B} - allele specific counts associated with allele B
\end{itemize}

<<>>=
dat.A<-simRX(b0f=.5,b0m=.6,b1f=.3,b1m=.4,beta_sex=.1,beta_dom=.1,n.simu=1E1)
names(dat.A)
dat.X<-simRX(b0f=.5,b0m=.6,b1f=.3,b1m=.4,beta_sex=.1,beta_dom=.1,n.simu=1E1,
is.X=TRUE,tauB=.3)
names(dat.X)
@


\section{References}
\label{sec:ref}
\begin{thebibliography}{}

\bibitem{asSeq}
Wei Sun, Vasyl Zhabotynsky (2013) {asSeq: A set of tools for the study of allele-specific RNA-seq data}. {\em http://www.bios.unc.edu/~weisun/software/asSeq.pdf, to be provided on Bioconductor}.
\bibitem{Crowley13}
Crowley, J. J.,  Zhabotynsky, V., Sun, W.,  Huang, S., Pakatci, I. K.,  Kim, Y., Wang, J. R.,  Morgan, A., P., Calaway, 
J. D., Aylor, D. L., Yun, Z., Bell, T. A.,  Buus, R. J., Calaway, M. E., Didion, J. P., Gooch, T. J.,  Hansen, S. D.,  
Robinson, N. N., Shaw, G. D., Spence, J. S.,  Quackenbush, C. R., Barrick, C. J., Xie, Y., Valdar, W.,  Lenarcic, A. B., 
Wang, W., Welsh, C. E., Fu, C. P., Zhang, Z., Holt, J., Guo, Z.,  Threadgill, D. W., Tarantino, L. M., Miller, D., R., Zou, F., 
McMillan, L.,  Sullivan, P. F.,  and Pardo-Manuel de Villena, F. (2013), {Pervasive allelic imbalance revealed by allele-specific 
gene expression in highly divergent mouse crosses.}, {\em To be submitted.}
\bibitem{CC12}
Collaborative Cross Consortium (2012), {The Genome artichecture of the Collaborative Cross Mouse Genetic Reference Population}, 
{\em Genetics.} {\bf190}(2):389-401
\bibitem{Zou13}
Zou, F., Sun, W., Crowley, J. J., Zhabotynsky, V., Sullivan, P. F., and Pardo Manuel de  Villena, F. (2013) (2013) 
{RNA-seq analysis for F1 reciprocal crosses.}, {\em Submitted.}.
\end{thebibliography}

\end{document}
cument}