1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/core.R
\name{DESeq}
\alias{DESeq}
\title{Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution}
\usage{
DESeq(
object,
test = c("Wald", "LRT"),
fitType = c("parametric", "local", "mean", "glmGamPoi"),
sfType = c("ratio", "poscounts", "iterate"),
betaPrior,
full = design(object),
reduced,
quiet = FALSE,
minReplicatesForReplace = 7,
modelMatrixType,
useT = FALSE,
minmu = if (fitType == "glmGamPoi") 1e-06 else 0.5,
parallel = FALSE,
BPPARAM = bpparam()
)
}
\arguments{
\item{object}{a DESeqDataSet object, see the constructor functions
\code{\link{DESeqDataSet}},
\code{\link{DESeqDataSetFromMatrix}},
\code{\link{DESeqDataSetFromHTSeqCount}}.}
\item{test}{either "Wald" or "LRT", which will then use either
Wald significance tests (defined by \code{\link{nbinomWaldTest}}),
or the likelihood ratio test on the difference in deviance between a
full and reduced model formula (defined by \code{\link{nbinomLRT}})}
\item{fitType}{either "parametric", "local", "mean", or "glmGamPoi"
for the type of fitting of dispersions to the mean intensity.
See \code{\link{estimateDispersions}} for description.}
\item{sfType}{either "ratio", "poscounts", or "iterate"
for the type of size factor estimation. See
\code{\link{estimateSizeFactors}} for description.}
\item{betaPrior}{whether or not to put a zero-mean normal prior on
the non-intercept coefficients
See \code{\link{nbinomWaldTest}} for description of the calculation
of the beta prior. In versions \code{>=1.16}, the default is set
to \code{FALSE}, and shrunken LFCs are obtained afterwards using
\code{\link{lfcShrink}}.}
\item{full}{for \code{test="LRT"}, the full model formula,
which is restricted to the formula in \code{design(object)}.
alternatively, it can be a model matrix constructed by the user.
advanced use: specifying a model matrix for full and \code{test="Wald"}
is possible if \code{betaPrior=FALSE}}
\item{reduced}{for \code{test="LRT"}, a reduced formula to compare against,
i.e., the full formula with the term(s) of interest removed.
alternatively, it can be a model matrix constructed by the user}
\item{quiet}{whether to print messages at each step}
\item{minReplicatesForReplace}{the minimum number of replicates required
in order to use \code{\link{replaceOutliers}} on a
sample. If there are samples with so many replicates, the model will
be refit after these replacing outliers, flagged by Cook's distance.
Set to \code{Inf} in order to never replace outliers.
It set to \code{Inf} for \code{fitType="glmGamPoi"}.}
\item{modelMatrixType}{either "standard" or "expanded", which describe
how the model matrix, X of the GLM formula is formed.
"standard" is as created by \code{model.matrix} using the
design formula. "expanded" includes an indicator variable for each
level of factors in addition to an intercept. for more information
see the Description of \code{\link{nbinomWaldTest}}.
betaPrior must be set to TRUE in order for expanded model matrices
to be fit.}
\item{useT}{logical, passed to \code{\link{nbinomWaldTest}}, default is FALSE,
where Wald statistics are assumed to follow a standard Normal}
\item{minmu}{lower bound on the estimated count for fitting gene-wise dispersion
and for use with \code{nbinomWaldTest} and \code{nbinomLRT}.
If \code{fitType="glmGamPoi"}, then 1e-6 will be used
(as this fitType is optimized for single cell data, where a lower
minmu is recommended), otherwise the default value
as evaluated on bulk datasets is 0.5}
\item{parallel}{if FALSE, no parallelization. if TRUE, parallel
execution using \code{BiocParallel}, see next argument \code{BPPARAM}.
Two notes on running in parallel using \code{BiocParallel}:
1) it is recommended to filter out genes where all samples have
low counts before running DESeq2 in parellel: this improves efficiency
as otherwise you will be sending data to child processes, though those
have little power for detection of differences, and will likely be
removed by independent filtering anyway;
2) it may be
advantageous to remove large, unneeded objects from your current
R environment before calling \code{DESeq},
as it is possible that R's internal garbage collection
will copy these files while running on worker nodes.}
\item{BPPARAM}{an optional parameter object passed internally
to \code{\link{bplapply}} when \code{parallel=TRUE}.
If not specified, the parameters last registered with
\code{\link{register}} will be used.}
}
\value{
a \code{\link{DESeqDataSet}} object with results stored as
metadata columns. These results should accessed by calling the \code{\link{results}}
function. By default this will return the log2 fold changes and p-values for the last
variable in the design formula. See \code{\link{results}} for how to access results
for other variables.
}
\description{
This function performs a default analysis through the steps:
\enumerate{
\item estimation of size factors: \code{\link{estimateSizeFactors}}
\item estimation of dispersion: \code{\link{estimateDispersions}}
\item Negative Binomial GLM fitting and Wald statistics: \code{\link{nbinomWaldTest}}
}
For complete details on each step, see the manual pages of the respective
functions. After the \code{DESeq} function returns a DESeqDataSet object,
results tables (log2 fold changes and p-values) can be generated
using the \code{\link{results}} function.
Shrunken LFC can then be generated using the \code{\link{lfcShrink}} function.
All support questions should be posted to the Bioconductor
support site: \url{http://support.bioconductor.org}.
}
\details{
The differential expression analysis uses a generalized linear model of the form:
\deqn{ K_{ij} \sim \textrm{NB}( \mu_{ij}, \alpha_i) }{ K_ij ~ NB(mu_ij, alpha_i) }
\deqn{ \mu_{ij} = s_j q_{ij} }{ mu_ij = s_j q_ij }
\deqn{ \log_2(q_{ij}) = x_{j.} \beta_i }{ log2(q_ij) = x_j. beta_i }
where counts \eqn{K_{ij}}{K_ij} for gene i, sample j are modeled using
a Negative Binomial distribution with fitted mean \eqn{\mu_{ij}}{mu_ij}
and a gene-specific dispersion parameter \eqn{\alpha_i}{alpha_i}.
The fitted mean is composed of a sample-specific size factor
\eqn{s_j}{s_j} and a parameter \eqn{q_{ij}}{q_ij} proportional to the
expected true concentration of fragments for sample j.
The coefficients \eqn{\beta_i}{beta_i} give the log2 fold changes for gene i for each
column of the model matrix \eqn{X}{X}.
The sample-specific size factors can be replaced by
gene-specific normalization factors for each sample using
\code{\link{normalizationFactors}}.
For details on the fitting of the log2 fold changes and calculation of p-values,
see \code{\link{nbinomWaldTest}} if using \code{test="Wald"},
or \code{\link{nbinomLRT}} if using \code{test="LRT"}.
Experiments without replicates do not allow for estimation of the dispersion
of counts around the expected value for each group, which is critical for
differential expression analysis. Analysis without replicates was deprecated
in v1.20 and is no longer supported since v1.22.
The argument \code{minReplicatesForReplace} is used to decide which samples
are eligible for automatic replacement in the case of extreme Cook's distance.
By default, \code{DESeq} will replace outliers if the Cook's distance is
large for a sample which has 7 or more replicates (including itself).
Outlier replacement is turned off entirely for \code{fitType="glmGamPoi"}.
This replacement is performed by the \code{\link{replaceOutliers}}
function. This default behavior helps to prevent filtering genes
based on Cook's distance when there are many degrees of freedom.
See \code{\link{results}} for more information about filtering using
Cook's distance, and the 'Dealing with outliers' section of the vignette.
Unlike the behavior of \code{\link{replaceOutliers}}, here original counts are
kept in the matrix returned by \code{\link{counts}}, original Cook's
distances are kept in \code{assays(dds)[["cooks"]]}, and the replacement
counts used for fitting are kept in \code{assays(dds)[["replaceCounts"]]}.
Note that if a log2 fold change prior is used (betaPrior=TRUE)
then expanded model matrices will be used in fitting. These are
described in \code{\link{nbinomWaldTest}} and in the vignette. The
\code{contrast} argument of \code{\link{results}} should be used for
generating results tables.
}
\examples{
# see vignette for suggestions on generating
# count tables from RNA-Seq data
cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5), ncol=10)
cond <- factor(rep(1:2, each=5))
# object construction
dds <- DESeqDataSetFromMatrix(cnts, DataFrame(cond), ~ cond)
# standard analysis
dds <- DESeq(dds)
res <- results(dds)
# moderated log2 fold changes
resultsNames(dds)
resLFC <- lfcShrink(dds, coef=2, type="apeglm")
# an alternate analysis: likelihood ratio test
ddsLRT <- DESeq(dds, test="LRT", reduced= ~ 1)
resLRT <- results(ddsLRT)
}
\references{
Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15:550. \url{https://doi.org/10.1186/s13059-014-0550-8}
For \code{fitType="glmGamPoi"}:
Ahlmann-Eltze, C., Huber, W. (2020) glmGamPoi: Fitting Gamma-Poisson Generalized Linear Models on Single Cell Count Data. Bioinformatics. \url{https://doi.org/10.1093/bioinformatics/btaa1009}
}
\seealso{
\code{link{results}}, \code{\link{lfcShrink}}, \code{\link{nbinomWaldTest}}, \code{\link{nbinomLRT}}
}
\author{
Michael Love
}
|