1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/test_de.R
\name{test_de}
\alias{test_de}
\title{Test for Differential Expression}
\usage{
test_de(
fit,
contrast,
reduced_design = NULL,
full_design = fit$model_matrix,
subset_to = NULL,
pseudobulk_by = NULL,
pval_adjust_method = "BH",
sort_by = NULL,
decreasing = FALSE,
n_max = Inf,
verbose = FALSE
)
}
\arguments{
\item{fit}{object of class \code{glmGamPoi}. Usually the result of calling \code{glm_gp(data, ...)}}
\item{contrast}{The contrast to test. Can be a single column name (quoted or as a string)
that is removed from the full model matrix of \code{fit}. Or a complex contrast comparing two
or more columns: e.g. \code{A - B}, \code{"A - 3 * B"}, \code{(A + B) / 2 - C} etc. \cr
Only one of \code{contrast} or \code{reduced_design} must be specified.}
\item{reduced_design}{a specification of the reduced design used as a comparison to see what
how much better \code{fit} describes the data.
Analogous to the \code{design} parameter in \code{glm_gp()}, it can be either a \code{formula},
a \code{model.matrix()}, or a \code{vector}. \cr
Only one of \code{contrast} or \code{reduced_design} must be specified.}
\item{full_design}{option to specify an alternative \code{full_design} that can differ from
\code{fit$model_matrix}. Can be a \code{formula} or a \code{matrix}. Default: \code{fit$model_matrix}}
\item{subset_to}{a vector with the same length as \code{ncol(fit$data)} or an expression
that evaluates to such a vector. The expression can reference columns from \code{colData(fit$data)}.
A typical use case in single cell analysis would be to subset to a specific cell type
(e.g. \code{subset_to = cell_type == "T-cells"}). Note that if this argument is set a new
the model for the \code{full_design} is re-fit.\cr
Default: \code{NULL} which means that the data is not subset.}
\item{pseudobulk_by}{a vector with the same length as \code{ncol(fit$data)} that is used to
split the columns into different groups (calls \code{\link[=split]{split()}}). \code{pseudobulk_by} can also be an
expression that evaluates to a vector. The expression can reference columns from \code{colData(fit$data)}. \cr
The counts are summed across the groups
to create "pseudobulk" samples. This is typically used in single cell analysis if the cells come
from different samples to get a proper estimate of the differences. This is particularly powerful
in combination with the \code{subset_to} parameter to analyze differences between samples for
subgroups of cells. Note that this does a fresh fit for both the full and the reduced design.
Default: \code{NULL} which means that the data is not aggregated.}
\item{pval_adjust_method}{one of the p-value adjustment method from
\link{p.adjust.methods}. Default: \code{"BH"}.}
\item{sort_by}{the name of the column or an expression used to sort the result. If \code{sort_by} is \code{NULL}
the table is not sorted. Default: \code{NULL}}
\item{decreasing}{boolean to decide if the result is sorted increasing or decreasing
order. Default: \code{FALSE}.}
\item{n_max}{the maximum number of rows to return. Default: \code{Inf} which means that all
rows are returned}
\item{verbose}{a boolean that indicates if information about the individual steps are printed
while fitting the GLM. Default: \code{FALSE}.}
}
\value{
a \code{data.frame} with the following columns
\describe{
\item{name}{the rownames of the input data}
\item{pval}{the p-values of the quasi-likelihood ratio test}
\item{adj_pval}{the adjusted p-values returned from \code{\link[=p.adjust]{p.adjust()}}}
\item{f_statistic}{the F-statistic: \eqn{F = (Dev_full - Dev_red) / (df_1 * disp_ql-shrunken)}}
\item{df1}{the degrees of freedom of the test: \code{ncol(design) - ncol(reduced_design)}}
\item{df2}{the degrees of freedom of the fit: \code{ncol(data) - ncol(design) + df_0}}
\item{lfc}{the log2-fold change. If the alternative model is specified by \code{reduced_design} will
be \code{NA}.}
}
}
\description{
Conduct a quasi-likelihood ratio test for a Gamma-Poisson fit.
}
\examples{
Y <- matrix(rnbinom(n = 30 * 100, mu = 4, size = 0.3), nrow = 30, ncol = 100)
annot <- data.frame(sample = sample(LETTERS[1:6], size = 100, replace = TRUE),
cont1 = rnorm(100), cont2 = rnorm(100, mean = 30))
annot$condition <- ifelse(annot$sample \%in\% c("A", "B", "C"), "ctrl", "treated")
head(annot)
se <- SummarizedExperiment::SummarizedExperiment(Y, colData = annot)
fit <- glm_gp(se, design = ~ condition + cont1 + cont2)
# Test with reduced design
res <- test_de(fit, reduced_design = ~ condition + cont1)
head(res)
# Test with contrast argument, the results are identical
res2 <- test_de(fit, contrast = cont2)
head(res2)
# The column names of fit$Beta are valid variables in the contrast argument
colnames(fit$Beta)
# You can also have more complex contrasts:
# the following compares cont1 vs cont2:
test_de(fit, cont1 - cont2, n_max = 4)
# You can also sort the output
test_de(fit, cont1 - cont2, n_max = 4,
sort_by = "pval")
test_de(fit, cont1 - cont2, n_max = 4,
sort_by = - abs(f_statistic))
# If the data has multiple samples, it is a good
# idea to aggregate the cell counts by samples.
# This is called "pseudobulk".
test_de(fit, contrast = "conditiontreated", n_max = 4,
pseudobulk_by = sample)
# You can also do the pseudobulk only on a subset of cells:
cell_types <- sample(c("Tcell", "Bcell", "Makrophages"), size = 100, replace = TRUE)
test_de(fit, contrast = "conditiontreated", n_max = 4,
pseudobulk_by = sample,
subset_to = cell_types == "Bcell")
# Be care full, if you included the cell type information in
# the original fit, after subsetting the design matrix would
# be degenerate. To fix this, specify the full_design in 'test_de()'
SummarizedExperiment::colData(se)$ct <- cell_types
fit_with_celltype <- glm_gp(se, design = ~ condition + cont1 + cont2 + ct)
test_de(fit_with_celltype, contrast = cont1, n_max = 4,
full_design = ~ condition + cont1 + cont2,
pseudobulk_by = sample,
subset_to = ct == "Bcell")
}
\references{
\itemize{
\item Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
Applications in Genetics and Molecular Biology, 11(5).
\url{https://doi.org/10.1515/1544-6115.1826}.
}
}
\seealso{
\code{\link[=glm_gp]{glm_gp()}}
}
|