1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
|
\name{samr}
\alias{samr}
\title{Significance analysis of microarrays}
\description{
Correlates a large number of features (eg genes) with an outcome
variable, such as a group indicator, quantitative variable or survival time.
NOTE: for most users, the interface function SAM--- which calls samr-- will be more
convenient for array data, and the interface function SAMseq-- which also calls samr--
will be more convenient for sequencing data.
}
\usage{
samr(data, resp.type=c("Quantitative","Two class unpaired",
"Survival","Multiclass", "One class", "Two class paired",
"Two class unpaired timecourse", "One class timecourse",
"Two class paired timecourse", "Pattern discovery"),
assay.type=c("array","seq"), s0=NULL, s0.perc=NULL, nperms=100,
center.arrays=FALSE, testStatistic=c("standard","wilcoxon"),
time.summary.type=c("slope","signed.area"),
regression.method=c("standard","ranks"), return.x=FALSE,
knn.neighbors=10, random.seed=NULL, nresamp=20,nresamp.perm=NULL,
xl.mode=c("regular","firsttime","next20","lasttime"),
xl.time=NULL, xl.prevfit=NULL)
}
\arguments{
\item{data}{Data object with components x- p by n matrix of features,
one observation per column (missing values allowed); y- n-vector of outcome measurements;
censoring.status- n-vector of censoring censoring.status (1= died or event occurred, 0=survived, or event was censored), needed
for a censored survival outcome}
\item{resp.type}{Problem type:
"Quantitative" for a continuous parameter (Available for both array and sequencing data);
"Two class unpaired" (for both array and sequencing data);
"Survival" for censored survival outcome (for both array and sequencing data);
"Multiclass": more than 2 groups (for both array and sequencing data);
"One class" for a single group (only for array data);
"Two class paired" for two classes with paired observations (for both array and sequencing data);
"Two class unpaired timecourse" (only for array data),
"One class time course" (only for array data),
"Two class.paired timecourse" (only for array data),
or "Pattern discovery" (only for array data)}
\item{assay.type}{Assay type: "array" for microarray data, "seq" for counts from sequencing}
\item{s0}{Exchangeability factor for denominator of test statistic; Default
is automatic choice. Only used for array data.}
\item{s0.perc}{Percentile of standard deviation values to use for s0; default is
automatic choice; -1 means s0=0 (different from s0.perc=0, meaning
s0=zeroeth percentile of standard deviation values= min of sd values.
Only used for array data.}
\item{nperms}{Number of permutations used to estimate false discovery rates}
\item{center.arrays}{Should the data for each sample (array) be median centered
at the outset? Default =FALSE. Only used for array data.},
\item{testStatistic}{Test statistic to use in two class unpaired case.Either
"standard" (t-statistic) or ,"wilcoxon" (Two-sample wilcoxon or Mann-Whitney
test). Only used for array data.}
\item{time.summary.type}{Summary measure for each time course: "slope", or "signed.area"). Only used for array data.}
\item{regression.method}{Regression method for quantitative case:
"standard", (linear least squares) or "ranks" (linear least squares
on ranked data). Only used for array data.}
\item{return.x}{Should the matrix of feature values be returned?
Only useful for time course data, where x contains summaries of the features
over time. Otherwise x is the same as the input data data\$x}
\item{knn.neighbors}{Number of nearest neighbors to use for imputation
of missing features values. Only used for array data.}
\item{random.seed}{Optional initial seed for random number generator (integer)}
\item{nresamp}{For assay.type="seq", number of resamples used to construct test statistic. Default 20. Only used for sequencing data.}
\item{nresamp.perm}{For assay.type="seq", number of resamples used to construct test statistic for permutations. Default is equal to nresamp and it must be at most nresamp. Only used for sequencing data.}
\item{xl.mode}{Used by Excel interface}
\item{xl.time}{Used by Excel interface}
\item{xl.prevfit}{Used by Excel interface}
}
\details{Carries out a SAM analysis. Applicable to microarray data,
sequencing data,
and other data with a large number of features. This is the R package
that is called by the "official" SAM Excel package v2.0.
The format of the response vector y and the calling sequence
is illustrated in the examples below. A more complete description
is given in the SAM manual
at http://www-stat.stanford.edu/~tibs/SAM}
\value{
A list with components
\item{n}{Number of observations}
\item{x}{Data matrix p by n (p=\# genes or features). Equal to the matrix data\$x in the original call to samr except
for (1) time course analysis, where is contains the summarized data
or (2) quantitative outcome with rank regression, where it contains
the data transformed to ranks. Hence it is null except for in time course analysis. }
\item{y}{Vector of n outcome values. equal the values data\$y in the original call to samr, except
for (1) time course analysis, where is contains the summarized y
or (2) quantitative outcome with rank regression, where it contains
the y values transformed to ranks}
\item{argy}{The values data\$y in the original call to samr}
\item{censoring.status}{Censoring status indicators if applicable}
\item{testStatistic}{Test Statistic used},
\item{nperms}{Number of permutations requested}
\item{nperms.act}{Number of permutations actually used. Will be <
nperms when \# of possible permutations <= nperms (in which case
all permutations are done)}
\item{tt}{tt=numer/sd, the vector of p test statistics for original data}
\item{numer}{Numerators for tt}
\item{sd}{Denominators for tt. Equal to standard deviation for feature plus s0}
\item{s0}{Computed exchangeability factor}
\item{s0.perc}{Computed percentile of standard deviation values.
s0= s0.perc percentile of the gene standard deviations}
\item{eva}{p-vector of expected values for tt under permutation sampling}
\item{perms}{nperms.act by n matrix of permutations used. Each row is
a permutation of 1,2...n}
\item{permsy}{nperms.act by n matrix of permutations used. Each row is
a permutation of y1,y2,...yn. Only one of perms or permys is non-Null, depending on resp.type}
\item{all.perms.flag}{Were all possible permutations used?}
\item{ttstar}{p by nperms.aca matrix t of test statistics from permuted data. Each column if sorted in descending order}
\item{ttstar0}{p by nperms.act matrix of test statistics from permuted data. Columns are in order of data}
\item{eigengene.number}{The number of the eigengene (eg 1,2,..) that was requested for Pattern discovery}
\item{eigengene}{Computed eigengene}
\item{pi0}{Estimated proportion of non-null features (genes)}
\item{foldchange}{p-vector of foldchanges for original data}
\item{foldchange.star}{p by nperms.act matrix estimated foldchanges from permuted data}
\item{sdstar.keep}{n by nperms.act matrix of standard deviations
from each permutation}
\item{censoring.status.star.keep}{n by nperms.act matrix of
censoring.status indicators from each permutation}
\item{resp.type}{The response type used. Same as resp.type.arg, except for
time course data, where time data is summarized and then treated as non-time course. Eg if resp.type.arg="oneclass.timecourse" then resp.type="oneclass"}
\item{resp.type.arg}{The response type requested in the call to samr}
\item{stand.contrasts}{For multiclass data, p by nclass matrix of standardized differences
between the class mean and the overall mean}
\item{stand.contrasts.star}{For multiclass data, p by nclass by nperms.act array of standardized contrasts for permuted datasets}
\item{stand.contrasts.95}{For multiclass data, 2.5% and 97.5% percentiles
of standardized contrasts. Useful for determining which class contrast
for significant genes, are large}
\item{depth}{For array.type="seq", estimated sequencing depth for each sample.}
\item{call}{calling sequence}
}
\references{Tusher, V., Tibshirani, R. and Chu, G. (2001):
Significance analysis of microarrays applied to the ionizing radiation response PNAS 2001 98: 5116-5121, (Apr 24).
http://www-stat.stanford.edu/~tibs/SAM
Li, Jun and Tibshirani, R. (2011). Finding consistent patterns: a nonparametric
approach for identifying differential expression in
RNA-Seq data. To appear, Statistical Methods in Medical Research.
}
\author{Jun Li and Balasubrimanian Narasimhan and Robert Tibshirani}
\examples{
######### two class unpaired comparison
# y must take values 1,2
set.seed(100)
x<-matrix(rnorm(1000*20),ncol=20)
dd<-sample(1:1000,size=100)
u<-matrix(2*rnorm(100),ncol=10,nrow=100)
x[dd,11:20]<-x[dd,11:20]+u
y<-c(rep(1,10),rep(2,10))
data=list(x=x,y=y, geneid=as.character(1:nrow(x)),
genenames=paste("g",as.character(1:nrow(x)),sep=""), logged2=TRUE)
samr.obj<-samr(data, resp.type="Two class unpaired", nperms=100)
delta=.4
samr.plot(samr.obj,delta)
delta.table <- samr.compute.delta.table(samr.obj)
siggenes.table<-samr.compute.siggenes.table(samr.obj,delta, data, delta.table)
# sequence data
set.seed(3)
x<-abs(100*matrix(rnorm(1000*20),ncol=20))
x=trunc(x)
y<- c(rep(1,10),rep(2,10))
x[1:50,y==2]=x[1:50,y==2]+50
data=list(x=x,y=y, geneid=as.character(1:nrow(x)),
genenames=paste("g",as.character(1:nrow(x)),sep=""))
samr.obj<-samr(data, resp.type="Two class unpaired",assay.type="seq", nperms=100)
delta=5
samr.plot(samr.obj,delta)
delta.table <- samr.compute.delta.table(samr.obj)
siggenes.table<-samr.compute.siggenes.table(samr.obj,delta, data, delta.table)
########### two class paired
# y must take values -1, 1, -2,2 etc, with (-k,k) being a pair
set.seed(100)
x<-matrix(rnorm(1000*20),ncol=20)
dd<-sample(1:1000,size=100)
u<-matrix(2*rnorm(100),ncol=10,nrow=100)
x[dd,11:20]<-x[dd,11:20]+u
y=c(-(1:10),1:10)
d=list(x=x,y=y, geneid=as.character(1:nrow(x)),
genenames=paste("g",as.character(1:nrow(x)),sep=""), logged2=TRUE)
samr.obj<-samr(d, resp.type="Two class paired", nperms=100)
#############quantitative response
# y must take numeric values
set.seed(84048)
x=matrix(rnorm(1000*9),ncol=9)
mu=c(3,2,1,0,0,0,1,2,3)
b=runif(100)+.5
x[1:100,]=x[1:100,]+ b%*%t(mu)
y=mu
d=list(x=x,y=y,
geneid=as.character(1:nrow(x)),genenames=paste("gene", as.character(1:nrow(x))))
samr.obj =samr(d, resp.type="Quantitative", nperms=50)
########### oneclass
# y is a vector of ones
set.seed(100)
x<-matrix(rnorm(1000*20),ncol=20)
dd<-sample(1:1000,size=100)
u<-matrix(2*rnorm(100),ncol=10,nrow=100)
x[dd,11:20]<-x[dd,11:20]+u
y<-c(rep(1,20))
data=list(x=x,y=y, geneid=as.character(1:nrow(x)),
genenames=paste("g",as.character(1:nrow(x)),sep=""), logged2=TRUE)
samr.obj<-samr(data, resp.type="One class", nperms=100)
###########survival data
# y is numeric; censoring.status=1 for failures, and 0 for censored
set.seed(84048)
x=matrix(rnorm(1000*50),ncol=50)
x[1:50,26:50]= x[1:50,26:50]+2
x[51:100,26:50]= x[51:100,26:50]-2
y=abs(rnorm(50))
y[26:50]=y[26:50]+2
censoring.status=sample(c(0,1),size=50,replace=TRUE)
d=list(x=x,y=y,censoring.status=censoring.status,
geneid=as.character(1:1000),genenames=paste("gene", as.character(1:1000)))
samr.obj=samr(d, resp.type="Survival", nperms=20)
################multi-class example
# y takes values 1,2,3,...k where k= number of classes
set.seed(84048)
x=matrix(rnorm(1000*10),ncol=10)
x[1:50,6:10]= x[1:50,6:10]+2
x[51:100,6:10]= x[51:100,6:10]-2
y=c(rep(1,3),rep(2,3),rep(3,4))
d=list(x=x,y=y,geneid=as.character(1:1000),
genenames=paste("gene", as.character(1:1000)))
samr.obj <- samr(d, resp.type="Multiclass")
#################### timecourse data
# elements of y are of the form kTimet where k is the class label and t
# is the time; in addition, the suffixes Start or End indicate the first
# and last observation in a given time course
# the class label can be that for a two class unpaired, one class or
# two class paired problem
set.seed(8332)
y=paste(c(rep(1,15),rep(2,15)),"Time",rep(c(1,2,3,4,5,1.1,2.5, 3.7, 4.1,5.5),3),
sep="")
start=c(1,6,11,16,21,26)
for(i in start){
y[i]=paste(y[i],"Start",sep="")
}
for(i in start+4){
y[i]=paste(y[i],"End",sep="")
}
x=matrix(rnorm(1000*30),ncol=30)
x[1:50,16:20]=x[1:50,16:20]+matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
x[1:50,21:25]=x[1:50,21:25]+matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
x[1:50,26:30]=x[1:50,26:30]+matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
x[51:100,16:20]=x[51:100,16:20]-matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
x[51:100,21:25]=x[51:100,21:25]-matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
x[51:100,26:30]=x[51:100,26:30]-matrix(3*c(0,1,2,3,4),ncol=5,nrow=50,byrow=TRUE)
data=list(x=x,y=y, geneid=as.character(1:nrow(x)),
genenames=paste("g",as.character(1:nrow(x)),sep=""), logged2=TRUE)
samr.obj<- samr(data, resp.type="Two class unpaired timecourse",
nperms=100, time.summary.type="slope")
##################### pattern discovery
# here there is no outcome y; the desired eigengene is indicated by
# the argument eigengene.numbe in the data object
set.seed(32)
x=matrix(rnorm(1000*9),ncol=9)
mu=c(3,2,1,0,0,0,1,2,3)
b=3*runif(100)+.5
x[1:100,]=x[1:100,]+ b%*%t(mu)
d=list(x=x,eigengene.number=1,
geneid=as.character(1:nrow(x)),genenames=paste("gene", as.character(1:nrow(x))))
samr.obj=samr(d, resp.type="Pattern discovery", nperms=50)
}
\keyword{univar}% at least one, from doc/KEYWORDS
\keyword{survival}
\keyword{ts}
\keyword{nonparametric}
|