1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
|
################################################################################
#' Construct a sample-by-sequence observation matrix.
#'
#' This function constructs a sequence table (analogous to an OTU table) from
#' the provided list of samples.
#'
#' @param samples (Required). A \code{list} of the samples to include in the sequence table.
#' Samples can be provided in any format that can be processed by \code{\link{getUniques}}.
#' Sample names are propagated to the rownames of the sequence table.
#'
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#'
#' @return Named integer matrix.
#' A row for each sample, and a column for each unique sequence across all the samples.
#' Note that the columns are named by the sequence which can make display a little unwieldy.
#'
#' @seealso \code{\link{dada}}, \code{\link{getUniques}}
#' @export
#'
#' @importFrom methods is
#'
#' @examples
#' derep1 <- derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' derep2 <- derepFastq(system.file("extdata", "sam2F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, tperr1)
#' dada2 <- dada(derep2, tperr1)
#' seqtab <- makeSequenceTable(list(sample1=dada1, sample2=dada2))
#'
makeSequenceTable <- function(samples, orderBy = "abundance") {
if(is(samples, "dada") || is(samples, "derep") || is(samples, "data.frame")) { samples <- list(samples) }
if(!is.list(samples)) { stop("Requires a list of samples.") }
unqs <- lapply(samples, getUniques)
unqsqs <- unique(do.call(c, lapply(unqs, names)))
rval <- matrix(0L, nrow=length(unqs), ncol=length(unqsqs))
# Samples are rows, columns are sequences
colnames(rval) <- unqsqs
for(i in seq_along(unqs)) {
rval[i, match(names(unqs[[i]]), colnames(rval))] <- unqs[[i]]
}
if(!is.null(names(unqs))) {
rownames(rval) <- names(unqs)
}
# Order columns
if(!is.null(orderBy)) {
if(orderBy == "abundance") {
rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE]
} else if(orderBy == "nsamples") {
rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE]
}
}
return(rval)
}
################################################################################
#' Combine together sequences that are identical up to shifts and/or length.
#'
#' This function takes as input a sequence table and returns a sequence table in which
#' any sequences that are identical up to shifts or length variation, i.e. that have
#' no mismatches or internal indels when aligned, are collapsed together. The most abundant
#' sequence is chosen as the representative of the collapsed sequences. This function can
#' be thought of as implementing greedy 100\% OTU clustering with end-gapping ignored.
#'
#' @param seqtab (Required). A sample by sequence matrix, the return of \code{\link{makeSequenceTable}}.
#'
#' @param minOverlap (Optional). \code{numeric(1)}. Default 20.
#' The minimum amount of overlap between sequences required to collapse them together.
#'
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#'
#' @param identicalOnly (Optional). \code{logical(1)}. Default FALSE.
#' If TRUE, only identical sequences (i.e. duplicates) are collapsed together.
#'
#' @param vec (Optional). \code{logical(1)}. Default TRUE.
#' Use the vectorized aligner. Should be turned off if sequences exceed 2kb in length.
#'
#' @param band (Optional). \code{numeric(1)}. Default -1 (no banding). The Needleman-Wunsch
#' alignment can be banded. This value specifies the radius of that band. Set band = -1
#' to turn off banding.
#'
#' @param verbose (Optional). \code{logical(1)}. Default FALSE.
#' If TRUE, a summary of the function results are printed to standard output.
#'
#' @return Named integer matrix.
#' A row for each sample, and a column for each collapsed sequence across all the samples.
#' Note that the columns are named by the sequence which can make display a little unwieldy.
#' Columns are in the same order (modulo the removed columns) as in the input matrix.
#'
#' @seealso \code{\link{makeSequenceTable}}
#' @export
#'
#' @examples
#' derep1 <- derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' derep2 <- derepFastq(system.file("extdata", "sam2F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, tperr1)
#' dada2 <- dada(derep2, tperr1)
#' seqtab <- makeSequenceTable(list(sample1=dada1, sample2=dada2))
#' collapseNoMismatch(seqtab)
#'
collapseNoMismatch <- function(seqtab, minOverlap=20, orderBy="abundance", identicalOnly=FALSE, vec=TRUE, band=-1, verbose=FALSE) {
# Collapse identical sequences (duplicates)
dupes <- duplicated(colnames(seqtab))
if(any(dupes)) { # Collapse duplicates first
st <- seqtab[,!dupes,drop=FALSE] # Deduplicated matrix
for(i in which(dupes)) {
sq <- colnames(seqtab)[[i]]
st[,sq] <- st[,sq] + seqtab[,i]
}
seqtab <- st # Use deduplicated sequence table going forward
}
if(identicalOnly) { return(seqtab) }
# Collapse sequences with no mismatches
unqs.srt <- sort(getUniques(seqtab), decreasing=TRUE)
seqs <- names(unqs.srt) # The input sequences in order of decreasing total abundance
seqs.out <- character(0) # The output sequences (after collapsing)
# collapsed will be the output sequence table
collapsed <- matrix(0L, nrow=nrow(seqtab), ncol=ncol(seqtab))
colnames(collapsed) <- colnames(seqtab) # Keep input ordering for output table
rownames(collapsed) <- rownames(seqtab)
for(query in seqs) {
added=FALSE
prefix <- substr(query, 1, minOverlap)
for(ref in seqs.out) { # Loop over the reference sequences already added to output
prefix.ref <- substr(ref, 1, minOverlap)
# Prescreen to see if costly alignment worthwhile, this could all be moved C-side
if(grepl(prefix, ref, fixed=TRUE) || grepl(prefix.ref, query, fixed=TRUE)) {
if(nwhamming(query,ref,vec=vec,band=band) == 0) {
collapsed[,ref] <- collapsed[,ref] + seqtab[,query]
added=TRUE
break
}
}
} # for(ref in seqs.out)
if(!added) {
collapsed[,query] <- seqtab[,query]
seqs.out <- c(seqs.out, query)
}
} # for(query in seqs)
collapsed <- collapsed[,colnames(collapsed) %in% seqs.out,drop=FALSE]
# Order columns
if(!is.null(orderBy)) {
if(orderBy == "abundance") {
collapsed <- collapsed[,order(colSums(collapsed), decreasing=TRUE),drop=FALSE]
} else if(orderBy == "nsamples") {
collapsed <- collapsed[,order(colSums(collapsed>0), decreasing=TRUE),drop=FALSE]
}
}
collapsed <- collapsed[,order(colSums(collapsed), decreasing=TRUE),drop=FALSE]
if(verbose) message("Output ", ncol(collapsed), " collapsed sequences out of ", ncol(seqtab), " input sequences.")
collapsed
}
# Combines a list of derep-class objects into one single derep object
#' @importFrom methods as
#' @keywords internal
combineDereps2 <- function(dereps) {
if(is(dereps, "derep")) dereps <- list(dereps)
if(!all(sapply(dereps, function(x) is(x,"derep")))) stop("Requires derep-class objects.")
maxlen <- max(sapply(dereps, function(x) ncol(x$quals)))
# Generate the unique sequences and make the output $uniques vector
sqs.all <- unique(do.call(c, lapply(dereps, getSequences)))
derepCounts <- integer(length=length(sqs.all))
names(derepCounts) <- sqs.all
# Make the output $qual matrix with the appropriate size and rownames
derepQuals <- matrix(0.0, nrow=length(derepCounts), ncol=maxlen)
rownames(derepQuals) <- sqs.all
# Initialize the $map with appropriate length
derepMap <- integer(length=sum(sapply(dereps, function(x) length(x$map))))
start.map <- 1
for(derep in dereps) {
if(ncol(derep$quals)<maxlen) { derep$quals <- cbind(derep$quals, matrix(NA, nrow=nrow(derep$quals), ncol=(maxlen-ncol(derep$quals)))) }
derepCounts[names(derep$uniques)] <- derepCounts[names(derep$uniques)] + derep$uniques
derepQuals[rownames(derep$quals),] <- derepQuals[rownames(derep$quals),] + sweep(derep$quals, 1, derep$uniques, "*")
map <- match(names(derep$uniques), names(derepCounts))
derepMap[start.map:(start.map+length(derep$map)-1)] <- map[derep$map]
start.map <- start.map + length(derep$map)
}
derepQuals <- sweep(derepQuals, 1, derepCounts, "/")
# Order by decreasing abundance
ord <- order(derepCounts, decreasing=TRUE)
derepCounts <- derepCounts[ord]
derepQuals <- derepQuals[ord,]
derepMap <- match(seq(length(ord)), ord)[derepMap]
rval <- list(uniques=derepCounts, quals=derepQuals, map=derepMap)
rval <- as(rval, "derep")
rval
}
# Check the validity of a putatitve sequence table object, provide useful diagnostic messages if invalid.
is.sequence.table <- function(tab, verbose=TRUE) {
if(!(is.matrix(tab))) {
if(verbose) warning("Not a matrix.")
return(FALSE)
}
if(!(all(tab>=0))) {
if(verbose) warning("Negative entries.")
return(FALSE)
}
if(is.null(colnames(tab))) {
if(verbose) warning("Column (sequence) names are missing.")
return(FALSE)
}
if(is.null(rownames(tab))) {
if(verbose) warning("Row (sample) names are missing.")
return(FALSE)
}
if(!all(sapply(colnames(tab), nchar)>0)) {
if(verbose) warning("Some column (sequence) names are blank.")
return(FALSE)
}
if(!all(sapply(rownames(tab), nchar)>0)) {
if(verbose) warning("Some row (sample) names are blank.")
return(FALSE)
}
if(any(duplicated(colnames(tab)))) {
if(verbose) warning("Duplicated column (sequences) names.")
}
if(any(duplicated(rownames(tab)))) {
if(verbose) warning("Duplicated row (sample) names.")
}
return(TRUE)
}
################################################################################
#' Merge two or more sample-by-sequence observation matrices.
#'
#' This function combines sequence tables together into one merged sequences table.
#'
#' @param table1 (Optional, default=NULL). Named integer matrix. Rownames correspond to samples
#' and column names correspond to sequences. The output of \code{\link{makeSequenceTable}}.
#'
#' @param table2 (Optional, default=NULL). Named integer matrix. Rownames correspond to samples
#' and column names correspond to sequences. The output of \code{\link{makeSequenceTable}}.
#'
#' @param ... (Optional). Additional sequence tables.
#'
#' @param tables (Optional, default=NULL). Either a list of sequence tables, or a list/vector of RDS filenames
#' corresponding to sequence tables. If provided, \code{table1}, \code{table2}, and any
#' additional arguments will be ignored.
#'
#' @param repeats (Optional). Default "error".
#' Specifies how merging should proceed in the presence of repeated sample names.
#' Valid values: "error", "sum".
#' If "sum", then samples with the same name are summed together in the merged table.
#'
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#'
#' @param tryRC (Optional). \code{logical(1)}. Default FALSE.
#' If tryRC=TRUE, sequences whose reverse complement matches an earlier sequence will be reverse-
#' complemented and merged together with that earlier sequence. This is most useful when different
#' runs sequenced the same gene region in different or mixed orientations. Note, this does not
#' guarantee consistent orientatation from e.g. 5' to 3' on the gene, it just ensures that identical
#' sequences in different orientations are merged.
#'
#' @return Named integer matrix.
#' A row for each sample, and a column for each unique sequence across all the samples.
#' Note that the columns are named by the sequence which can make display unwieldy.
#'
#' @seealso \code{\link{makeSequenceTable}}
#' @export
#'
#' @examples
#'
#' \dontrun{
#' mergetab <- mergeSequenceTables(seqtab1, seqtab2, seqtab3) # unnamed arguments assumed to be sequence tables
#' input_tables <- list(seqtab1, seqtab2, seqtab3)
#' mergetab <- mergeSequenceTables(tables=input_tables) # list of sequence tables
#' files <- c(file1, file2, file3)
#' mergetab <- mergeSequenceTables(tables=files) # vector of filenames
#' }
#'
mergeSequenceTables <- function( table1=NULL, table2=NULL, ..., tables=NULL, repeats="error", orderBy = "abundance", tryRC=FALSE) {
# Convert to sequence tables if necessary
if (is.null(tables) && (is.null(table1) || is.null(table2))) {
stop("Either 'tables' or 'table1' and 'table2' must be provided.")
} else if (!is.null(tables)) {
if (is.list(tables)) {
if(all(sapply(tables, is.character))) {
tables <- lapply(tables, readRDS)
}
} else if (is.character(tables)) {
tables <- sapply(tables, readRDS)
} else {
stop("'tables' must be a list or vector.")
}
} else {
tables <- list(table1, table2)
tables <- c(tables, list(...))
}
# Validate tables
if(length(tables)<2){
stop("At least two sequence tables are expected")
}
tablesValid <- sapply(tables, is.sequence.table)
if(!(all(tablesValid))) {
errorMessage <- paste0(names(tables[which(!tablesValid)]), collapse=", ")
if(nchar(errorMessage) > 0){
errorMessage <- paste0(": ", errorMessage)
}
stop("Some sequence tables found invalid", errorMessage)
}
sample.names <- c(sapply(tables, rownames), recursive=TRUE)
namesDuplicated <- duplicated(sample.names)
if(any(namesDuplicated)) {
if(repeats=="error") {
stop("Duplicated sample names detected in the sequence table row names: ", paste0(unique(sample.names[which(namesDuplicated)]), collapse=", "))
}
else {
sample.names <- unique(sample.names)
message("Duplicated sample names detected in the sequence table row names.")
}
}
seqs <- unique(c(sapply(tables, colnames), recursive=TRUE))
if(tryRC && length(seqs) > 1) {
earlier.rc <- c(FALSE, sapply(seq(2, length(seqs)), function(i) rc(seqs[[i]]) %in% seqs[1:(i-1)]))
rc.seqs <- seqs[earlier.rc]
if(length(rc.seqs) > 0) {
message("Reverse complemented sequences detected and re-oriented.")
for(i in seq_along(tables)) {
do.rc <- colnames(tables[[i]]) %in% rc.seqs
if(any(do.rc)) {
colnames(tables[[i]])[do.rc] <- rc(colnames(tables[[i]])[do.rc])
tables[[i]] <- collapseNoMismatch(tables[[i]], identicalOnly=TRUE)
}
}
seqs <- seqs[!seqs %in% rc.seqs]
}
}
# Make merged table
rval <- matrix(0L, nrow=length(sample.names), ncol=length(seqs))
rownames(rval) <- sample.names
colnames(rval) <- seqs
for(tab in tables) {
rval[rownames(tab), colnames(tab)] <- rval[rownames(tab), colnames(tab)] + tab
}
# Order columns
if(!is.null(orderBy)) {
if(orderBy == "abundance") {
rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE]
} else if(orderBy == "nsamples") {
rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE]
}
}
rval
}
|