File: multiSample.R

package info (click to toggle)
r-bioc-dada2 1.34.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,016 kB
  • sloc: cpp: 3,096; makefile: 5
file content (364 lines) | stat: -rw-r--r-- 15,298 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
################################################################################
#' Construct a sample-by-sequence observation matrix.
#' 
#' This function constructs a sequence table (analogous to an OTU table) from
#' the provided list of samples.
#' 
#' @param samples (Required). A \code{list} of the samples to include in the sequence table. 
#' Samples can be provided in any format that can be processed by \code{\link{getUniques}}.
#' Sample names are propagated to the rownames of the sequence table.
#' 
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#' 
#' @return Named integer matrix.
#' A row for each sample, and a column for each unique sequence across all the samples.
#' Note that the columns are named by the sequence which can make display a little unwieldy.
#' 
#' @seealso \code{\link{dada}}, \code{\link{getUniques}}
#' @export
#' 
#' @importFrom methods is
#' 
#' @examples
#' derep1 <- derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' derep2 <- derepFastq(system.file("extdata", "sam2F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, tperr1)
#' dada2 <- dada(derep2, tperr1)
#' seqtab <- makeSequenceTable(list(sample1=dada1, sample2=dada2))
#' 
makeSequenceTable <- function(samples, orderBy = "abundance") {
  if(is(samples, "dada") || is(samples, "derep") || is(samples, "data.frame")) { samples <- list(samples) }
  if(!is.list(samples)) { stop("Requires a list of samples.") }
  unqs <- lapply(samples, getUniques)
  unqsqs <- unique(do.call(c, lapply(unqs, names)))
  rval <- matrix(0L, nrow=length(unqs), ncol=length(unqsqs))
  # Samples are rows, columns are sequences
  colnames(rval) <- unqsqs
  for(i in seq_along(unqs)) {
    rval[i, match(names(unqs[[i]]), colnames(rval))] <- unqs[[i]]
  }
  if(!is.null(names(unqs))) {
    rownames(rval) <- names(unqs)
  }
  # Order columns
  if(!is.null(orderBy)) {
    if(orderBy == "abundance") {
      rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE]
    } else if(orderBy == "nsamples") {
      rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE]
    }
  }
  
  return(rval)
}

################################################################################
#' Combine together sequences that are identical up to shifts and/or length.
#' 
#' This function takes as input a sequence table and returns a sequence table in which
#' any sequences that are identical up to shifts or length variation, i.e. that have
#' no mismatches or internal indels when aligned, are collapsed together. The most abundant
#' sequence is chosen as the representative of the collapsed sequences. This function can
#' be thought of as implementing greedy 100\% OTU clustering with end-gapping ignored.
#' 
#' @param seqtab (Required). A sample by sequence matrix, the return of \code{\link{makeSequenceTable}}.
#' 
#' @param minOverlap (Optional). \code{numeric(1)}. Default 20.
#' The minimum amount of overlap between sequences required to collapse them together.
#' 
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#' 
#' @param identicalOnly (Optional). \code{logical(1)}. Default FALSE.
#' If TRUE, only identical sequences (i.e. duplicates) are collapsed together.
#' 
#' @param vec (Optional). \code{logical(1)}. Default TRUE.
#' Use the vectorized aligner. Should be turned off if sequences exceed 2kb in length.
#' 
#' @param band 	(Optional). \code{numeric(1)}. Default -1 (no banding). The Needleman-Wunsch 
#' alignment can be banded. This value specifies the radius of that band. Set band = -1 
#' to turn off banding.
#' 
#' @param verbose (Optional). \code{logical(1)}. Default FALSE.
#' If TRUE, a summary of the function results are printed to standard output.
#' 
#' @return Named integer matrix.
#' A row for each sample, and a column for each collapsed sequence across all the samples.
#' Note that the columns are named by the sequence which can make display a little unwieldy.
#' Columns are in the same order (modulo the removed columns) as in the input matrix.
#' 
#' @seealso \code{\link{makeSequenceTable}}
#' @export
#' 
#' @examples
#' derep1 <- derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' derep2 <- derepFastq(system.file("extdata", "sam2F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, tperr1)
#' dada2 <- dada(derep2, tperr1)
#' seqtab <- makeSequenceTable(list(sample1=dada1, sample2=dada2))
#' collapseNoMismatch(seqtab)
#' 
collapseNoMismatch <- function(seqtab, minOverlap=20, orderBy="abundance", identicalOnly=FALSE, vec=TRUE, band=-1, verbose=FALSE) {
  # Collapse identical sequences (duplicates)
  dupes <- duplicated(colnames(seqtab))
  if(any(dupes)) { # Collapse duplicates first
    st <- seqtab[,!dupes,drop=FALSE] # Deduplicated matrix
    for(i in which(dupes)) {
      sq <- colnames(seqtab)[[i]]
      st[,sq] <- st[,sq] + seqtab[,i]
    }
    seqtab <- st # Use deduplicated sequence table going forward
  }
  if(identicalOnly) { return(seqtab) }
  
  # Collapse sequences with no mismatches
  unqs.srt <- sort(getUniques(seqtab), decreasing=TRUE)
  seqs <- names(unqs.srt) # The input sequences in order of decreasing total abundance
  seqs.out <- character(0) # The output sequences (after collapsing)
  # collapsed will be the output sequence table  
  collapsed <- matrix(0L, nrow=nrow(seqtab), ncol=ncol(seqtab))
  colnames(collapsed) <- colnames(seqtab) # Keep input ordering for output table
  rownames(collapsed) <- rownames(seqtab)
  for(query in seqs) {
    added=FALSE
    prefix <- substr(query, 1, minOverlap)
    for(ref in seqs.out) { # Loop over the reference sequences already added to output
      prefix.ref <- substr(ref, 1, minOverlap)
      # Prescreen to see if costly alignment worthwhile, this could all be moved C-side
      if(grepl(prefix, ref, fixed=TRUE) || grepl(prefix.ref, query, fixed=TRUE)) { 
        if(nwhamming(query,ref,vec=vec,band=band) == 0) {
          collapsed[,ref] <- collapsed[,ref] + seqtab[,query] 
          added=TRUE
          break
        }
      }
    } # for(ref in seqs.out)
    
    if(!added) {
      collapsed[,query] <- seqtab[,query]
      seqs.out <- c(seqs.out, query)
    }
  } # for(query in seqs)
  collapsed <- collapsed[,colnames(collapsed) %in% seqs.out,drop=FALSE]
  
  # Order columns
  if(!is.null(orderBy)) {
    if(orderBy == "abundance") {
      collapsed <- collapsed[,order(colSums(collapsed), decreasing=TRUE),drop=FALSE]
    } else if(orderBy == "nsamples") {
      collapsed <- collapsed[,order(colSums(collapsed>0), decreasing=TRUE),drop=FALSE]
    }
  }
  
  collapsed <- collapsed[,order(colSums(collapsed), decreasing=TRUE),drop=FALSE]
  
  if(verbose) message("Output ", ncol(collapsed), " collapsed sequences out of ", ncol(seqtab), " input sequences.")
  collapsed
}

# Combines a list of derep-class objects into one single derep object
#' @importFrom methods as
#' @keywords internal
combineDereps2 <- function(dereps) {
  if(is(dereps, "derep")) dereps <- list(dereps)
  if(!all(sapply(dereps, function(x) is(x,"derep")))) stop("Requires derep-class objects.")
  maxlen <- max(sapply(dereps, function(x) ncol(x$quals)))

  # Generate the unique sequences and make the output $uniques vector
  sqs.all <- unique(do.call(c, lapply(dereps, getSequences)))
  derepCounts <- integer(length=length(sqs.all))
  names(derepCounts) <- sqs.all
  
  # Make the output $qual matrix with the appropriate size and rownames
  derepQuals <- matrix(0.0, nrow=length(derepCounts), ncol=maxlen)
  rownames(derepQuals) <- sqs.all
  
  # Initialize the $map with appropriate length
  derepMap <- integer(length=sum(sapply(dereps, function(x) length(x$map))))
  
  start.map <- 1
  for(derep in dereps) {
    if(ncol(derep$quals)<maxlen) { derep$quals <- cbind(derep$quals, matrix(NA, nrow=nrow(derep$quals), ncol=(maxlen-ncol(derep$quals)))) }
    derepCounts[names(derep$uniques)] <- derepCounts[names(derep$uniques)] + derep$uniques
    derepQuals[rownames(derep$quals),] <- derepQuals[rownames(derep$quals),] + sweep(derep$quals, 1, derep$uniques, "*")
    map <- match(names(derep$uniques), names(derepCounts))
    derepMap[start.map:(start.map+length(derep$map)-1)] <- map[derep$map]
    start.map <- start.map + length(derep$map)
  }
  
  derepQuals <- sweep(derepQuals, 1, derepCounts, "/")
  
  # Order by decreasing abundance
  ord <- order(derepCounts, decreasing=TRUE)
  derepCounts <- derepCounts[ord]
  derepQuals <- derepQuals[ord,]
  derepMap <- match(seq(length(ord)), ord)[derepMap]
  
  rval <- list(uniques=derepCounts, quals=derepQuals, map=derepMap)
  rval <- as(rval, "derep")
  rval
}

# Check the validity of a putatitve sequence table object, provide useful diagnostic messages if invalid.
is.sequence.table <- function(tab, verbose=TRUE) {
  if(!(is.matrix(tab))) {
    if(verbose) warning("Not a matrix.")
    return(FALSE)
  }
  if(!(all(tab>=0))) {
    if(verbose) warning("Negative entries.")
    return(FALSE)
  }
  if(is.null(colnames(tab))) {
    if(verbose) warning("Column (sequence) names are missing.")
    return(FALSE)
  }
  if(is.null(rownames(tab))) {
    if(verbose) warning("Row (sample) names are missing.")
    return(FALSE)
  }
  if(!all(sapply(colnames(tab), nchar)>0)) {
    if(verbose) warning("Some column (sequence) names are blank.")
    return(FALSE)
  }
  if(!all(sapply(rownames(tab), nchar)>0)) {
    if(verbose) warning("Some row (sample) names are blank.")
    return(FALSE)
  }
  if(any(duplicated(colnames(tab)))) {
    if(verbose) warning("Duplicated column (sequences) names.")
  }
  if(any(duplicated(rownames(tab)))) {
    if(verbose) warning("Duplicated row (sample) names.")
  }
  return(TRUE)
}

################################################################################
#' Merge two or more sample-by-sequence observation matrices.
#' 
#' This function combines sequence tables together into one merged sequences table.
#' 
#' @param table1 (Optional, default=NULL). Named integer matrix. Rownames correspond to samples
#' and column names correspond to sequences. The output of \code{\link{makeSequenceTable}}.
#' 
#' @param table2 (Optional, default=NULL). Named integer matrix. Rownames correspond to samples
#' and column names correspond to sequences. The output of \code{\link{makeSequenceTable}}.
#' 
#' @param ... (Optional). Additional sequence tables.
#' 
#' @param tables (Optional, default=NULL). Either a list of sequence tables, or a list/vector of RDS filenames
#' corresponding to sequence tables. If provided, \code{table1}, \code{table2}, and any
#' additional arguments will be ignored.
#' 
#' @param repeats (Optional). Default "error".
#'  Specifies how merging should proceed in the presence of repeated sample names.
#'  Valid values: "error", "sum".
#'  If "sum", then samples with the same name are summed together in the merged table.
#' 
#' @param orderBy (Optional). \code{character(1)}. Default "abundance".
#' Specifies how the sequences (columns) of the returned table should be ordered (decreasing).
#' Valid values: "abundance", "nsamples", NULL.
#' 
#' @param tryRC (Optional). \code{logical(1)}. Default FALSE.
#' If tryRC=TRUE, sequences whose reverse complement matches an earlier sequence will be reverse-
#' complemented and merged together with that earlier sequence. This is most useful when different
#' runs sequenced the same gene region in different or mixed orientations. Note, this does not
#' guarantee consistent orientatation from e.g. 5' to 3' on the gene, it just ensures that identical
#' sequences in different orientations are merged.
#' 
#' @return Named integer matrix.
#' A row for each sample, and a column for each unique sequence across all the samples.
#' Note that the columns are named by the sequence which can make display unwieldy.
#' 
#' @seealso \code{\link{makeSequenceTable}}
#' @export
#' 
#' @examples
#' 
#' \dontrun{
#'   mergetab <- mergeSequenceTables(seqtab1, seqtab2, seqtab3) # unnamed arguments assumed to be sequence tables
#'   input_tables <- list(seqtab1, seqtab2, seqtab3)
#'   mergetab <- mergeSequenceTables(tables=input_tables) # list of sequence tables
#'   files <- c(file1, file2, file3)
#'   mergetab <- mergeSequenceTables(tables=files) # vector of filenames
#' }
#' 
mergeSequenceTables <- function( table1=NULL, table2=NULL, ..., tables=NULL, repeats="error", orderBy = "abundance", tryRC=FALSE) {
  # Convert to sequence tables if necessary
  if (is.null(tables) && (is.null(table1) || is.null(table2))) {
  	stop("Either 'tables' or 'table1' and 'table2' must be provided.")
  } else if (!is.null(tables)) {
		if (is.list(tables)) {
			if(all(sapply(tables, is.character))) {
				tables <- lapply(tables, readRDS)
			}
		} else if (is.character(tables)) {
			tables <- sapply(tables, readRDS)
		} else {
			stop("'tables' must be a list or vector.")
		}
	} else {
		tables <- list(table1, table2)
		tables <- c(tables, list(...))
	}
 
  # Validate tables
  if(length(tables)<2){
    stop("At least two sequence tables are expected")
  }
  tablesValid <- sapply(tables, is.sequence.table)
  if(!(all(tablesValid))) {
    errorMessage <- paste0(names(tables[which(!tablesValid)]), collapse=", ")
    if(nchar(errorMessage) > 0){
      errorMessage <- paste0(": ", errorMessage)
    }
    stop("Some sequence tables found invalid", errorMessage)
  }
  sample.names <- c(sapply(tables, rownames), recursive=TRUE)
  namesDuplicated <- duplicated(sample.names)
  if(any(namesDuplicated)) {
    if(repeats=="error") {
      stop("Duplicated sample names detected in the sequence table row names: ", paste0(unique(sample.names[which(namesDuplicated)]), collapse=", "))
    }
    else { 
      sample.names <- unique(sample.names)
      message("Duplicated sample names detected in the sequence table row names.")
    }
  }
  seqs <- unique(c(sapply(tables, colnames), recursive=TRUE))
  if(tryRC && length(seqs) > 1) {
    earlier.rc <- c(FALSE, sapply(seq(2, length(seqs)), function(i) rc(seqs[[i]]) %in% seqs[1:(i-1)]))
    rc.seqs <- seqs[earlier.rc]
    if(length(rc.seqs) > 0) {
      message("Reverse complemented sequences detected and re-oriented.")
      for(i in seq_along(tables)) {
        do.rc <- colnames(tables[[i]]) %in% rc.seqs
        if(any(do.rc)) {
          colnames(tables[[i]])[do.rc] <- rc(colnames(tables[[i]])[do.rc])
          tables[[i]] <- collapseNoMismatch(tables[[i]], identicalOnly=TRUE)
        }
      }
      seqs <- seqs[!seqs %in% rc.seqs]
    }
  }
  # Make merged table
  rval <- matrix(0L, nrow=length(sample.names), ncol=length(seqs))
  rownames(rval) <- sample.names
  colnames(rval) <- seqs
  for(tab in tables) {
    rval[rownames(tab), colnames(tab)] <- rval[rownames(tab), colnames(tab)] + tab
  }
  # Order columns
  if(!is.null(orderBy)) {
    if(orderBy == "abundance") {
      rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE]
    } else if(orderBy == "nsamples") {
      rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE]
    }
  }
  rval
}