File: sequenceIO.R

package info (click to toggle)
r-bioc-dada2 1.34.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,016 kB
  • sloc: cpp: 3,096; makefile: 5
file content (357 lines) | stat: -rw-r--r-- 14,248 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
################################################################################
#' Read in and dereplicate a fastq file.
#' 
#' A custom interface to \code{\link[ShortRead]{FastqStreamer}} 
#' for dereplicating amplicon sequences from fastq or compressed fastq files,
#' while also controlling peak memory requirement to support large files.
#'
#' @param fls (Required). \code{character}.
#'  The file path(s) to the fastq file(s), or a directory containing fastq file(s).
#'  Compressed file formats such as .fastq.gz and .fastq.bz2 are supported.
#' 
#' @param n (Optional). \code{numeric(1)}.
#'  The maximum number of records (reads) to parse and dereplicate
#'  at any one time. This controls the peak memory requirement
#'  so that large fastq files are supported.
#'  Default is \code{1e6}, one-million reads.
#'  See \code{\link[ShortRead]{FastqStreamer}} for details on this parameter,
#'  which is passed on.
#' 
#' @param qualityType (Optional). \code{character(1)}.
#'  The quality encoding of the fastq file(s). "Auto" (the default) means to 
#'  attempt to auto-detect the encoding. This may fail for PacBio files with
#'  uniformly high quality scores, in which case use "FastqQuality". This
#'  parameter is passed on to \code{\link[ShortRead]{readFastq}}; see
#'  information there for details.
#' 
#' @param verbose (Optional). Default FALSE.
#'  If TRUE, throw standard R \code{\link{message}}s 
#'  on the intermittent and final status of the dereplication.
#'
#' @return A \code{\link{derep-class}} object or list of such objects. 
#'
#' @export
#' @importFrom ShortRead FastqStreamer
#' @importFrom ShortRead yield
#' @importFrom methods as
#'
#' @examples 
#' # Test that chunk-size, `n`, does not affect the result.
#' testFastq = system.file("extdata", "sam1F.fastq.gz", package="dada2")
#' derep1 = derepFastq(testFastq, verbose = TRUE)
#' derep1.35 = derepFastq(testFastq, n = 35, verbose = TRUE)
#' all.equal(getUniques(derep1), getUniques(derep1.35)[names(getUniques(derep1))])
#' 
derepFastq <- function(fls, n = 1e6, verbose = FALSE, qualityType = "Auto") {
  if(!is.character(fls)) { stop("File paths must be provided in character format.") }
  if(length(fls)==1 && dir.exists(fls)) { fls <- parseFastqDirectory(fls) }
  if(!all(file.exists(fls))) { stop("Not all provided files exist.") }
  rval <- list()
  for(i in seq_along(fls)) {
    fl <- fls[[i]]
    if(verbose){
      message("Dereplicating sequence entries in Fastq file: ", fl, appendLF = TRUE)
    }
    
    f <- FastqStreamer(fl, n = n)
    suppressWarnings(fq <- yield(f, qualityType = qualityType))
    
    out <- qtables2(fq) ###ITS
    
    derepCounts <- out$uniques
    derepQuals <- out$cum_quals
    derepMap <- out$map
    while( length(suppressWarnings(fq <- yield(f, qualityType = qualityType))) ){
      # A little loop protection
      newniques = alreadySeen = NULL
      # Dot represents one turn inside the chunking loop.
      if(verbose){
        message(".", appendLF = FALSE)
      }
      out <- qtables2(fq)
      # Augment quality matrices with NAs as needed to match ncol
      if(ncol(out$cum_quals) > ncol(derepQuals)) {
        derepQuals <- cbind(derepQuals, matrix(NA, nrow=nrow(derepQuals), ncol=(ncol(out$cum_quals)-ncol(derepQuals))))
      } else if(ncol(out$cum_quals) < ncol(derepQuals)) {
        out$cum_quals <- cbind(out$cum_quals, matrix(NA, nrow=nrow(out$cum_quals), ncol=(ncol(derepQuals)-ncol(out$cum_quals))))
      }
      # identify sequences already present in `derepCounts`
      alreadySeen <- names(out$uniques) %in% names(derepCounts)
      # Sum these values, if any
      if(any(alreadySeen)){
        sqnms = names(out$uniques)[alreadySeen]
        derepCounts[sqnms] <- derepCounts[sqnms] + out$uniques[sqnms] ###ITS
        derepQuals[sqnms,] <- derepQuals[sqnms,] + out$cum_quals[sqnms,] ###ITS
      }
      # Concatenate the remainder to `derepCounts`, if any
      if(!all(alreadySeen)){
        derepCounts <- c(derepCounts, out$uniques[!alreadySeen])
        derepQuals <- rbind(derepQuals, out$cum_quals[!alreadySeen,,drop=FALSE])
      }
      new2old <- match(names(out$uniques), names(derepCounts)) # map from out$uniques index to derepCounts index
      if(any(is.na(new2old))) warning("Failed to properly extend uniques.")
      derepMap <- c(derepMap, new2old[out$map])
    }
    derepQuals <- derepQuals/derepCounts # Average
    ###  if(qeff) derepQuals <- -10*log10(derepQuals)  # Convert back to effective Q value
    # Sort by decreasing abundance
    ord <- order(derepCounts, decreasing=TRUE)
    derepCounts <- derepCounts[ord]
    derepQuals <- derepQuals[ord,,drop=FALSE]
    derepMap <- match(derepMap, ord)
    if(verbose){
      message("Encountered ",
              length(derepCounts),
              " unique sequences from ",
              sum(derepCounts),
              " total sequences read.")
    }
    close(f)
    derepO <- list(uniques=derepCounts, quals=derepQuals, map=derepMap)
    derepO <- as(derepO, "derep")
    rval[[i]] <- derepO
  }
  if(length(rval) == 1) {
    rval <- rval[[1]]
  } else {
    if(is.null(names(fls))) {
      names(rval) <- basename(fls)
    } else {
      names(rval) <- names(fls)
    }
  }
  return(rval)
}
###
#' Internal tables function
#' 
#' Internal function to replicate ShortRead::tables functionality while also returning average quals and a map
#' from reads to uniques
#' 
#' @param x ShortReadQ.
#'  The ShortReadQ-class object to table (or dereplicate).
#' 
#' @param qeff \code{logical(1)}.
#'  Calculate average quality by first transforming to expected error rate.
#' 
#' @param handle.zerolen \code{logical(1)}.
#' Default TRUE. If TRUE, gracefully excludes zero-length sequences.
#' 
#' @return List.
#'  Matches format of derep-class object.
#' 
#' @importFrom ShortRead srsort
#' @importFrom ShortRead srrank
#' @importFrom ShortRead sread
#' @importFrom methods as
#' 
#' @keywords internal
#' 
qtables2 <- function(x, qeff = FALSE, handle.zerolen=TRUE) {
  nread <- length(x)
  npos <- sum(width(x) > 0)
  if(npos == 0) { stop("Only zero-length sequences detected during dereplication.") }
  if(handle.zerolen && npos < nread) { # Some zero length sequences in input
    warning("Zero-length sequences detected during dereplication. They will be ignored.")
    is.pos <- width(x) > 0
    x <- x[is.pos]
  }
  # ranks are lexical rank
  srt <- srsort(x) # map from rank to sequence/quality/id
  rnk <- srrank(x) # map from read_i to rank (integer vec of ranks, ties take top rank)
  tab <- tabulate(rnk) # map from rank to abundance (much faster than table)
  
  srtrnk <- srrank(srt)
  unq <- unique(srtrnk)
  uniques <- tab[unq] # abundance of each rank (value)
  names(uniques) <- as.character(sread(srt)[unq]) # sequence of each unique (name)
  
  rnk2unqi <- rep(seq(length(uniques)), tab[tab>0]) # map from rank to uniques index
  map <- rnk2unqi[rnk] # map from read index to unique index
  
  if(handle.zerolen && npos < nread) { # Fix mapping to include NAs for the zero-length input reads
    foo <- map
    map <- rep(as.integer(NA), nread)
    map[is.pos] <- foo
  }
  # do matrices
  qmat <- as(quality(srt), "matrix") # map from read_i to quality
  if(qeff) qmat <- 10^(-qmat/10)  # Convert to nominal error probability first
  qmat <- rowsum(qmat, srtrnk, reorder=FALSE)
  rownames(qmat) <- names(uniques)
  list(uniques=uniques, cum_quals=qmat, map=map)
}

##########
#' Write a uniques vector to a FASTA file
#' 
#' A wrapper for writeFastq in the ShortRead package.
#' Default output format is compatible with uchime.
#' 
#' @param unqs (Required).
#'  A \code{\link{uniques-vector}} or any object that can be coerced
#'  into one with \code{\link{getUniques}}.
#'   
#' @param fout (Required).
#'  The file path of the output file.
#' 
#' @param ids (Optional). \code{character}. Default NULL.
#'  A vector of sequence ids, one for each element in \code{unqs}.
#'  If NULL, a uchime-compatible ID is assigned.
#'  
#' @param mode (Optional). Default "w".
#'  Passed on to \code{\link[ShortRead]{writeFasta}}
#'  indicating the type of file writing mode. Default is \code{"w"}.
#'  
#' @param width (Optional). Default 20000.
#'  The number of characters per line in the file. Default is effectively one line
#'  per sequence. Passed on to \code{\link[ShortRead]{writeFasta}}.
#'  
#' @param ... Additional parameters passed on to \code{\link[ShortRead]{writeFasta}}.
#' 
#' @return NULL.
#' 
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead ShortRead
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings BStringSet
#' @export
#' 
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' outfile <- tempfile(fileext=".fasta")
#' uniquesToFasta(derep1, outfile)
#' uniquesToFasta(derep1, outfile, ids=paste0("Sequence", seq(length(getSequences(derep1)))))
#' 
uniquesToFasta <- function(unqs, fout, ids=NULL, mode="w", width=20000, ...) {
  unqs <- getUniques(unqs, collapse=FALSE)
  if(is.null(ids)) {
    ids <- paste0("sq", seq(1, length(unqs)), ";size=", unname(unqs), ";")
  }
  writeFasta(object = ShortRead(sread = DNAStringSet(names(unqs)),
                                id = BStringSet(ids)),
             file = fout, 
             mode = mode, 
             width = width,
             ...)
}

######## DEREPFASTA DEREPFASTA DEREPFASTA ################
#' derepFasta creates a derep-class object from a fasta file, by
#' creating a corresponding fastq file with a uniform quality score
#' and calling derepFastq.
#' 
#' @param fls (Required). \code{character}.
#'  The file path(s) to the fasta or gzipped fasta file(s).
#' 
#' @param ... (Optional).
#'  Additional arguments passed on to \code{\link{derepFastq}}
#' 
#' @importFrom Biostrings readDNAStringSet
#' @importFrom Biostrings writeXStringSet
#' 
#' @keywords internal
#' 
derepFasta <- function(fls, ...){
  if(!is.character(fls)) {
    stop("Filenames must be provided in character format.")
  }
  fastqs <- character(length(fls))
  for(i in seq_along(fls)) {
    fastq <- tempfile()
    fl <- fls[[i]]
    foo <- readDNAStringSet(fl)
    writeXStringSet(foo, fastq, format="fastq")
    fastqs[[i]] <- fastq
  }
  
  derepFastq(fastqs, ...)
}

#' Writes a named character vector of DNA sequences to a fasta file.
#' Values are the sequences, and names are used for the id lines.
#'
#' @seealso \code{\link[Biostrings]{writeXStringSet}}
#'
#' @param object (Required). A named \code{character} vector.
#' @param file (Required). The output file.
#' @param mode (Optional). Default "w". Append with "a".
#' @param width (Optional). Default 20000L. Maximum line length before newline.
#' @param ... (Optional). Additonal arguments passed to \code{\link[Biostrings]{writeXStringSet}}.
#' @return NULL.
#' @rdname writeFasta
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings writeXStringSet
#' 
setMethod("writeFasta", "character", function(object, file, mode="w", width=20000L, ...){
  append = mode == "a"
  seqs <- DNAStringSet(object)
  if(is.null(names(seqs))) { names(seqs) <- as.character(seq(length(seqs))) }
  writeXStringSet(seqs, file, ..., append = append, width=width, format = "fasta")
})

################################################################################
## Get derep-class objects from the input object.
## 
## This function extracts a \code{\link{derep-class}} or list of \code{\link{derep-class}}
##  from either an input \code{\link{derep-class}} or list of \code{\link{derep-class}} objects,
##  or an input character vector of filenames to be dereplicated by \code{\link{derepFastq}}.
## 
## @param object (Required). The object from which to extract the \code{\link{derep-class}} object(s).
## 
## @param ... (Optional). Arguments passed to \code{\link{derepFastq}} if it is called.
## 
## @return A \code{\link{derep-class}} object or list of \code{\link{derep-class}} objects.
## 
## @examples
## fn <- system.file("extdata", "sam1F.fastq.gz", package="dada2")
## derep1 = derepFastq(fn)
## identical(getDerep(derep1), getDerep(fn))
## 
getDerep <- function(object, ...) {
  if(is(object, "derep")) { object }
  else if(is.list.of(object, "derep")) { object }
  else if(is(object, "character")) { derepFastq(object, ...) }
  else{
    stop("Unrecognized format: Requires derep-class object, list of derep-class objects, a directory containing fastq files, or a character vector of fastq files.")
  }
}

################################################################################
## Get file paths to the fastq files or compressed fastq files in a directory.
## 
## @param path (Required). The directory containing the (potentially compressed) fastq files.
## 
## @param pattern (Optional). Default c(".fastq.gz$", ".fastq.bz2$", ".fastq$").
##  The ordered list of filename patterns to search for in the directory, see \code{\link{list.files}}.
##  Patterns are searched for in order, and only files matching the first pattern for which any files
##  were detected are returned.
## 
## @return A \code{\link{character}} vector of file paths to the (potentially compressed) fastq files..
## 
parseFastqDirectory <- function(path, pattern=c(".fastq.gz$", ".fastq.bz2$", ".fastq$")) {
  # Validate inputs
  if(length(path)>1) stop("If providing a directory, only one file path can be provided.")
  if(!dir.exists(path)) stop("Provided path is not an existing directory.")
  if(!is.character(pattern)) stop("File name pattern(s) must be provided in character format.")
  # Initialize
  fn <- character(0)
  multi.ext <- FALSE
  # Read in filenames
  for(pat in pattern) {
    foo <- list.files(path, pattern=pat)
    if(length(foo) > 0) { # Files with this pattern detected
      if(length(fn)==0) { fn <- foo }
      else { multi.ext <- TRUE }
    }
  }
  # Validate results and provide error messaging
  if(length(fn) == 0) stop("No files found in this directory with the expected extension(s): ", paste(pattern, collapse=", "))
  if(multi.ext) {
    warning("Multiple fastq-type file extensions detected in this directory. Only those with extensions appearing earliest in
             the search list were kept: ", paste(pattern, collapse=", "))
  }
  # Return
  return(file.path(path, fn))
}