File: misc.R

package info (click to toggle)
r-bioc-dada2 1.34.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,016 kB
  • sloc: cpp: 3,096; makefile: 5
file content (344 lines) | stat: -rw-r--r-- 12,995 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
################################################################################
#' Get the uniques-vector from the input object.
#' 
#' This function extracts the \code{\link{uniques-vector}} from several different data objects, 
#'  including \code{\link{dada-class}} and \code{\link{derep-class}} objects, as well as 
#'  \code{data.frame} objects that have both $sequence and $abundance columns.
#'  The return value is an integer vector named by sequence and valued by abundance. If the input is
#'  already in \code{\link{uniques-vector}} format, that same vector will be returned.
#' 
#' @param object (Required). The object from which to extract the \code{\link{uniques-vector}}.
#' 
#' @param collapse (Optional). Default TRUE.
#'  Should duplicate sequences detected in \code{object} be collapsed together, thereby
#'   imposing uniqueness on non-unique input.
#'  
#' @param silence (Optional). Default FALSE.
#'  Suppress reporting of the detection and merger of duplicated input sequences.
#' 
#' @return \code{integer}.
#'  An integer vector named by unique sequence and valued by abundance.
#' 
#' @importFrom methods is
#' 
#' @export
#' 
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1)
#' getUniques(derep1)[1:3]
#' getUniques(dada1)[1:3]
#' getUniques(dada1$clustering)[1:3]
#' 
getUniques <- function(object, collapse=TRUE, silence=FALSE) {
  if(is.character(object) && length(object)==1 && file.exists(object)) {
    unqs <- derepFastq(object)$uniques
  } else if(is.vector(object, "integer") && length(names(object)) != 0 && !any(is.na(names(object)))) { # Named integer vector already
    unqs <- object
  } else if(is(object, "dada")) {  # dada return 
    unqs <- object$denoised
  } else if(is(object, "derep")) {
    unqs <- object$uniques
  } else if(is.data.frame(object) && all(c("sequence", "abundance") %in% colnames(object))) {
    unqs <- as.integer(object$abundance)
    names(unqs) <- object$sequence
  } else if(is.matrix(object) && is.numeric(object) && !any(is.na(colnames(object)))) { # Tabled sequences
    unqs <- as.integer(colSums(object))
    names(unqs) <- colnames(object)
  }
  else {
    stop("Unrecognized format: Requires named integer vector, fastq filename, dada-class, derep-class, sequence matrix, or a data.frame with $sequence and $abundance columns.")
  }
  #### ENFORCE UNIQUENESS HERE!!!
  if(any(duplicated(names(unqs)))) {
    if(collapse) {
      unqs <- tapply(unqs, names(unqs), sum)
      if(!silence) message("Duplicate sequences detected and merged.")
    } else if(!silence) {
      message("Duplicate sequences detected.")
    }
  }
  return(unqs)
}

################################################################################
#' Get vector of sequences from input object.
#' 
#' This function extracts the sequences from several different data objects, including
#'  including \code{\link{dada-class}} and \code{\link{derep-class}} objects, as well as 
#'  \code{data.frame} objects that have both $sequence and $abundance columns. This function 
#'  wraps the \code{\link{getUniques}} function, but return only the names (i.e. the sequences).
#'  Can also be provided the file path to a fasta or fastq file, a taxonomy table, or a
#'  DNAStringSet object. Sequences are coerced to upper-case characters.
#' 
#' @param object (Required). The object from which to extract the sequences.
#' 
#' @param collapse (Optional). Default FALSE.
#'  Should duplicate sequences detected in \code{object} be collapsed together, thereby
#'  imposing uniqueness on non-unique input.
#'  
#' @param silence (Optional). Default TRUE.
#'  Suppress reporting of the detection and merger of duplicated input sequences.
#' 
#' @return \code{character}. A character vector of the sequences.
#' 
#' @importFrom methods is
#' @importFrom methods as
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead readFastq
#' @importFrom ShortRead sread
#' @importFrom ShortRead id
#' 
#' @export
#' 
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1)
#' getSequences(derep1)[1:5]
#' getSequences(dada1)[1:5]
#' getSequences(dada1$clustering)[1:5]
#' 
getSequences <- function(object, collapse=FALSE, silence=TRUE) {
  if(is(object, "character")) {
    if(length(object)==1 && file.exists(object)) {
      sr <- tryCatch(readFasta(object), error=function(err) { readFastq(object) })
      seqs <- toupper(as.character(sread(sr)))
      names(seqs) <- id(sr)
      rval <- seqs
    } else if(collapse) {
      if(any(duplicated(object)) && !silence) message("Duplicate sequences detected and merged.")
      rval <- unique(object)
    } else {
      rval <- object
    }
  } else if(is(object, "DNAStringSet")) {
    rval <- as.character(object)
  } else if(is.matrix(object) && is.character(object) && !any(is.na(rownames(object)))) { # Taxonomy table
    seqs <- rownames(object)
    if(any(duplicated(seqs))) {
      if(collapse) seqs <- unique(seqs)
      if(collapse && !silence) message("Duplicate sequences detected and merged.")
      if(!collapse && !silence) message("Duplicate sequences detected.")
    }
    rval <- seqs
  } else {
    rval <- names(getUniques(object, collapse=collapse, silence=silence))
  }
  return(toupper(rval))
}

getAbund <- function(object) {
  return(sum(getUniques(object)))
}

getNseq <- function(object) {
  return(length(getUniques(object)))
}

################################################################################
#' Needleman-Wunsch alignment.
#' 
#' This function performs a Needleman-Wunsch alignment between two sequences.
#' 
#' @param s1 (Required). \code{character(1)}. The first sequence to align. A/C/G/T only.
#' 
#' @param s2 (Required). \code{character(1)}. The second sequence to align. A/C/G/T only.
#' 
#' @param match (Optional). \code{numeric(1)}. Default is getDadaOpt("MATCH").
#'  The score of a match in the alignment.
#' 
#' @param mismatch (Optional). \code{numeric(1)}. Default is getDadaOpt("MISMATCH").
#'  The score of a mismatch in the alignment.
#' 
#' @param gap (Optional). \code{numeric(1)}. Default is getDadaOpt("GAP_PENALTY").
#'  The alignment gap penalty. Should be negative.
#'  
#' @param homo_gap (Optional). \code{numeric(1)}. Default NULL (no special homopolymer penalty).
#'  The alignment gap penalty within homopolymer regions. Should be negative.
#'  
#' @param band (Optional). \code{numeric(1)}.  Default -1 (no banding).
#'  The Needleman-Wunsch alignment can be banded. This value specifies the radius of that band.
#'  Set \code{band = -1} to turn off banding.
#'  
#' @param endsfree (Optional). \code{logical(1)}. Default TRUE.
#'  Allow unpenalized gaps at the ends of the alignment.
#'  
#' @param vec (Optional). \code{logical(1)}. Default FALSE.
#'  Use DADA2's vectorized aligner instead of standard DP matrix. Not intended for long sequences (>1kb).
#'  
#' @return \code{character(2)}. The aligned sequences.
#' 
#' @export
#' 
#' @examples
#'  sq1 <- "CTAATACATGCAAGTCGAGCGAGTCTGCCTTGAAGATCGGAGTGCTTGCACTCTGTGAAACAAGATA"
#'  sq2 <- "TTAACACATGCAAGTCGAACGGAAAGGCCAGTGCTTGCACTGGTACTCGAGTGGCGAACGGGTGAGT"
#'  nwalign(sq1, sq2)
#'  nwalign(sq1, sq2, band=16)
#' 
nwalign <- function(s1, s2, match=getDadaOpt("MATCH"), mismatch=getDadaOpt("MISMATCH"), gap=getDadaOpt("GAP_PENALTY"), homo_gap=NULL, band=-1, endsfree=TRUE, vec=FALSE) {
  if(!is.character(s1) || !is.character(s2)) stop("Can only align character sequences.")
  if(is.null(homo_gap)) { homo_gap <- gap }
  if(vec) {
    if(homo_gap != gap) stop("Homopolymer gap penalties are not implemented in the vectorized aligner.")
    return(C_nwvec(s1, s2, match, mismatch, gap, band, endsfree))
  } else {
    if(!C_isACGT(s1) || !C_isACGT(s2)) {
      stop("Sequences must contain only A/C/G/T characters.")
    }
    return(C_nwalign(s1, s2, match, mismatch, gap, homo_gap, band, endsfree))
  }
}

################################################################################
#' Hamming distance after Needlman-Wunsch alignment.
#' 
#' This function performs a Needleman-Wunsch alignment between two sequences, and then counts
#' the number of mismatches and indels in that alignment. End gaps are not included in this count.
#' 
#' @param s1 (Required). \code{character(1)}. The first sequence to align. A/C/G/T only.
#' 
#' @param s2 (Required). \code{character(1)}. The second sequence to align. A/C/G/T only.
#' 
#' @param ... (Optional). Further arguments to pass on to \code{\link{nwalign}}.
#' 
#' @return \code{integer(1)}. The total number of mismatches and gaps, excluding gaps at the beginning
#'  and end of the alignment.
#' 
#' @export
#' 
#' @examples
#'  sq1 <- "CTAATACATGCAAGTCGAGCGAGTCTGCCTTGAAGATCGGAGTGCTTGCACTCTGTGAAACAAGATA"
#'  sq2 <- "TTAACACATGCAAGTCGAACGGAAAGGCCAGTGCTTGCACTGGTACTCGAGTGGCGAACGGGTGAGT"
#' nwhamming(sq1, sq2)
#' nwhamming(sq1, sq2, band=16)
#' 
nwhamming <- Vectorize(function(s1, s2, ...) {
  al <- nwalign(s1, s2, ...)
  out <- C_eval_pair(al[1], al[2])
  return(unname(out["mismatch"]+out["indel"]))
}, USE.NAMES=FALSE)

nweval <- Vectorize(function(s1, s2, ...) {
  al <- nwalign(s1, s2, ...)
  C_eval_pair(al[1], al[2])
}, USE.NAMES=FALSE)

nwextract <- function(query, ref, ...) {
  al <- nwalign(query, ref, ...)
  ntq <- gregexpr("[ACGT]", al[[1]])
  rval <- substr(al[[2]], min(ntq[[1]]), max(ntq[[1]]))
  rval <- gsub("-", "", rval)
  rval
}

strdiff <- function(s1, s2) {
  xx = unlist(strsplit(s1,""))
  yy = unlist(strsplit(s2,""))
  dd <- which(xx != yy)
  data.frame(pos=dd,nt0=xx[dd],nt1=yy[dd])
}

################################################################################
#' Reverse complement DNA sequences.
#' 
#' This function reverse complements DNA sequence(s) provided.
#' This function is nothing more than a concisely-named convenience wrapper for 
#' \code{\link[Biostrings]{reverseComplement}} that handles the \code{character} vector
#' DNA sequences generated in the the dada2 package.
#' 
#' @param sq (Required). \code{character}. The DNA sequence(s) to reverse-complement.
#'  \code{\link[Biostrings]{DNAString}}, or \code{\link[Biostrings]{DNAStringSet}} formats
#'  are also accepted.
#'  
#' @return \code{character}. The reverse-complemented DNA sequence(s).
#' 
#' @seealso 
#'  \code{\link[Biostrings]{reverseComplement}}
#'  
#' @importFrom Biostrings DNAString
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings reverseComplement
#' @importFrom methods as
#' 
#' @export
#' 
#' @examples
#' R1492 <- "RGYTACCTTGTTACGACTT"
#' rc(R1492)
#' sqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' rc(sqs)
#' 
rc <- function(sq) {
  if(length(sq) < 1) {
    return(character(0))
  } else if(length(sq) == 1) {
    as(reverseComplement(DNAString(sq)), "character")
  } else {
    as(reverseComplement(DNAStringSet(sq)), "character")
  }
}

checkConvergence <- function(dadaO) {
  sapply(dadaO$err_in, function(x) sum(abs(dadaO$err_out-x)))
}

pfasta <- function(seqs, ids=seq(length(seqs))) {
  seqs <- getSequences(seqs, collapse=FALSE)
  cat(paste(">", ids, "\n", seqs, sep="", collapse="\n"))
}

#' @importFrom methods is
#' @keywords internal
is.list.of <- function(x, ctype) {
  if(!is.list(x)) return(FALSE)
  else return(all(sapply(x, is, ctype)))
}

#' @importFrom utils write.table
#' @keywords internal
seqtab_to_qiime <- function(st, fout) {
  st <- t(st) # QIIME has OTUs as rows
  col.names <- colnames(st)
  col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
  write.table(st, fout, sep="\t",
              row.names=TRUE, col.names=col.names, quote=FALSE)
}

#' @importFrom utils write.table
#' @keywords internal
seqtab_to_mothur <- function(st, fout) {
  # mothur has OTUs as columns, and a couple required columns
  df.shared <- data.frame(label=rep("DADA2", nrow(st)), Group=rownames(st), numOtus=ncol(st))
  df.shared <- cbind(df.shared, st)
  write.table(df.shared, fout, row.names=FALSE, col.names=TRUE, quote=FALSE)
}

#' @importFrom utils write.table
#' @keywords internal
samdf_to_qiime2 <- function(df, fout) {
  col.names <- colnames(df)
  col.names[[1]] <- paste0("#SampleID\t", col.names[[1]])
  write.table(df, fout, sep="\t",
              row.names=TRUE, col.names=col.names, quote=FALSE)
}

#' @keywords internal
bs1ham <- function(dd, ham=1) {
  is.1ham <- which(dd$clustering$birth_ham %in% ham)
  dd$birth_subs[dd$birth_subs$clust %in% is.1ham,]
}

#' @keywords internal
getSRR <- Vectorize(function(run, outdir="sra", verbose=TRUE, ...) {
  if(!grepl("^SRR[0-9]{6,}$", run)) stop("Requires SRA Run accessions in format: SRR1234567")
  if(!dir.exists(outdir)) dir.create(outdir)
  loc <- paste0("ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/",
                substr(run, 1, 6), "/", run, "/")
  loc <- paste0(loc, run, ".sra")
  download.file(loc, file.path(outdir, paste0(run, ".sra")), ...)
  if(verbose) cat(run, "\n")
})