1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
|
#'
#' Classifies sequences against reference training dataset.
#'
#' assignTaxonomy implements the RDP Naive Bayesian Classifier algorithm described in
#' Wang et al. Applied and Environmental Microbiology 2007, with kmer size 8 and 100 bootstrap
#' replicates. Properly formatted reference files for several popular taxonomic databases
#' are available \url{http://benjjneb.github.io/dada2/training.html}
#'
#' @param seqs (Required). A character vector of the sequences to be assigned, or an object
#' coercible by \code{\link{getUniques}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' taxonomy (or classification) of the associated sequence, and each taxonomic level is
#' separated by a semicolon. Eg.
#'
#' >Kingom;Phylum;Class;Order;Family;Genus;
#' ACGAATGTGAAGTAA......
#'
#' @param minBoot (Optional). Default 50.
#' The minimum bootstrap confidence for assigning a taxonomic level.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will be used for classification if it is a better match to the reference
#' sequences than the forward sequence.
#'
#' @param outputBootstraps (Optional). Default FALSE.
#' If TRUE, bootstrap values will be retained in an integer matrix. A named list containing the assigned taxonomies (named "taxa")
#' and the bootstrap values (named "boot") will be returned. Minimum bootstrap confidence filtering still takes place,
#' to see full taxonomy set minBoot=0
#'
#' @param taxLevels (Optional). Default is c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species").
#' The taxonomic levels being assigned. Truncates if deeper levels not present in
#' training fasta.
#'
#' @param multithread (Optional). Default is FALSE.
#' If TRUE, multithreading is enabled and the number of available threads is automatically determined.
#' If an integer is provided, the number of threads to use is set by passing the argument on to
#' \code{\link{setThreadOptions}}.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A character matrix of assigned taxonomies exceeding the minBoot level of
#' bootstrapping confidence. Rows correspond to the provided sequences, columns to the
#' taxonomic levels. NA indicates that the sequence was not consistently classified at
#' that level at the minBoot threshhold.
#'
#' If outputBootstraps is TRUE, a named list containing the assigned taxonomies (named "taxa")
#' and the bootstrap values (named "boot") will be returned.
#'
#' @export
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead id
#'
#' @examples
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' training_fasta <- system.file("extdata", "example_train_set.fa.gz", package="dada2")
#' taxa <- assignTaxonomy(seqs, training_fasta)
#' taxa80 <- assignTaxonomy(seqs, training_fasta, minBoot=80, multithread=2)
#'
assignTaxonomy <- function(seqs, refFasta, minBoot=50, tryRC=FALSE, outputBootstraps=FALSE,
taxLevels=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
multithread=FALSE, verbose=FALSE) {
MIN_REF_LEN <- 20 # Enforced minimum length of reference seqs. Must be bigger than the kmer-size used (8).
MIN_TAX_LEN <- 50 # Minimum length of input sequences to get a taxonomic assignment
# Get character vector of sequences
seqs <- getSequences(seqs)
if(min(nchar(seqs)) < MIN_TAX_LEN) {
warning("Some sequences were shorter than ", MIN_TAX_LEN, " nts and will not receive a taxonomic classification.")
}
# Read in the reference fasta
refsr <- readFasta(refFasta)
lens <- width(sread(refsr))
if(any(lens<MIN_REF_LEN)) {
refsr <- refsr[lens>=MIN_REF_LEN]
warning(paste0("Some reference sequences were too short (<", MIN_REF_LEN, "nts) and were excluded."))
}
refs <- as.character(sread(refsr))
tax <- as.character(id(refsr))
tax <- sapply(tax, function(x) gsub("^\\s+|\\s+$", "", x)) # Remove leading/trailing whitespace
# Sniff and parse UNITE fasta format
UNITE <- FALSE
if(all(grepl("FU\\|re[pf]s", tax[1:10]))) {
UNITE <- TRUE
cat("UNITE fungal taxonomic reference detected.\n")
tax <- sapply(strsplit(tax, "\\|"), `[`, 5)
tax <- gsub("[pcofg]__unidentified;", "_DADA2_UNSPECIFIED;", tax)
tax <- gsub(";s__(\\w+)_", ";s__", tax)
tax <- gsub(";s__sp$", ";_DADA2_UNSPECIFIED", tax)
}
# Crude format check
if(!grepl(";", tax[[1]])) {
if(length(unlist(strsplit(tax[[1]], "\\s")))==3) {
stop("Incorrect reference file format for assignTaxonomy (this looks like a file formatted for assignSpecies).")
} else {
stop("Incorrect reference file format for assignTaxonomy.")
}
}
# Parse the taxonomies from the id string
tax.depth <- sapply(strsplit(tax, ";"), length)
td <- max(tax.depth)
for(i in seq(length(tax))) {
if(tax.depth[[i]] < td) {
for(j in seq(td - tax.depth[[i]])) {
tax[[i]] <- paste0(tax[[i]], "_DADA2_UNSPECIFIED;")
}
}
}
# Create the integer maps from reference to type ("genus") and for each tax level
genus.unq <- unique(tax)
ref.to.genus <- match(tax, genus.unq)
tax.mat <- matrix(unlist(strsplit(genus.unq, ";")), ncol=td, byrow=TRUE)
tax.df <- as.data.frame(tax.mat)
for(i in seq(ncol(tax.df))) {
tax.df[,i] <- factor(tax.df[,i])
tax.df[,i] <- as.integer(tax.df[,i])
}
tax.mat.int <- as.matrix(tax.df)
### Assign
# Parse multithreading argument
if(is.logical(multithread)) {
if(multithread==TRUE) { RcppParallel::setThreadOptions(numThreads = "auto") }
else { RcppParallel::setThreadOptions(numThreads = 1) }
} else if(is.numeric(multithread)) {
RcppParallel::setThreadOptions(numThreads = multithread)
} else {
warning("Invalid multithread parameter. Running as a single thread.")
RcppParallel::setThreadOptions(numThreads = 1)
}
# Run C assignemnt code
assignment <- C_assign_taxonomy2(seqs, rc(seqs), refs, ref.to.genus, tax.mat.int, tryRC, verbose)
# Parse results and return tax consistent with minBoot
bestHit <- genus.unq[assignment$tax]
boots <- assignment$boot
taxes <- strsplit(bestHit, ";")
taxes <- lapply(seq_along(taxes), function(i) taxes[[i]][boots[i,]>=minBoot])
# Convert to character matrix
tax.out <- matrix(NA_character_, nrow=length(seqs), ncol=td)
for(i in seq(length(seqs))) {
if(length(taxes[[i]]) > 0) {
tax.out[i,1:length(taxes[[i]])] <- taxes[[i]]
}
}
rownames(tax.out) <- seqs
colnames(tax.out) <- taxLevels[1:ncol(tax.out)]
tax.out[tax.out=="_DADA2_UNSPECIFIED"] <- NA_character_
if(outputBootstraps){
# Convert boots to integer matrix
boots.out <- matrix(boots, nrow=length(seqs), ncol=td)
rownames(boots.out) <- seqs
colnames(boots.out) <- taxLevels[1:ncol(boots.out)]
list(tax=tax.out, boot=boots.out)
} else {
tax.out
}
}
# Helper function for assignSpecies
mapHits <- function(x, refs, keep, sep="/") {
hits <- refs[x]
hits[grepl("Escherichia", hits, fixed=TRUE) | grepl("Shigella", hits, fixed=TRUE)] <- "Escherichia/Shigella"
if(length(unique(hits))<=keep) {
rval <- do.call(paste, c(as.list(sort(unique(hits))), sep=sep))
} else { rval <- NA_character_ }
if(length(rval)==0) rval <- NA_character_
rval
}
# Match curated genus names to binomial genus names
# Handles Clostridium groups and split genera names
matchGenera <- function(gen.tax, gen.binom, split.glyph="/") {
if(is.na(gen.tax) || is.na(gen.binom)) { return(FALSE) }
if((gen.tax==gen.binom) ||
grepl(paste0("^", gen.binom, "[ _", split.glyph, "]"), gen.tax) ||
grepl(paste0(split.glyph, gen.binom, "$"), gen.tax)) {
return(TRUE)
} else {
return(FALSE)
}
}
#'
#' Taxonomic assignment to the species level by exact matching.
#'
#' \code{assignSpecies} uses exact matching against a reference fasta to identify the
#' genus-species binomial classification of the input sequences.
#'
#' @param seqs (Required). A character vector of the sequences to be assigned, or an object
#' coercible by \code{\link{getUniques}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection. Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' genus-species of the associated sequence:
#'
#' >SeqID genus species
#' ACGAATGTGAAGTAA......
#'
#' @param allowMultiple (Optional). Default FALSE.
#' Defines the behavior when multiple exact matches against different species are returned.
#' By default only unambiguous identifications are return. If TRUE, a concatenated string
#' of all exactly matched species is returned. If an integer is provided, multiple
#' identifications up to that many are returned as a concatenated string.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will also be tested for exact matching
#' to the reference sequences.
#'
#' @param n (Optional). Default \code{2000}.
#' The number of sequences to perform assignment on at one time.
#' This controls the peak memory requirement so that large numbers of sequences are supported.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A two-column character matrix. Rows correspond to the provided sequences,
#' columns to the genus and species taxonomic levels. NA indicates that the sequence
#' was not classified at that level.
#'
#' @export
#'
#' @importFrom Biostrings vcountPDict
#' @importFrom Biostrings PDict
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead reverseComplement
#' @importFrom ShortRead id
#' @importFrom methods as
#'
#' @examples
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' species_fasta <- system.file("extdata", "example_species_assignment.fa.gz", package="dada2")
#' spec <- assignSpecies(seqs, species_fasta)
#'
assignSpecies <- function(seqs, refFasta, allowMultiple=FALSE, tryRC=FALSE, n=2000, verbose=FALSE) {
# Define number of multiple species to return
if(is.logical(allowMultiple)) {
if(allowMultiple) keep <- Inf
else keep <- 1
} else {
keep <- as.integer(allowMultiple)
}
# Get character vector of sequences
seqs <- getSequences(seqs)
# Read in the reference fasta
refsr <- readFasta(refFasta)
ids <- as(id(refsr), "character")
# Crude format check
if(!length(unlist(strsplit(ids[[1]], "\\s"))) >= 3) {
if(length(unlist(gregexpr(";", ids[[1]]))) >= 3) {
stop("Incorrect reference file format for assignSpecies (this looks like a file formatted for assignTaxonomy).")
} else {
stop("Incorrect reference file format for assignSpecies.")
}
}
genus <- sapply(strsplit(ids, "\\s"), `[`, 2)
species <- sapply(strsplit(ids, "\\s"), `[`, 3)
# Identify the exact hits
hits <- vector("list", length(seqs))
lens <- nchar(seqs)
for(len in unique(lens)) { # Requires all same length sequences
i.len <- which(lens==len); n.len <- length(i.len)
j.lo<-1; j.hi<-min(n,n.len)
while(j.lo <= n.len) {
i.loop <- i.len[j.lo:j.hi]
seqdict <- PDict(seqs[i.loop])
vhit <- (vcountPDict(seqdict, sread(refsr))>0)
if(tryRC) vhit <- vhit | (vcountPDict(seqdict, reverseComplement(sread(refsr)))>0)
hits[i.loop] <- lapply(seq(nrow(vhit)), function(x) vhit[x,])
j.lo <- j.lo + n; j.hi <- min(j.hi+n, n.len)
rm(seqdict)
gc()
}
}
# Get genus species return strings
rval <- cbind(unlist(sapply(hits, mapHits, refs=genus, keep=1)),
unlist(sapply(hits, mapHits, refs=species, keep=keep)))
colnames(rval) <- c("Genus", "Species")
rownames(rval) <- seqs
gc()
if(verbose) cat(sum(!is.na(rval[,"Species"])), "out of", length(seqs), "were assigned to the species level.\n")
rval
}
#'
#' Add species-level annotation to a taxonomic table.
#'
#' \code{addSpecies} wraps the \code{\link{assignSpecies}} function to assign genus-species
#' binomials to the input sequences by exact matching against a reference fasta. Those binomials
#' are then merged with the input taxonomic table with species annotations appended as an
#' additional column to the input table.
#' Only species identifications where the genera in the input table and the binomial
#' classification are consistent are included in the return table.
#'
#' @param taxtab (Required). A taxonomic table, the output of \code{\link{assignTaxonomy}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection. Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' genus-species binomial of the associated sequence:
#'
#' >SeqID genus species
#' ACGAATGTGAAGTAA......
#'
#' @param allowMultiple (Optional). Default FALSE.
#' Defines the behavior when multiple exact matches against different species are returned.
#' By default only unambiguous identifications are return. If TRUE, a concatenated string
#' of all exactly matched species is returned. If an integer is provided, multiple
#' identifications up to that many are returned as a concatenated string.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will be used for classification if it is a better match to the reference
#' sequences than the forward sequence.
#'
#' @param n (Optional). Default \code{1e5}.
#' The number of records (reads) to read in and filter at any one time.
#' This controls the peak memory requirement so that very large fastq files are supported.
#' See \code{\link{FastqStreamer}} for details.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A character matrix one column larger than input. Rows correspond to
#' sequences, and columns to the taxonomic levels. NA indicates that the sequence
#' was not classified at that level.
#'
#' @seealso
#' \code{\link{assignTaxonomy}}, \code{\link{assignSpecies}}
#'
#' @export
#'
#' @examples
#'
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' training_fasta <- system.file("extdata", "example_train_set.fa.gz", package="dada2")
#' taxa <- assignTaxonomy(seqs, training_fasta)
#' species_fasta <- system.file("extdata", "example_species_assignment.fa.gz", package="dada2")
#' taxa.spec <- addSpecies(taxa, species_fasta)
#' taxa.spec.multi <- addSpecies(taxa, species_fasta, allowMultiple=TRUE)
#'
addSpecies <- function(taxtab, refFasta, allowMultiple=FALSE, tryRC=FALSE, n=2000, verbose=FALSE) {
seqs <- rownames(taxtab)
binom <- assignSpecies(seqs, refFasta=refFasta, allowMultiple=allowMultiple, tryRC=tryRC, n=n, verbose=verbose)
# Merge tables
if("Genus" %in% colnames(taxtab)) gcol <- which(colnames(taxtab) == "Genus")
else gcol <- ncol(taxtab)
# Match genera
gen.match <- mapply(matchGenera, taxtab[,gcol], binom[,1])
taxtab <- cbind(taxtab, binom[,2])
colnames(taxtab)[ncol(taxtab)] <- "Species"
taxtab[!gen.match,"Species"] <- NA_character_
if(verbose) cat("Of which", sum(!is.na(taxtab[,"Species"])),"had genera consistent with the input table.")
taxtab
}
#' This function creates the dada2 assignTaxonomy training fasta for the RDP trainset .fa file
#' The RDP trainset data was downloaded from: https://sourceforge.net/projects/rdp-classifier/files/RDP_Classifier_TrainingData/
#'
#' ## RDP Trainset 18
#' path <- "~/Desktop/RDP/RDPClassifier_16S_trainsetNo18_rawtrainingdata"
#' dada2:::makeTaxonomyFasta_RDP(file.path(path, "trainset18_062020.fa"),
#' file.path(path, "trainset18_db_taxid.txt"),
#' "~/tax/rdp_train_set_18.fa.gz")
#' dada2:::tax.check("~/tax/rdp_train_set_18.fa.gz", "~/Desktop/ten_16s.100.fa")
#'
#' ## RDP Trainset 16
#' path <- "~/Desktop/RDP/RDPClassifier_16S_trainsetNo16_rawtrainingdata"
#' dada2:::makeTaxonomyFasta_RDP(file.path(path, "trainset16_022016.fa"),
#' file.path(path, "trainset16_db_taxid.txt"),
#' "~/tax/rdp_train_set_16.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_RDP <- function(fin, fdb, fout, compress=TRUE) {
# Read in the fasta and pull out the taxonomy entries
sr <- readFasta(fin)
id <- as.character(gsub("\"", "", id(sr)))
tax <- sapply(strsplit(id, "\\t"), `[`, 2)
tax <- gsub("^Root;", "", tax)
tax <- strsplit(tax, ";")
# Get the names of the standard 6 taxonomic levels
db <- read.table(file=fdb, sep="*", stringsAsFactors=FALSE)
colnames(db) <- c("Index", "Name", "L", "R", "Level")
keep <- db$Name[db$Level %in% c("domain", "phylum", "class", "order", "family", "genus")]
# Cut down to just the 6 level taxonomy
tax <- sapply(tax, function(x) x[x %in% keep])
tax <- lapply(tax, paste, collapse = ";")
tax <- unlist(tax)
# Final formatting
tax <- paste0(tax, ";") # Ending semicolon
tax <- gsub("[^;]*_incertae_sedis;$", "", tax) # Uncertain lowest-level assignment is better to leave blank
tax <- gsub(" ", "_", tax)
# Write to disk
writeFasta(ShortRead(sread(sr), BStringSet(tax)), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignSpecies fasta file for the RDP
#' from the RDP's _Bacteria_unaligned.fa file.
#'
#' ## RDP Trainset 18/Release 11.5
#' ## The RDP documentation does not make clear whether the updates to the taxonomy from training set release 18 were
#' ## propagated to the current Bacterial alignment.
#' dada2:::makeSpeciesFasta_RDP("~/Desktop/RDP/current_Bacteria_unaligned.fa", "~/tax/rdp_species_assignment_18.fa.gz")
#' dada2:::tax.check("~/tax/rdp_species_assignment_18.fa.gz", "~/Desktop/ten_16s.100.fa", mode="species")
#'
#' ## RDP Trainset 16/Release 11.5
#' dada2:::makeSpeciesFasta_RDP("~/Desktop/RDP/current_Bacteria_unaligned.fa", "~/tax/rdp_species_assignment_16.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead narrow
#' @importFrom IRanges narrow
#' @importFrom Biostrings BStringSet
#' @keywords internal
makeSpeciesFasta_RDP <- function(fin, fout, compress=TRUE) {
# Read in and remove records not assigned to species
sr <- readFasta(fin)
is.uncult <- grepl("[Uu]ncultured", id(sr))
sr <- sr[!is.uncult]
is.unclass <- grepl("[Uu]nclassified", id(sr))
sr <- sr[!is.unclass]
is.outgroup <- (grepl("Outgroup", id(sr)))
sr <- sr[!is.outgroup]
is.unident <- grepl("[Uu]nidentified", id(sr))
sr <- sr[!is.unident]
# Pull out the genus species binomial string
binom <- sapply(strsplit(as.character(id(sr)), ";"), `[`, 1)
binom <- sapply(strsplit(binom, "\\t"), `[`, 1)
binom <- gsub(" \\(T\\)", "", binom)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Match genera between binomial and the curated taxonomy
bar <- strsplit(as.character(id(sr)), ";")
barlens <- sapply(bar, length)
geni <- mapply(function(x,y) x[[y]], bar, barlens-1)
# Get rid of SXXX id
binom <- gsub("^S[0123456789]{9} ", "", binom)
binom <- gsub("\'" , "", binom)
# Drop Candidatus strings
binom <- gsub("Candidatus ", "", binom)
geni <- gsub("Candidatus ", "", geni)
# Subset down to those binomials which match the curated genus
binom.geni <- sapply(strsplit(binom, "\\s"), `[`, 1)
gen.match <- mapply(matchGenera, geni, binom.geni)
sr <- sr[gen.match]
binom <- binom[gen.match]
geni <- geni[gen.match]
# Make matrix of genus/species
binom[sapply(strsplit(binom, "\\s"), length)==1] <- paste(binom[sapply(strsplit(binom, "\\s"), length)==1], "sp.")
binom2 <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Keep only those with a named species
has.spec <- !grepl("sp\\.", binom2[,2])
sum(has.spec)
binom2 <- binom2[has.spec,]
sr <- sr[has.spec]
binom <- binom[has.spec]
geni <- geni[has.spec]
cat(length(binom), "sequences with genus/species binomial annotation output.\n")
# Write to disk
ids <- as.character(narrow(id(sr),1,10))
writeFasta(ShortRead(sread(sr), BStringSet(paste(ids, binom))), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignTaxonomy training fasta for the official Silva NR99
#' release files. If `include.species`=TRUE, a 7th taxonomic level (species) will be added based on the
#' Genus species binomial in the Silva taxonomy string, if present and valid.
#'
#' ## Silva release v138
#' path <- "~/tax/Silva/v138"
#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"),
#' file.path(path, "tax_slv_ssu_138.txt"),
#' "~/Desktop/silva_nr99_v138_train_set.fa.gz")
#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"),
#' file.path(path, "tax_slv_ssu_138.txt"),
#' include.species=TRUE, "~/Desktop/silva_nr99_v138_wSpecies_train_set.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_SilvaNR <- function(fin, ftax, fout, include.species=FALSE, compress=TRUE) {
xset <- DNAStringSet(readRNAStringSet(fin, format="fasta"))
# taxl: The taxonmic strings or (l)ines associated with each entry. Named by the sequence ID/accession.
taxl <- names(xset)
names(taxl) <- sapply(strsplit(names(xset), "\\s"), `[`, 1)
if(any(duplicated(names(taxl)))) stop("Duplicated sequence IDs detected.")
names(xset) <- names(taxl)
taxl <- gsub("^[A-Za-z0-9.]+\\s", "", taxl)
# Fix Silva- or release-specific errors
taxl <- gsub(";YM;", ";", taxl) # YM bacterial suborder included in 138 Release in error (confirmed by Silva)
## taxl <- gsub(";Rahnella1", ";Rahnella", taxl) # Rahnella1 genus seems like an error, shares same species w/ Rahnella,
## But! also in official tax file. Maybe check with Silva on this one.
# taxa: A list of the ordered taxonomic levels corresponding to each reference sequence. Named by the sequence ID/accession.
taxa <- strsplit(taxl, ";")
# Read in the defined Silva taxonomic levels, e.g. Bacteria;Desulfobacterota;Desulfobulbia;Desulfobulbales;Desulfurivibrionaceae;
silva.taxa <- read.table(ftax, sep="\t", col.names=c("Taxon", "V2", "Level", "V4", "V5"), stringsAsFactors=FALSE)
silva.taxa <- silva.taxa[,c("Taxon", "Level")]
# Subset down to Bacteria and Archaea
kingdom <- sapply(strsplit(taxl, ";"), `[`, 1)
taxl.ba <- taxl[kingdom %in% c("Bacteria", "Archaea")]
taxa.ba <- taxa[names(taxl.ba)]
# Create 6-column matrix with Silva taxonomic assignment for each sequence at each level from Kingdom to Genus (NA if no assignment)
taxa.ba.mat <- matrix(sapply(taxa.ba, function(flds) {
c(flds[1], flds[2], flds[3], flds[4], flds[5], flds[6])
}), ncol=6, byrow=TRUE)
rownames(taxa.ba.mat) <- names(taxl.ba)
# Create 6-column matrix with full Silva taxonomic string at each level for each sequence, from Kingdom to Genus
# Strings will include NA levels if no assignment at that level, e.g. Bacteria;Firmicutes;NA;NA
taxa.ba.mat.string <- matrix("UNDEF", nrow=nrow(taxa.ba.mat), ncol=ncol(taxa.ba.mat))
rownames(taxa.ba.mat.string) <- names(taxl.ba)
taxa.ba.mat.string[,1] <- paste0(taxa.ba.mat[,1],";")
for(col in seq(2,6)) {
taxa.ba.mat.string[,col] <- paste0(taxa.ba.mat.string[,col-1], taxa.ba.mat[,col],";")
}
if(any(taxa.ba.mat.string == "UNDEF")) stop("Taxon string matrix was not fully initialized.")
# Define the set of valid taxonomic assignment by their appearance in the list of valid Silva taxonomic levels
taxa.ba.mat.is_valid <- matrix(taxa.ba.mat.string %in% silva.taxa$Taxon, ncol=6)
# Update taxa.ba.mat matrix by replacing invalid entries with NAs
taxa.ba.mat[!taxa.ba.mat.is_valid] <- NA
# Also replace "uncultured" taxonomic ranks with NAs (note, uncultured only shows up as the terminal "assigned" rank)
taxa.ba.mat[taxa.ba.mat %in% c("Uncultured", "uncultured")] <- NA
######### ADD SPECIES PART HERE ##############
if(include.species) {
# Add the 7th column, which will be the species column
taxa.ba.mat <- cbind(taxa.ba.mat,
matrix(sapply(taxa.ba, `[`, 7), ncol=1, byrow=TRUE))
# Get validated genus from the matrix
genus <- taxa.ba.mat[,6]
genus <- gsub("Candidatus ", "", genus)
genus <- gsub("\\[", "", genus)
genus <- gsub("\\]", "", genus)
# Get the "binomial" string from the 7th field in the Silva taxonomic annotation
# The "binomial" field is not curated like the other Silva taxonomic levels, and can have varying info
# We assume that the first two words are the Genus species binomial, when there is a valid one in the field
# NOTE: the binomial is actually not always in the 7th field, so this isn't strictly correct.
# the binomial is in the "last" field, which may be <7 when not all the levels down to genus are assigned.
# But we are throwing away everything that doesn't match the genus anyway, so that case
# doesn't need to be handled correctly here.
binom <- taxa.ba.mat[,7]
binom <- gsub("Candidatus ", "", binom)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Pull out the first two fields, and turn binom into a two column matrix (Genus, species)
binom <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Identify binomials that match the curated genus
gen.match <- mapply(dada2:::matchGenera, genus, binom[,1], split.glyph="-")
# Identify some other types of invalid species names
is.NA <- apply(binom, 1, function(x) any(is.na(x)))
is.sp <- grepl("sp\\.", binom[,2]) # "sp." is not a valid species name, just a generic
is.endo <- binom[,1] %in% "endosymbiont" | binom[,2] %in% "endosymbiont"
is.uncult <- grepl("[Uu]ncultured", binom[,1]) | grepl("[Uu]ncultured", binom[,2])
is.unident <- grepl("[Uu]nidentified", binom[,1]) | grepl("[Uu]nidentified", binom[,2])
# Define the "valid" species, and set invalid species to NA in the taxonomic matrix
valid.spec <- gen.match & !is.NA & !is.sp & !is.endo & !is.uncult & !is.unident
binom[!valid.spec,2] <- NA
taxa.ba.mat[,7] <- binom[,2]
}
# Organize a small number of Eukaryota sequences for outgroup purposes, keeping only the Eukaryota Kingdom taxonomic assignment
set.seed(100); N_EUK <- 100
euk.keep <- sample(names(taxl)[kingdom %in% "Eukaryota"], N_EUK)
taxa.euk.mat <- matrix("", nrow=N_EUK, ncol=ncol(taxa.ba.mat))
rownames(taxa.euk.mat) <- euk.keep
taxa.euk.mat[,1] <- "Eukaryota"
taxa.euk.mat[,2:ncol(taxa.euk.mat)] <- NA
# Now need to make the final training fasta in DADA2 format.
taxa.mat.final <- rbind(taxa.ba.mat, taxa.euk.mat)
taxa.string.final <- apply(taxa.mat.final, 1, function(x) {
tst <- paste(x, collapse=";")
tst <- paste0(tst, ";")
tst <- gsub("NA;", "", tst)
tst
})
if(any(is.na(names(taxa.string.final)))) stop("NA names in the final set of taxon strings.")
if(!all(names(taxa.string.final) %in% names(xset))) stop("Some names of the final set of taxon strings don't match sequence names.")
xset.out <- xset[names(taxa.string.final)]
## Add some verbose output describing what happened.
cat(length(xset.out), "reference sequences were output.\n")
print(table(taxa.mat.final[,1], useNA="ifany"))
if(include.species) cat(sum(!is.na(taxa.mat.final[,7])), "entries include species names.\n")
writeFasta(ShortRead(unname(xset.out), BStringSet(taxa.string.final)), fout,
width=20000L, compress=compress)
}
#' DEPRECATED in favor of `makeTaxonomyFasta_SilvaNR``
#'
#' This function creates the dada2 assignTaxonomy training fasta for the Silva .align file
#' generated by the Mothur project.
#'
#' ## Silva release v128
#' path <- "~/Desktop/Silva/Silva.nr_v128"
#' dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v128.align"),
#' file.path(path, "silva.nr_v128.tax"),
#' "~/tax/silva_nr_v128_train_set.fa.gz")
#'
#' ## Silva release v132
#' path <- "~/Desktop/Silva/Silva.nr_v132"
#' dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v132.align"),
#' file.path(path, "silva.nr_v132.tax"),
#' "~/tax/silva_nr_v132_train_set.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_Silva <- function(fin, ftax, fout, compress=TRUE) {
# Read in the fasta and pull out the taxonomy entries
sr <- readFasta(fin) # ~10GB to read in
ids <- sapply(strsplit(as.character(id(sr)), "\\t"), `[`, 1)
seqs <- gsub("[.-]", "", sread(sr)) # Takes a while
rm(sr);gc()
# Read in the taxnoomy file
taxdf <- read.table(ftax, sep="\t", header=FALSE, stringsAsFactors = FALSE)
colnames(taxdf) <- c("id", "tax")
taxdf$tax <- gsub("^\\s+|\\s+$", "", taxdf$tax)
if(!identical(taxdf$id, ids)) stop("Input align and taxonomy files don't match.")
# Final formatting
tax <- taxdf$tax
tax <- gsub("Escherichia-Shigella", "Escherichia/Shigella", tax)
# Remove faux-assignments added by new Mothur processing script
tax <- gsub("[^;]*_ge;$", "", tax)
tax <- gsub("[^;]*_fa;$", "", tax)
tax <- gsub("[^;]*_or;$", "", tax)
tax <- gsub("[^;]*_cl;$", "", tax)
tax <- gsub("[^;]*_ph;$", "", tax)
tax <- gsub(";uncultured;$", ";", tax)
# Write to disk
writeFasta(ShortRead(DNAStringSet(seqs), BStringSet(tax)), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignSpecies fasta file for Silva
#' from the SILVA_[VERSION]_SSURef_tax_silva.fasta file
#'
#' ## Silva release v128
#' dada2:::makeSpeciesFasta_Silva("~/Desktop/Silva/SILVA_128_SSURef_tax_silva.fasta.gz",
#' "~/tax/silva_species_assignment_v128.fa.gz")
#'
#' ## Silva release v132
#' dada2:::makeSpeciesFasta_Silva("~/Desktop/Silva/SILVA_132_SSURef_tax_silva.fasta.gz",
#' "~/tax/silva_species_assignment_v132.fa.gz")
#'
#' Output: 313502 sequences with genus/species binomial annotation output.
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings readRNAStringSet
#' @keywords internal
makeSpeciesFasta_Silva <- function(fin, fout, compress=TRUE) {
# Read in and remove records not assigned to species and non-bacteria
xset <- DNAStringSet(readRNAStringSet(fin, format="fasta"))
is.bact <- grepl("Bacteria;", names(xset), fixed=TRUE)
xset <- xset[is.bact]
is.uncult <- grepl("[Uu]ncultured", names(xset))
xset <- xset[!is.uncult]
is.unident <- grepl("[Uu]nidentified", names(xset))
xset <- xset[!is.unident]
is.complete <- sapply(strsplit(as.character(names(xset)), ";"), length)==7
xset <- xset[is.complete]
# Pull out binomial strings
binom <- strsplit(as.character(names(xset)), ";")
genus <- sapply(binom, `[`, 6)
binom <- sapply(binom, `[`, 7)
genus <- gsub("Candidatus ", "", genus)
binom <- gsub("Candidatus ", "", binom)
genus <- gsub("\\[", "", genus)
genus <- gsub("\\]", "", genus)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Subset down to those binomials which match the curated genus
genus.binom <- sapply(strsplit(binom, "\\s"), `[`, 1)
gen.match <- mapply(matchGenera, genus, genus.binom, split.glyph="-")
# Note that raw Silva files use Escherichia-Shigella, but this is changed to Escherichia/Shigella in dada2 version
xset <- xset[gen.match]
binom <- binom[gen.match]
genus <- genus[gen.match]
# Make matrix of genus/species
binom[sapply(strsplit(binom, "\\s"), length)==1] <- paste(binom[sapply(strsplit(binom, "\\s"), length)==1], "sp.")
binom2 <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Keep only those with a named species
has.spec <- !grepl("sp\\.", binom2[,2]) & !(binom2[,2]=="endosymbiont")
binom2 <- binom2[has.spec,]
xset <- xset[has.spec]
binom <- binom[has.spec]
genus <- genus[has.spec]
cat(length(binom), "sequences with genus/species binomial annotation output.\n")
# Write to disk
ids <- sapply(strsplit(as.character(names(xset)), "\\s"), `[`, 1)
writeFasta(ShortRead(unname(xset), BStringSet(paste(ids, binom))), fout,
width=20000L, compress=compress)
}
## This uses the "ten_16s.100.fa" originally from Robert Edgar's taxonomy testing page: https://drive5.com/taxxi/doc/fasta_index.html
## This file is relicensed here under the DADA2 LGPL2 license on permission from Robert Edgar.
tax.check <- function(fn.tax, fn.test=system.file("extdata", "example_seqs.fa", package="dada2"), nseq=100, level=6, mode="taxonomy") {
sq.test <- sample(getSequences(fn.test), nseq)
if(mode == "taxonomy") {
tax <- assignTaxonomy(sq.test, fn.tax, multi=TRUE)
cbind(unname(tax[,level]), sapply(strsplit(names(sq.test), ":"), `[`, level+1))
} else if (mode=="species") {
sq.acgt <- sq.test[dada2:::C_isACGT(sq.test)]
spc <- assignSpecies(sq.acgt, fn.spc)
cbind(unname(spc[,level-5]), sapply(strsplit(names(sq.acgt), ":"), `[`, level+1))
} else { stop("Valid modes are taxonomy or species.") }
}
|