1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
|
## read.GenBank.R (2024-02-13)
## Read DNA Sequences and Annotations from GenBank
## Copyright 2002-2022 Emmanuel Paradis and Klaus Schliep 2024
## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.
read.GenBank <- function(access.nb, seq.names = access.nb, species.names = TRUE,
as.character = FALSE, chunk.size = 400, quiet = TRUE,
type = "DNA")
{
type <- match.arg(type, c("DNA", "AA"))
db <- ifelse(type == "DNA", "nucleotide", "protein")
chunk.size <- as.integer(chunk.size)
N <- length(access.nb)
## if more than 400 sequences, we break down the requests
a <- 1L
b <- if (N > chunk.size) chunk.size else N
fl <- paste0(tempfile(), ".fas")
if (!quiet)
cat("Note: chunk.size =", chunk.size, "(max nb of sequences downloaded together)\n")
repeat {
if (!quiet) cat("\rDownloading sequences:", b, "/", N, "...")
URL <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=", db, "&id=",
paste(access.nb[a:b], collapse = ","), "&rettype=fasta&retmode=text")
X <- scan(file = URL, what = "", sep = "\n", quiet = TRUE)
cat(X, sep = "\n", file = fl, append = TRUE)
if (b == N) break
a <- b + 1L
b <- b + chunk.size
if (b > N) b <- N
}
if (!quiet) {
cat(" Done.\nNote: the downloaded sequences are in file:", fl)
cat("\nReading sequences...")
}
res <- read.FASTA(fl, type = type)
if (is.null(res)) return(NULL)
attr(res, "description") <- names(res)
if (length(access.nb) != length(res)) {
names(res) <- gsub("\\..*$", "", names(res))
failed <- paste(access.nb[! access.nb %in% names(res)], collapse = ", ")
warning(paste0("cannot get the following sequence(s):\n", failed))
} else names(res) <- access.nb
if (as.character) res <- as.character(res)
if (!quiet) cat("\n")
if (species.names) {
a <- 1L
b <- if (N > chunk.size) chunk.size else N
sp <- character(0)
repeat {
if (!quiet) cat("\rDownloading species names:", b, "/", N)
URL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=", db, "&id=",
paste(access.nb[a:b], collapse = ","), "&rettype=gb&retmode=text", sep = "")
X <- scan(file = URL, what = "", sep = "\n", quiet = TRUE, n = -1)
sp <- c(sp, gsub(" +ORGANISM +", "", grep("ORGANISM", X, value = TRUE)))
if (b == N) break
a <- b + 1L
b <- b + chunk.size
if (b > N) b <- N
}
if (!quiet) cat(".\n")
attr(res, "species") <- gsub(" ", "_", sp)
}
if (!quiet) cat("Done.\n")
res
}
.parse.annotations.file <- function(file) {
get.product.others.gene <- function(a, b) {
res <- rep(NA_character_, 3L)
if (a > b) return(res)
y <- x[a:b]
li <- length(i <- grep("product\t", y))
lj <- length(j <- grep("gene\t", y))
if (li) res[1L] <- gsub("product\t", "", y[i])
if (lj) res[2L] <- gsub("gene\t", "", y[j])
if (length(y) > li + lj) res[3L] <- paste(y[-c(i, j)], collapse = "; ")
res
}
convert.with.incomplete <- function(vec) {
i <- grep("<|>", vec)
if (length(i)) {
incomplete <<- c(incomplete, i)
vec[i] <- gsub("<|>", "", vec[i])
}
as.integer(vec)
}
x <- scan(file, what = "", sep = "\n", quiet = TRUE, skip = 1)
n <- length(x)
i <- grep("^\t\t\t", x)
i2 <- seq_len(n)[-i]
Y <- strsplit(x[i2], "\t")
incomplete <- NULL
start <- convert.with.incomplete(sapply(Y, "[", 1L))
end <- convert.with.incomplete(sapply(Y, "[", 2L))
if (!is.null(incomplete)) {
incomplete <- sort(unique(incomplete))
warning(paste("features were incomplete in row(s):",
paste(incomplete, collapse = ", ")))
}
res <- data.frame(start, end)
sel <- which(!duplicated.data.frame(res))
res <- res[sel, ]
res$type <- sapply(Y, "[", 3L)[sel]
i3 <- i2[sel]
from <- i3 + 1L
to <- c(i3[-1L] - 1L, n)
x[i] <- gsub("^\t\t\t", "", x[i])
Z <- mapply(get.product.others.gene, from, to)
res$product <- Z[1L, ]
gene <- Z[2L, ]
res$others <- gsub("\t", ": ", Z[3L, ])
if (!all(is.na(gene))) res$gene <- gene
row.names(res) <- as.character(seq_len(nrow(res)))
res
}
getAnnotationsGenBank <- function(access.nb, quiet = TRUE)
{
N <- length(access.nb)
res <- setNames(vector("list", N), access.nb)
notfound <- NULL
s1 <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id="
s3 <- "&rettype=ft&retmode=text"
for (i in 1:N) {
if (!quiet) cat("\rDownloading annotations:", i, "/", N)
URL <- paste0(s1, access.nb[i], s3)
fl <- tempfile()
ans <- try(download.file(URL, fl, quiet = TRUE), silent = TRUE)
if (inherits(ans, "try-error")) {
notfound <- c(notfound, access.nb[i])
next
}
res[[i]] <- .parse.annotations.file(fl)
}
if (!quiet) cat(". Done.\n")
if (!is.null(notfound)) {
warning(paste0("cannot get features for the following accession(s):\n",
paste(notfound, collapse = ", ")))
if (length(notfound) == N) return(NULL)
}
if (N == 1) res[[1L]] else res
}
|