File: read.GenBank.R

package info (click to toggle)
r-cran-ape 5.8-1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,676 kB
  • sloc: ansic: 7,676; cpp: 116; sh: 17; makefile: 2
file content (151 lines) | stat: -rw-r--r-- 5,521 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
## read.GenBank.R (2024-02-13)

##   Read DNA Sequences and Annotations from GenBank

## Copyright 2002-2022 Emmanuel Paradis and Klaus Schliep 2024

## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.

read.GenBank <- function(access.nb, seq.names = access.nb, species.names = TRUE,
                         as.character = FALSE, chunk.size = 400, quiet = TRUE,
                         type = "DNA")
{
    type <- match.arg(type, c("DNA", "AA"))
    db <- ifelse(type == "DNA", "nucleotide", "protein")
    chunk.size <- as.integer(chunk.size)
    N <- length(access.nb)
    ## if more than 400 sequences, we break down the requests
    a <- 1L
    b <- if (N > chunk.size) chunk.size else N
    fl <- paste0(tempfile(), ".fas")
    if (!quiet)
        cat("Note: chunk.size =", chunk.size, "(max nb of sequences downloaded together)\n")
    repeat {
        if (!quiet) cat("\rDownloading sequences:", b, "/", N, "...")
        URL <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=", db, "&id=",
                      paste(access.nb[a:b], collapse = ","), "&rettype=fasta&retmode=text")
        X <- scan(file = URL, what = "", sep = "\n", quiet = TRUE)
        cat(X, sep = "\n", file = fl, append = TRUE)
        if (b == N) break
        a <- b + 1L
        b <- b + chunk.size
        if (b > N) b <- N
    }
    if (!quiet) {
        cat(" Done.\nNote: the downloaded sequences are in file:", fl)
        cat("\nReading sequences...")
    }
    res <- read.FASTA(fl, type = type)
    if (is.null(res)) return(NULL)
    attr(res, "description") <- names(res)
    if (length(access.nb) != length(res)) {
        names(res) <- gsub("\\..*$", "", names(res))
        failed <- paste(access.nb[! access.nb %in% names(res)], collapse = ", ")
        warning(paste0("cannot get the following sequence(s):\n", failed))
    } else names(res) <- access.nb

    if (as.character) res <- as.character(res)
    if (!quiet) cat("\n")
    if (species.names) {
        a <- 1L
        b <- if (N > chunk.size) chunk.size else N
        sp <- character(0)
        repeat {
            if (!quiet) cat("\rDownloading species names:", b, "/", N)
            URL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=", db, "&id=",
                     paste(access.nb[a:b], collapse = ","), "&rettype=gb&retmode=text", sep = "")
            X <- scan(file = URL, what = "", sep = "\n", quiet = TRUE, n = -1)
            sp <- c(sp, gsub(" +ORGANISM +", "", grep("ORGANISM", X, value = TRUE)))
            if (b == N) break
            a <- b + 1L
            b <- b + chunk.size
            if (b > N) b <- N
        }
        if (!quiet) cat(".\n")
        attr(res, "species") <- gsub(" ", "_", sp)
    }
    if (!quiet) cat("Done.\n")
    res
}

.parse.annotations.file <- function(file) {
    get.product.others.gene <- function(a, b) {
        res <- rep(NA_character_, 3L)
        if (a > b) return(res)
        y <- x[a:b]
        li <- length(i <- grep("product\t", y))
        lj <- length(j <- grep("gene\t", y))
        if (li) res[1L] <- gsub("product\t", "", y[i])
        if (lj) res[2L] <- gsub("gene\t", "", y[j])
        if (length(y) > li + lj) res[3L] <- paste(y[-c(i, j)], collapse = "; ")
        res
    }
    convert.with.incomplete <- function(vec) {
        i <- grep("<|>", vec)
        if (length(i)) {
            incomplete <<- c(incomplete, i)
            vec[i] <- gsub("<|>", "", vec[i])
        }
        as.integer(vec)
    }
    x <- scan(file, what = "", sep = "\n", quiet = TRUE, skip = 1)
    n <- length(x)
    i <- grep("^\t\t\t", x)
    i2 <- seq_len(n)[-i]
    Y <- strsplit(x[i2], "\t")

    incomplete <- NULL
    start <- convert.with.incomplete(sapply(Y, "[", 1L))
    end <- convert.with.incomplete(sapply(Y, "[", 2L))
    if (!is.null(incomplete)) {
        incomplete <- sort(unique(incomplete))
        warning(paste("features were incomplete in row(s):",
                      paste(incomplete, collapse = ", ")))
    }

    res <- data.frame(start, end)
    sel <- which(!duplicated.data.frame(res))
    res <- res[sel, ]
    res$type <- sapply(Y, "[", 3L)[sel]

    i3 <- i2[sel]
    from <- i3 + 1L
    to <- c(i3[-1L] - 1L, n)

    x[i] <- gsub("^\t\t\t", "", x[i])
    Z <- mapply(get.product.others.gene, from, to)
    res$product <- Z[1L, ]
    gene <- Z[2L, ]
    res$others <- gsub("\t", ": ", Z[3L, ])
    if (!all(is.na(gene))) res$gene <- gene
    row.names(res) <- as.character(seq_len(nrow(res)))
    res
}

getAnnotationsGenBank <- function(access.nb, quiet = TRUE)
{
    N <- length(access.nb)
    res <- setNames(vector("list", N), access.nb)
    notfound <- NULL
    s1 <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id="
    s3 <- "&rettype=ft&retmode=text"
    for (i in 1:N) {
        if (!quiet) cat("\rDownloading annotations:", i, "/", N)
        URL <- paste0(s1, access.nb[i], s3)
        fl <- tempfile()
        ans <- try(download.file(URL, fl, quiet = TRUE), silent = TRUE)
        if (inherits(ans, "try-error")) {
            notfound <- c(notfound, access.nb[i])
            next
        }
        res[[i]] <- .parse.annotations.file(fl)
    }
    if (!quiet) cat(". Done.\n")
    if (!is.null(notfound)) {
        warning(paste0("cannot get features for the following accession(s):\n",
                       paste(notfound, collapse = ", ")))
        if (length(notfound) == N) return(NULL)
    }
    if (N == 1) res[[1L]] else res
}