File: read.dna.R

package info (click to toggle)
r-cran-ape 5.8-1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,676 kB
  • sloc: ansic: 7,676; cpp: 116; sh: 17; makefile: 2
file content (190 lines) | stat: -rw-r--r-- 7,067 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
## read.dna.R (2021-11-29)

##   Read DNA Sequences in a File

## Copyright 2003-2021 Emmanuel Paradis, 2017 RJ Ewing

## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.

## Read sequences in FASTA format from a file, a URL, or a connection.
##
## Arguments:
##   file: a file name, a URL starting with http:, https:, ftp: or ftps:
##         (downloaded to a temporary file first), or a connection.
##         A name ending in ".gz" (any case) is read as a gzipped file.
##   type: "DNA" or "AA" (case-insensitive, may be abbreviated).
##
## Value: the object returned by the C routine 'rawStreamToDNAorAAbin',
##   with leading spaces removed from its names and class "DNAbin" or
##   "AAbin"; NULL (with a warning) if that routine returns 0L.
read.FASTA <- function(file, type = "DNA")
{
    ## read the whole content of a gzipped file as a raw vector; the
    ## connection is closed when this helper exits, even on error
    readGzRaw <- function(path) {
        con <- gzcon(gzfile(path))
        on.exit(close(con))
        open(con)
        buf <- raw()
        repeat {
            chunk <- readBin(con, "raw", 1e9)
            if (!length(chunk)) break
            buf <- c(buf, chunk)
        }
        buf
    }

    TYPES <- c("DNA", "AA")
    itype <- pmatch(toupper(type), TYPES)
    if (is.na(itype))
        stop(paste("'type' should be", paste(dQuote(TYPES), collapse = " or ")))
    ## the test is done on the original name, so a URL ending with ".gz"
    ## is still read as a gzipped file after being downloaded
    GZ <- grepl("\\.gz$", file, ignore.case = TRUE)
    if (length(grep("^(ht|f)tp(s|):", file))) {
        url <- file
        ## the path is kept in its own variable so that the clean-up
        ## always targets the downloaded copy
        tmpfile <- tempfile()
        on.exit(unlink(tmpfile), add = TRUE)
        download.file(url, tmpfile)
        file <- tmpfile
    }
    if (inherits(file, "connection")) {
        ## a connection opened here is closed on exit; one that was
        ## already open in text mode is left as it is
        if (!isOpen(file, "rt")) {
            open(file, "rt")
            on.exit(close(file), add = TRUE)
        }
        x <- scan(file, what = character(), sep = "\n", quiet = TRUE)
        x <- charToRaw(paste(x, collapse = "\n"))
    } else {
        if (GZ) {
            x <- readGzRaw(file)
        } else {
            x <- readBin(file, "raw", file.size(file))
        }
    }
    sz <- length(x)
    ## if the file is larger than 1 Gb we assume that it is
    ## UNIX-encoded and skip the search-replace of carriage returns
    if (sz < 1e9) {
        icr <- which(x == as.raw(0x0d)) # CR
        if (length(icr)) x <- x[-icr]
    }
    ## the second argument is -1L for "DNA" and 0L for "AA"
    res <- .Call(rawStreamToDNAorAAbin, x, itype - 2L)
    if (identical(res, 0L)) {
        warning("failed to read sequences, returns NULL")
        return(NULL)
    }
    names(res) <- sub("^ +", "", names(res)) # to permit phylosim
    class(res) <- c("DNAbin", "AAbin")[itype]
    res
}

## Read DNA sequences from a file in interleaved, sequential, Clustal,
## or FASTA format.
##
## Arguments:
##   file:         passed to scan(), or to read.FASTA() if format is "fasta".
##   format:       one of "interleaved", "sequential", "fasta", "clustal"
##                 (may be abbreviated).
##   skip, nlines, comment.char: passed to scan(); not used for FASTA.
##   as.character: if TRUE, the sequences are returned as characters
##                 instead of being converted to (or kept as) "DNAbin".
##   as.matrix:    used for FASTA only. If TRUE, a matrix is returned and it
##                 is an error if the sequences differ in length; if FALSE,
##                 no matrix is built; if not logical (the default, NULL),
##                 a matrix is built when all sequences have the same length.
##
## Value: for the non-FASTA formats, a matrix with the taxa names as
##   rownames (lower-case characters if as.character is TRUE, otherwise the
##   result of as.DNAbin()). For FASTA, the result of read.FASTA(), possibly
##   arranged as a matrix of class "DNAbin" and/or converted to characters.
read.dna <- function(file, format = "interleaved", skip = 0,
                     nlines = 0, comment.char = "#",
                     as.character = FALSE, as.matrix = NULL)
{
    findFirstNucleotide <- function(x) {
        ## actually find the 1st non-blank character that follows the
        ## first run of blanks (i.e., after the taxon name)
        ## just in case: pat.base <- "[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]{10}"
        tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
        tmp[1] + attr(tmp, "match.length")
    }
    getTaxaNames <- function(x) {
        x <- sub("^['\" ]+", "", x) # remove the leading quotes and spaces
        x <- sub("['\" ]+$", "", x) #   "     "  trailing  "     "    "
        x
    }
    getNucleotide <- function(x) {
        ## drop the spaces, split into single characters, and concatenate
        ## all the strings of 'x' into one lower-case vector
        x <- gsub(" ", "", x)
        x <- strsplit(x, NULL)
        tolower(unlist(x))
    }
    formats <- c("interleaved", "sequential", "fasta", "clustal")
    format <- match.arg(format, formats)
    if (format == "fasta") {
        obj <- read.FASTA(file)
    } else {
        X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
                  skip = skip, nlines = nlines, comment.char = comment.char)
        if (format %in% formats[1:2]) {
            ## need to remove the possible leading spaces and/or tabs in the first line
            fl <- gsub("^[[:blank:]]+", "", X[1])
            fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+")))
            if (length(fl) != 2 || any(is.na(fl)))
                stop("the first line of the file must contain the dimensions of the data")
            n <- fl[1] # number of sequences
            s <- fl[2] # number of sites
            obj <- matrix("", n, s)
            X <- X[-1]
        }
        switch(format,
               "interleaved" = {
                   start.seq <- findFirstNucleotide(X[1])
                   one2n <- seq_len(n)
                   ## the names are only in the first block of n lines
                   taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1))
                   X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n]))
                   nl <- length(X)
                   ## the i-th sequence is on every n-th line starting from i
                   for (i in one2n)
                       obj[i, ] <- getNucleotide(X[seq(i, nl, n)])
               },
               "sequential" = {
                   taxa <- character(n)
                   j <- 1L # line number
                   for (i in seq_len(n)) {
                       start.seq <- findFirstNucleotide(X[j])
                       taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1))
                       sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j])))
                       j <- j + 1L
                       ## keep reading lines until the s sites are collected
                       while (length(sequ) < s) {
                           sequ <- c(sequ, getNucleotide(X[j]))
                           j <- j + 1L
                       }
                       obj[i, ] <- sequ
                   }
               },
               "clustal" = {
                   X <- X[-1] # drop the line with "Clustal bla bla..."
                   ## find where the 1st sequence starts
                   start.seq <- findFirstNucleotide(X[1])
                   ## find the lines with *********....
                   nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "")
                   stars <- grep(nspaces, X)
                   ## we now know how many sequences in the file:
                   n <- stars[1] - 1
                   taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
                   ## need to remove the sequence names before getting the sequences:
                   X <- substr(X, start.seq, nchar(X))
                   nl <- length(X)
                   ## find the length of the 1st sequence
                   ## (each block has n sequence lines + 1 line of stars):
                   tmp <- getNucleotide(X[seq(1, nl, n + 1)])
                   s <- length(tmp)
                   obj <- matrix("", n, s)
                   obj[1, ] <- tmp
                   ## seq_len(n)[-1] is empty if n == 1, whereas 2:n would
                   ## count down (2, 1) and index a row that does not exist
                   for (i in seq_len(n)[-1])
                       obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
               })
    }
    if (format != "fasta") {
        rownames(obj) <- taxa
        if (!as.character) obj <- as.DNAbin(obj)
    } else {
        LENGTHS <- unique(lengths(obj, use.names = FALSE))
        allSameLength <- length(LENGTHS) == 1
        if (is.logical(as.matrix)) {
            if (as.matrix && !allSameLength)
                stop("sequences in FASTA file not of the same length")
        } else {
            ## not specified by the user: return a matrix whenever possible
            as.matrix <- allSameLength
        }
        if (as.matrix) {
            taxa <- names(obj)
            n <- length(obj)
            y <- matrix(as.raw(0), n, LENGTHS)
            for (i in seq_len(n)) y[i, ] <- obj[[i]]
            obj <- y
            rownames(obj) <- taxa
            class(obj) <- "DNAbin"
        }
        if (as.character) obj <- as.character(obj)
    }
    obj
}

## Read sequences and their qualities from a file in FASTQ format.
##
## The lines are read with scan() and taken in groups of four: the 1st
## (header) and 2nd (sequence) lines of each group are written to a
## temporary FASTA file which is read with read.FASTA(); the 4th line
## gives the qualities.
##
## Arguments:
##   file:   passed to scan().
##   offset: value added to the integer codes of the quality characters
##           (no addition is done if it is zero).
##
## Value: the object returned by read.FASTA() with an attribute "QUAL":
##   a list of integer vectors with the same names as the sequences.
read.fastq <- function(file, offset = -33)
{
    Z <- scan(file, "", sep="\n", quiet = TRUE)
    tmp <- Z[c(TRUE, TRUE, FALSE, FALSE)]
    ## 'tmp' alternates header and sequence lines: make FASTA headers
    sel <- c(TRUE, FALSE)
    tmp[sel] <- gsub("^@", ">", tmp[sel])
    fl <- tempfile()
    ## the intermediate FASTA file is not needed once it has been read
    on.exit(unlink(fl), add = TRUE)
    cat(tmp, file = fl, sep = "\n")
    DNA <- read.FASTA(fl)

    ## get the qualities:
    tmp <- Z[c(FALSE, FALSE, FALSE, TRUE)]
    QUAL <- lapply(tmp, function(x) as.integer(charToRaw(x)))
    if (offset) QUAL <- lapply(QUAL, "+", offset)
    names(QUAL) <- names(DNA)
    attr(DNA, "QUAL") <- QUAL
    DNA
}