1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
## read.dna.R (2021-11-29)
## Read DNA Sequences in a File
## Copyright 2003-2021 Emmanuel Paradis, 2017 RJ Ewing
## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.
read.FASTA <- function(file, type = "DNA")
{
TYPES <- c("DNA", "AA")
itype <- pmatch(toupper(type), TYPES)
if (is.na(itype))
stop(paste("'type' should be", paste(dQuote(TYPES), collapse = " or ")))
GZ <- grepl("\\.gz$", file, ignore.case = TRUE)
if (length(grep("^(ht|f)tp(s|):", file))) {
url <- file
file <- tempfile()
download.file(url, file)
}
if (inherits(file, "connection")) {
if (!isOpen(file, "rt")) {
open(file, "rt")
on.exit(close(file))
}
x <- scan(file, what = character(), sep = "\n", quiet = TRUE)
x <- charToRaw(paste(x, collapse = "\n"))
} else {
if (GZ) {
file <- gzcon(gzfile(file))
open(file)
x <- raw()
repeat {
y <- readBin(file, "raw", 1e9)
if (!length(y)) break
x <- c(x, y)
}
close(file)
} else {
x <- readBin(file, "raw", file.size(file))
}
}
sz <- length(x)
## if the file is larger than 1 Gb we assume that it is
## UNIX-encoded and skip the search-replace of carriage returns
if (sz < 1e9) {
icr <- which(x == as.raw(0x0d)) # CR
if (length(icr)) x <- x[-icr]
}
res <- .Call(rawStreamToDNAorAAbin, x, itype - 2L)
if (identical(res, 0L)) {
warning("failed to read sequences, returns NULL")
return(NULL)
}
names(res) <- sub("^ +", "", names(res)) # to permit phylosim
class(res) <- c("DNAbin", "AAbin")[itype]
res
}
read.dna <- function(file, format = "interleaved", skip = 0,
nlines = 0, comment.char = "#",
as.character = FALSE, as.matrix = NULL)
{
findFirstNucleotide <- function(x) {
## actually find the 1st non-blank character
## just in case: pat.base <- "[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]{10}"
tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
tmp[1] + attr(tmp, "match.length")
}
getTaxaNames <- function(x) {
x <- sub("^['\" ]+", "", x) # remove the leading quotes and spaces
x <- sub("['\" ]+$", "", x) # " " trailing " " "
x
}
getNucleotide <- function(x) {
x <- gsub(" ", "", x)
x <- strsplit(x, NULL)
tolower(unlist(x))
}
formats <- c("interleaved", "sequential", "fasta", "clustal")
format <- match.arg(format, formats)
if (format == "fasta") {
obj <- read.FASTA(file)
} else {
X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
skip = skip, nlines = nlines, comment.char = comment.char)
if (format %in% formats[1:2]) {
## need to remove the possible leading spaces and/or tabs in the first line
fl <- gsub("^[[:blank:]]+", "", X[1])
fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+")))
if (length(fl) != 2 || any(is.na(fl)))
stop("the first line of the file must contain the dimensions of the data")
n <- fl[1]
s <- fl[2]
obj <- matrix("", n, s)
X <- X[-1]
}
switch(format,
"interleaved" = {
start.seq <- findFirstNucleotide(X[1])
one2n <- 1:n
taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1))
X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n]))
nl <- length(X)
for (i in one2n)
obj[i, ] <- getNucleotide(X[seq(i, nl, n)])
},
"sequential" = {
taxa <- character(n)
j <- 1L # line number
for (i in 1:n) {
start.seq <- findFirstNucleotide(X[j])
taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1))
sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j])))
j <- j + 1L
while (length(sequ) < s) {
sequ <- c(sequ, getNucleotide(X[j]))
j <- j + 1L
}
obj[i, ] <- sequ
}
taxa <- getTaxaNames(taxa)
},
"clustal" = {
X <- X[-1] # drop the line with "Clustal bla bla..."
## find where the 1st sequence starts
start.seq <- findFirstNucleotide(X[1])
## find the lines with *********....
nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "")
stars <- grep(nspaces, X)
## we now know how many sequences in the file:
n <- stars[1] - 1
taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
## need to remove the sequence names before getting the sequences:
X <- substr(X, start.seq, nchar(X))
nl <- length(X)
## find the length of the 1st sequence:
tmp <- getNucleotide(X[seq(1, nl, n + 1)])
s <- length(tmp)
obj <- matrix("", n, s)
obj[1, ] <- tmp
for (i in 2:n)
obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
})
}
if (format != "fasta") {
rownames(obj) <- taxa
if (!as.character) obj <- as.DNAbin(obj)
} else {
LENGTHS <- unique(lengths(obj, use.names = FALSE))
allSameLength <- length(LENGTHS) == 1
if (is.logical(as.matrix)) {
if (as.matrix && !allSameLength)
stop("sequences in FASTA file not of the same length")
} else {
as.matrix <- allSameLength
}
if (as.matrix) {
taxa <- names(obj)
n <- length(obj)
y <- matrix(as.raw(0), n, LENGTHS)
for (i in seq_len(n)) y[i, ] <- obj[[i]]
obj <- y
rownames(obj) <- taxa
class(obj) <- "DNAbin"
}
if (as.character) obj <- as.character(obj)
}
obj
}
read.fastq <- function(file, offset = -33)
{
Z <- scan(file, "", sep="\n", quiet = TRUE)
tmp <- Z[c(TRUE, TRUE, FALSE, FALSE)]
sel <- c(TRUE, FALSE)
tmp[sel] <- gsub("^@", ">", tmp[sel])
fl <- tempfile()
cat(tmp, file = fl, sep = "\n")
DNA <- read.FASTA(fl)
## get the qualities:
tmp <- Z[c(FALSE, FALSE, FALSE, TRUE)]
QUAL <- lapply(tmp, function(x) as.integer(charToRaw(x)))
if (offset) QUAL <- lapply(QUAL, "+", offset)
names(QUAL) <- names(DNA)
attr(DNA, "QUAL") <- QUAL
DNA
}
|