1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
|
import.msf <- function (file, aa.to.upper = TRUE, gap.to.dash = TRUE) {
if(missing(file)) {
stop("file is missing")
}
# Read as a vector of lines
lines <- readLines(file)
# Check msf format
check1 <- grep("MSF:.*Type:.*Check:.*", lines)
check2 <- grep("Name:.*Len:.*Check:.*Weight:.*", lines)
limit <- grep("^//", lines)
if (length(check1) == 0 || length(check2) == 0 || length(limit) == 0) {
stop("file is not in msf format")
}
# Get sequence identifiers from header
id.head <- sub("^\\s*Name:\\s+(\\S+).*$","\\1", lines[check2])
nb.seq <- length(id.head)
# Check duplicated identifiers in header
if(any(duplicated(id.head))){
stop("duplicated identifiers in header")
}
# Get sequence identifiers from alignment
align <- grep("^\\s*\\S+\\s+[^1-9]+$", lines[limit:length(lines)], value = TRUE)
id.align <- sub("^\\s*(\\S+)\\s+[^1-9]+$", "\\1", align)
# Localize sequence pieces for each identifier
loc <- lapply(seq_len(nb.seq), function(i) {which(id.align == id.head[i])})
# Paste and clean sequences
seq <- sapply(loc, function(i) {paste(sub("^\\s*\\S+\\s+([^1-9]+)$", "\\1", align[i]), collapse = "")})
seq <- gsub("\\s", "", seq)
# Turn aa into upper case
if (aa.to.upper)
seq <- toupper(seq)
# Give a list of split sequences
seq <- strsplit(seq, split = "")
names(seq) <- id.head
# Turn gap into dash character
if (gap.to.dash)
seq <- lapply(seq, function (i) {i[is.gap(i)] <- "-"; return(i)})
class (seq) <- c("align")
return(seq)
}
|