File: import.msf.R

package info (click to toggle)
r-cran-bios2cor 2.2.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,520 kB
  • sloc: makefile: 5
file content (51 lines) | stat: -rw-r--r-- 1,589 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import.msf <- function (file, aa.to.upper = TRUE, gap.to.dash = TRUE) {
  
  if(missing(file)) {
    stop("file is missing")
  }

  # Read as a vector of lines
  lines <- readLines(file)

  # Check msf format
  check1 <- grep("MSF:.*Type:.*Check:.*", lines)
  check2 <- grep("Name:.*Len:.*Check:.*Weight:.*", lines)
  limit <- grep("^//", lines)
  if (length(check1) == 0 || length(check2) == 0 || length(limit) == 0) {
   	stop("file is not in msf format")
	}

  # Get sequence identifiers from header
  id.head <- sub("^\\s*Name:\\s+(\\S+).*$","\\1", lines[check2])
  nb.seq <- length(id.head)

  # Check duplicated identifiers in header
  if(any(duplicated(id.head))){
   stop("duplicated identifiers in header")
  }

  # Get sequence identifiers from alignment
  align <- grep("^\\s*\\S+\\s+[^1-9]+$", lines[limit:length(lines)], value = TRUE)
  id.align <- sub("^\\s*(\\S+)\\s+[^1-9]+$", "\\1", align)

  # Localize sequence pieces for each identifier
  loc <- lapply(seq_len(nb.seq), function(i) {which(id.align == id.head[i])})

  # Paste and clean sequences
  seq <- sapply(loc, function(i) {paste(sub("^\\s*\\S+\\s+([^1-9]+)$", "\\1", align[i]), collapse = "")})
  seq <- gsub("\\s", "", seq)
  
  # Turn aa into upper case
  if (aa.to.upper)
    seq <- toupper(seq)

  # Give a list of split sequences
  seq <- strsplit(seq, split = "")
  names(seq) <- id.head

  # Turn gap into dash character
  if (gap.to.dash)
    seq <- lapply(seq, function (i) {i[is.gap(i)] <- "-"; return(i)})
  class (seq) <- c("align")
  return(seq)
}