File: read.abif.R

package info (click to toggle)
r-cran-seqinr 3.3-3-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 5,844 kB
  • ctags: 69
  • sloc: ansic: 1,955; makefile: 13
file content (156 lines) | stat: -rw-r--r-- 6,632 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
read.abif <- function(filename, max.bytes.in.file = file.info(filename)$size, 
  pied.de.pilote = 1.2, verbose = FALSE){
  #
  # Suppress warnings when reading strings with internal nul character:
  #
  RTC <- function(x, ...) suppressWarnings(rawToChar(x, ...))
  #
  # Define some shortcuts:
  #
  SInt32 <- function(f, ...) readBin(f, what = "integer", signed = TRUE, endian = "big", size = 4, ...)
  SInt16 <- function(f, ...) readBin(f, what = "integer", signed = TRUE, endian = "big", size = 2, ...)
  SInt8 <- function(f, ...) readBin(f, what = "integer", signed = TRUE, endian = "big", size = 1, ...)
  UInt32 <- function(f, ...) readBin(f, what = "integer", signed = FALSE, endian = "big", size = 4, ...)
  UInt16 <- function(f, ...) readBin(f, what = "integer", signed = FALSE, endian = "big", size = 2, ...)
  UInt8 <- function(f, ...) readBin(f, what = "integer", signed = FALSE, endian = "big", size = 1, ...)
  f32 <- function(f, ...) readBin(f, what = "numeric", size = 4,endian="little", ...)
  f64 <- function(f, ...) readBin(f, what = "numeric", size = 8, endian="little",...)
  #
  # Load raw data in memory:
  #
  fc <- file(filename, open = "rb")
  rawdata <- readBin(fc, what = "raw", n = pied.de.pilote*max.bytes.in.file,endian="little")
  if(verbose) print(paste("number of bytes in file", filename, "is", length(rawdata)))
  close(fc)
  #
  # Make a list to store results:
  #
  res <- list(Header = NULL, Directory = NA, Data = NA)
  #
  # Header section is 128 bytes long, located at a fixed position at the
  # beginning of the file. We essentially need the number of item and dataoffset
  #
  res$Header$abif <- RTC(rawdata[1:4])
  if(res$Header$abif != "ABIF") stop("file not in ABIF format")
  if(verbose) print("OK: File is in ABIF format")

  res$Header$version <- SInt16(rawdata[5:6])
  if(verbose) print(paste("File in ABIF version", res$Header$version/100))
  
  res$Header$DirEntry.name <- rawdata[7:10]
  if(verbose) print(paste("DirEntry name: ", RTC(res$Header$DirEntry.name)))

  res$Header$DirEntry.number <- SInt32(rawdata[11:14])
  if(verbose) print(paste("DirEntry number: ", res$Header$DirEntry.number))
  
  res$Header$DirEntry.elementtype <- SInt16(rawdata[15:16])
  if(verbose) print(paste("DirEntry elementtype: ", res$Header$DirEntry.elementtype))

  res$Header$DirEntry.elementsize <- SInt16(rawdata[17:18])
  if(verbose) print(paste("DirEntry elementsize: ", res$Header$DirEntry.elementsize))

  # This one is important:
  res$Header$numelements <- SInt32(rawdata[19:22])
  if(verbose) print(paste("DirEntry numelements: ", res$Header$numelements))
  
  # This one is important too:
  res$Header$dataoffset <- SInt32(rawdata[27:30])
  if(verbose) print(paste("DirEntry dataoffset: ", res$Header$dataoffset))
  dataoffset <- res$Header$dataoffset + 1 # start position is 1 in R vectors
  
  res$Header$datahandle <- SInt32(rawdata[31:34])
  if(verbose) print(paste("DirEntry datahandle: ", res$Header$datahandle))

  res$Header$unused <- SInt16(rawdata[35:128], n = 47)
  # Should be ingnored and set to zero
  res$Header$unused[1:length(res$Header$unused)] <- 0
  if(verbose) print(paste("DirEntry unused: ", length(res$Header$unused), "2-byte integers"))

  #
  # The directory is located at the offset specified in the header,
  # and consist of an array of directory entries.
  # We scan the directory to put values in a data.frame:
  #
  dirdf <- data.frame(list(name = character(0)))
  dirdf$name <- as.character(dirdf$name) # force to characters
  
  for(i in seq_len(res$Header$numelements)){
    deb <- (i-1)*res$Header$DirEntry.elementsize + dataoffset
    direntry <- rawdata[deb:(deb + res$Header$DirEntry.elementsize)]
    dirdf[i, "name"] <- RTC(direntry[1:4])
    dirdf[i, "tagnumber"] <- SInt32(direntry[5:8])
    dirdf[i, "elementtype"] <- SInt16(direntry[9:10])
    dirdf[i, "elementsize"] <- SInt16(direntry[11:12])
    dirdf[i, "numelements"] <- SInt32(direntry[13:16])
    dirdf[i, "datasize"] <- SInt32(direntry[17:20])
    dirdf[i, "dataoffset"] <- SInt32(direntry[21:24])
  }
  if(verbose){
  	 print("Element found:")
  	 print(dirdf$name)
  }
  #
  # Save Directory and make a list to store data:
  #
  res$Directory <- dirdf
  res$Data <- vector("list", nrow(dirdf))
  names(res$Data) <- paste(dirdf$name, dirdf$tagnumber, sep = ".")
  #
  # Data extraction:
  #
  for(i in seq_len(res$Header$numelements)){
    deb <- (i-1)*res$Header$DirEntry.elementsize + dataoffset
    # Short data are stored in dataoffset directly:
    if(dirdf[i, "datasize"] > 4){
      debinraw <- dirdf[i, "dataoffset"] + 1
    } else {
      debinraw <- deb + 20
    }
    elementtype <- dirdf[i, "elementtype"]
    numelements <- dirdf[i, "numelements"]
    elementsize <- dirdf[i, "elementsize"]
    data <- rawdata[debinraw:(debinraw + numelements*elementsize)]
    # unsigned 8 bits integer:
    if(elementtype == 1) res$Data[[i]] <- UInt8(data, n = numelements)
    # char or signed 8 bits integer
    if(elementtype == 2){
        res$Data[[i]] <- tryCatch(RTC(data),finally=paste(rawToChar(data,multiple=TRUE),collapse=""),error=function(er){cat(paste("an error was detected with the following  message:",er," but this error was fixed\n",sep=" "))})
        }
    
    # unsigned 16 bits integer:
    if(elementtype == 3) res$Data[[i]] <- UInt16(data, n = numelements)
    # short:
    if(elementtype == 4) res$Data[[i]] <- SInt16(data, n = numelements)
    # long:
    if(elementtype == 5) res$Data[[i]] <- SInt32(data, n = numelements)
    # float:
    if(elementtype == 7) res$Data[[i]] <- f32(data, n = numelements)
    # double:
    if(elementtype == 8) res$Data[[i]] <- f64(data, n = numelements)
    # date:
    if(elementtype == 10)
      res$Data[[i]] <- list(year = SInt16(data, n = 1), 
       month = UInt8(data[-(1:2)], n = 1), day = UInt8(data[-(1:3)], n = 1))
    # time:
    if(elementtype == 11)
      res$Data[[i]] <- list(hour = UInt8(data, n = 1), 
        minute = UInt8(data[-1], n = 1), second = UInt8(data[-(1:2)], n = 1),
        hsecond = UInt8(data[-(1:3)], n = 1))
    # pString:
    if(elementtype == 18){
      n <- SInt8(rawdata[debinraw])
      pString <- RTC(rawdata[(debinraw+1):(debinraw+n)])
      res$Data[[i]] <- pString
    }
    # cString:
    if(elementtype == 19) res$Data[[i]] <- RTC(data[1:(length(data) - 1) ])
    # user:
    if(elementtype >= 1024) res$Data[[i]] <- data
    # legacy:
    if(elementtype %in% c(12, 13)) 
      warning("unimplemented legacy type found in file")
    if(elementtype %in% c(6, 9, 14, 15, 16, 17, 20, 128, 256, 384))
      warning("unsupported legacy type found in file")
  }
  return(res)
}