1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
### This file is part of the 'foreign' package for R.
### Functions for reading and writing files in Weka ARFF format.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
### <NOTE>
### String and evaluation types are enclosed by single quotes upon
### writing and enclosing single quotes are removed upon reading.
### Escaped single quotes inside single quotes may also occur.
### </NOTE>
read.arff <-
function(file)
{
## See read.table().
if(is.character(file)) {
file <- file(file, "r")
on.exit(close(file))
}
if(!inherits(file, "connection"))
stop("Argument 'file' must be a character string or connection.")
if(!isOpen(file)) {
open(file, "r")
on.exit(close(file))
}
## Get header.
col_names <- NULL
col_types <- NULL
col_dfmts <- character()
line <- readLines(file, n = 1L)
while(length(line) &&
regexpr('^[[:space:]]*@(?i)data', line, perl = TRUE) == -1L) {
if(regexpr('^[[:space:]]*@(?i)attribute', line,
perl = TRUE) > 0L) {
con <- textConnection(line)
line <- scan(con, character(), quiet = TRUE)
close(con)
if(length(line) < 3L)
stop("Invalid attribute specification.")
col_names <- c(col_names, line[2L])
if((type <- tolower(line[3L])) == "date") {
col_types <- c(col_types, "character")
col_dfmts <- c(col_dfmts,
if(length(line) > 3L)
ISO_8601_to_POSIX_datetime_format(line[4L])
else "%Y-%m-%d %H:%M:%S")
}
else if(type == "relational")
stop("Type 'relational' currently not implemented.")
else {
type <- sub("\\{.*", "factor", type)
## (Could try to preserve factor levels ...)
type <- sub("string", "character", type)
type <- sub("real", "numeric", type)
col_types <- c(col_types, type)
col_dfmts <- c(col_dfmts, NA)
}
}
line <- readLines(file, n = 1L)
}
## Test header.
if(length(line) == 0L)
stop("Missing data section.")
if(is.null(col_names))
stop("Missing attribute section.")
if(length(col_names) !=
length(grep('factor|numeric|character', col_types)))
stop("Invalid type specification.")
## Get data.
data <- read.table(file, sep = ",", na.strings = "?",
colClasses = col_types, comment.char = '%')
if(any(ind <- which(!is.na(col_dfmts))))
for(i in ind)
data[i] <- as.data.frame(strptime(data[[i]], col_dfmts[i]))
## Remove left over escapes.
for (i in seq_len(length(data)))
if (is.factor(data[[i]]))
levels(data[[i]]) <- gsub("\\\\", "", levels(data[[i]]))
names(data) <- col_names
data
}
write.arff <-
function(x, file, eol = "\n", relation = deparse(substitute(x)))
{
## See write.table().
if(file == "")
file <- stdout()
else if(is.character(file)) {
file <- file(file, "wb")
on.exit(close(file))
}
if(!inherits(file, "connection"))
stop("Argument 'file' must be a character string or connection.")
if (!is.data.frame(x) && !is.matrix(x))
x <- data.frame(x)
## We need to quote for ourselves, as write.table() escapes the quote
## char but not the backslash. Weka seems to prefer backslash
## escapes inside single quotes, so we provide that ...
squote <- function(s) {
## Don't quote NAs.
ifelse(is.na(s), s,
sprintf("'%s'", gsub("(['\\])", "\\\\\\1", s)))
}
spquote <- function(s) {
if (length(grep("^[[:alpha:]]", s)) == 0L) s <- paste("X", s, sep="")
if (length(grep(" ", s))) s <- paste('"', s, '"', sep="")
s
}
## Write header. Quote, mangle if necessary.
text <- paste('@relation', spquote(make.names(relation)))
writeLines(text, file, sep = eol)
for (name in colnames(x)) {
## Attribute names need to start with a letter, quoted if contain spaces.
text <- paste('@attribute', spquote(name))
if (is.data.frame(x) && is.factor(x[[name]])) {
lev <- squote(levels(x[[name]]))
levels(x[[name]]) <- lev
text <- paste(text, " {", paste(lev, collapse = ","), "}", sep = "")
}
else if (is.character(x[,name])) {
text <- paste(text, "string")
x[,name] <- squote((x[,name]))
}
else if (inherits(x[,name], "Date")) {
text <- paste(text, "date \"yyyy-MM-dd\"")
x[,name] <- squote(format(x[,name]))
}
else if (inherits(x[,name], "POSIXt")) {
text <- paste(text, "date \"yyyy-MM-dd HH:mm:ss\"")
x[,name] <- squote(format(x[,name]))
}
else
text <- paste(text, "numeric")
writeLines(text, file, sep = eol)
}
## Write data.
writeLines("@data", file)
write.table(x, file = file, na = "?", sep = ",",
eol = eol, quote = FALSE, row.names = FALSE,
col.names = FALSE)
}
ISO_8601_to_POSIX_datetime_format <-
function(x)
{
## First, Weka thinks that 'yyyy' is ISO 8601 ...
x <- sub("yyyy", "%Y", x, ignore.case = TRUE)
## And it's 'DD' and not 'dd' ...
x <- sub("dd", "%d", x)
## And it's 'hh' and not 'HH' ...
x <- sub("HH", "%H", x)
## Now the real stuff.
## Is there a POSIX format string for the century component of year?
x <- sub("CCYY", "%Y", x)
x <- sub("YY", "%y", x)
x <- sub("MM", "%m", x)
x <- sub("DD", "%d", x)
x <- sub("DDD", "%j", x)
x <- sub("ww", "%U", x)
x <- sub("D", "%w", x)
x <- sub("hh", "%H", x)
x <- sub("mm", "%M", x)
x <- sub("ss", "%S", x)
## Is there a POSIX format string for fractions of seconds? [No]
x
}
|