File: arff.R

package info (click to toggle)
foreign 0.8.71-1
links: PTS, VCS
area: main
in suites: buster
size: 1,632 kB
sloc: ansic: 7,543; asm: 4; makefile: 2
file content (188 lines) | stat: -rw-r--r-- 6,575 bytes
parent folder | download | duplicates (6)
### This file is part of the 'foreign' package for R.
### Functions for reading and writing files in Weka ARFF format.

#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/

### <NOTE>
### String and evaluation types are enclosed by single quotes upon
### writing and enclosing single quotes are removed upon reading.
### Escaped single quotes inside single quotes may also occur.
### </NOTE>

read.arff <-
function(file)
{
    ## See read.table().
    if(is.character(file)) {
        file <- file(file, "r")
        on.exit(close(file))
    }
    if(!inherits(file, "connection"))
        stop("Argument 'file' must be a character string or connection.")
    if(!isOpen(file)) {
        open(file, "r")
        on.exit(close(file))
    }

    ## Get header.
    col_names <- NULL
    col_types <- NULL
    col_dfmts <- character()
    line <- readLines(file, n = 1L)
    while(length(line) &&
          regexpr('^[[:space:]]*@(?i)data', line, perl = TRUE) == -1L) {
        if(regexpr('^[[:space:]]*@(?i)attribute', line,
                   perl = TRUE) > 0L) {
            con <- textConnection(line)
            line <- scan(con, character(), quiet = TRUE)
            close(con)
            if(length(line) < 3L)
                stop("Invalid attribute specification.")
            col_names <- c(col_names, line[2L])
            if((type <- tolower(line[3L])) == "date") {
                col_types <- c(col_types, "character")
                col_dfmts <- c(col_dfmts,
                               if(length(line) > 3L)
                               ISO_8601_to_POSIX_datetime_format(line[4L])
                               else "%Y-%m-%d %H:%M:%S")
            }
            else if(type == "relational")
                stop("Type 'relational' currently not implemented.")
            else {
                type <- sub("\\{.*", "factor", type)
                ## (Could try to preserve factor levels ...)
                type <- sub("string", "character", type)
                type <- sub("real", "numeric", type)
                col_types <- c(col_types, type)
                col_dfmts <- c(col_dfmts, NA)
            }
        }
        line <- readLines(file, n = 1L)
    }

    ## Test header.
    if(length(line) == 0L)
        stop("Missing data section.")
    if(is.null(col_names))
        stop("Missing attribute section.")
    if(length(col_names) !=
       length(grep('factor|numeric|character', col_types)))
        stop("Invalid type specification.")

    ## Get data.
    data <- read.table(file, sep = ",", na.strings = "?",
                       colClasses = col_types, comment.char = '%')
    if(any(ind <- which(!is.na(col_dfmts))))
        for(i in ind)
            data[i] <- as.data.frame(strptime(data[[i]], col_dfmts[i]))
    ## Remove left over escapes.
    for (i in seq_len(length(data)))
        if (is.factor(data[[i]]))
           levels(data[[i]]) <- gsub("\\\\", "", levels(data[[i]]))
    names(data) <- col_names
    data
}

write.arff <-
function(x, file, eol = "\n", relation = deparse(substitute(x)))
{
    ## See write.table().
    if(file == "")
        file <- stdout()
    else if(is.character(file)) {
        file <- file(file, "wb")
        on.exit(close(file))
    }
    if(!inherits(file, "connection"))
        stop("Argument 'file' must be a character string or connection.")

    if (!is.data.frame(x) && !is.matrix(x))
        x <- data.frame(x)

    ## We need to quote for ourselves, as write.table() escapes the quote
    ## char but not the backslash.  Weka seems to prefer backslash
    ## escapes inside single quotes, so we provide that ...
    squote <- function(s) {
        ## Don't quote NAs.
        ifelse(is.na(s), s,
               sprintf("'%s'", gsub("(['\\])", "\\\\\\1", s)))
    }
    spquote <- function(s) {
        if (length(grep("^[[:alpha:]]", s)) == 0L) s <- paste("X", s, sep="")
        if (length(grep(" ", s))) s <- paste('"', s, '"', sep="")
        s
    }

    ## Write header.  Quote, mangle if necessary.
    text <- paste('@relation', spquote(make.names(relation)))
    writeLines(text, file, sep = eol)
    for (name in colnames(x)) {
        ## Attribute names need to start with a letter, quoted if contain spaces.
        text <- paste('@attribute', spquote(name))
        if (is.data.frame(x) && is.factor(x[[name]])) {
            lev <- squote(levels(x[[name]]))
            levels(x[[name]]) <- lev
            text <- paste(text, " {", paste(lev, collapse = ","), "}", sep = "")
        }
        else if (is.character(x[,name])) {
            text <- paste(text, "string")
            x[,name] <- squote((x[,name]))
        }
        else if (inherits(x[,name], "Date")) {
            text <- paste(text, "date \"yyyy-MM-dd\"")
            x[,name] <- squote(format(x[,name]))
        }
        else if (inherits(x[,name], "POSIXt")) {
            text <- paste(text, "date \"yyyy-MM-dd HH:mm:ss\"")
            x[,name] <- squote(format(x[,name]))
        }
        else
            text <- paste(text, "numeric")
        writeLines(text, file, sep = eol)
    }

    ## Write data.
    writeLines("@data", file)
    write.table(x, file = file, na = "?", sep = ",",
                eol = eol, quote = FALSE, row.names = FALSE,
                col.names = FALSE)
}


ISO_8601_to_POSIX_datetime_format <-
function(x)
{
    ## First, Weka thinks that 'yyyy' is ISO 8601 ...
    x <- sub("yyyy", "%Y", x, ignore.case = TRUE)
    ## And it's 'DD' and not 'dd' ...
    x <- sub("dd", "%d", x)
    ## And it's 'hh' and not 'HH' ...
    x <- sub("HH", "%H", x)

    ## Now the real stuff.
    ## Is there a POSIX format string for the century component of year?
    x <- sub("CCYY", "%Y", x)
    x <- sub("YY", "%y", x)
    x <- sub("MM", "%m", x)
    x <- sub("DD", "%d", x)
    x <- sub("DDD", "%j", x)
    x <- sub("ww", "%U", x)
    x <- sub("D", "%w", x)
    x <- sub("hh", "%H", x)
    x <- sub("mm", "%M", x)
    x <- sub("ss", "%S", x)
    ## Is there a POSIX format string for fractions of seconds? [No]

    x
}