1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
|
### This file is part of the 'foreign' package for R.
###
### Read SPSS system data files
###
### Copyright 2000-2002 Saikat DebRoy <saikat$stat.wisc.edu>
### Douglas M. Bates <bates$stat.wisc.edu>,
### Thomas Lumley
### Copyright 2007-9 R Core Development Team
### This file is part of the `foreign' package for R and related languages.
### It is made available under the terms of the GNU General Public
### License, version 2, or at your option, any later version,
### incorporated herein by reference.
###
### This program is distributed in the hope that it will be
### useful, but WITHOUT ANY WARRANTY; without even the implied
### warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
### PURPOSE. See the GNU General Public License for more
### details.
###
### You should have received a copy of the GNU General Public
### License along with this program; if not, a copy is available at
### http://www.r-project.org/Licenses/
## Read an SPSS data file via the compiled reader `do_read_SPSS' and
## post-process the result (re-encoding, user-defined missing values,
## value labels, optional conversion to a data frame).
##
## Arguments:
##   file               file name, or an http/https/ftp URL (which is first
##                      downloaded to a temporary file, removed on exit).
##   use.value.labels   if TRUE, variables with value labels are converted
##                      to factors, provided there are at least as many
##                      labels as distinct non-NA values and no more than
##                      'max.value.labels' distinct values.
##   to.data.frame      if TRUE, the result is passed through
##                      as.data.frame(), keeping "variable.labels" (and
##                      "codepage" when it is > 500) as attributes.
##   max.value.labels   upper limit on distinct values for factor conversion.
##   trim.factor.names  if TRUE, trailing spaces are removed from the
##                      factor labels.
##   trim_values        if TRUE, trailing spaces are removed from values and
##                      levels before they are matched.
##   reencode           FALSE: never re-encode.  TRUE: re-encode from the
##                      code page recorded in the file.  NA: as TRUE, but
##                      only in a UTF-8 locale.  A character string is used
##                      as the name of the source encoding.
##   use.missings       if TRUE, values declared as user-missing are set
##                      to NA.
##
## Value: the (possibly modified) object returned by the compiled reader,
## or a data frame built from it when 'to.data.frame' is TRUE.
read.spss <- function(file, use.value.labels = TRUE, to.data.frame = FALSE,
                      max.value.labels = Inf, trim.factor.names = FALSE,
                      trim_values = TRUE, reencode = NA,
                      use.missings = to.data.frame)
{
    ## strip trailing blanks, but only when asked to
    trim <- function(strings, trim=TRUE)
        if (trim) sub(" +$","",strings) else strings

    ## mappings taken from win-iconv
    knownCP <- c("UCS-2LE" = 1200, "UCS-2BE" = 1201,
                 "macroman" = 10000, "UCS-4LE" = 12000, "UCS-4BE" = 12001,
                 "koi8-r" = 20866, "koi8-u" = 21866,
                 "latin1" = 28591, "latin2" = 28592, "latin3" = 28593,
                 ## latin-9 seems most portable, but only on Windows
                 ## R >= 2.10.0.  libiconv doesn't know latin9.
                 "latin4" = 28594, "latin-9" = 28605,
                 "ISO-2022-JP" = 50221, "euc-jp" = 51932,
                 "UTF-8" = 65001,
                 "ASCII" = 20127,
                 ## pages known to glibc and libiconv
                 "CP1250" = 1250,
                 "CP1251" = 1251,
                 "CP1252" = 1252,
                 "CP1253" = 1253,
                 "CP1254" = 1254,
                 "CP1255" = 1255,
                 "CP1256" = 1256,
                 "CP1257" = 1257,
                 "CP1258" = 1258,
                 "CP874" = 874,
                 "CP936" = 936)

    ## remote files are fetched to a temporary file that is removed on exit
    if(length(grep("^(http|ftp|https)://", file))) {
        tmp <- tempfile()
        download.file(file, tmp, quiet = TRUE, mode = "wb")
        file <- tmp
        on.exit(unlink(tmp), add = TRUE)
    }

    rval <- .Call(do_read_SPSS, file)

    codepage <- attr(rval, "codepage")
    if(is.null(codepage)) codepage <- 2 # .por files
    if(!capabilities("iconv")) reencode <- FALSE

    ## Work out the source encoding 'cp' and reduce 'reencode' to TRUE/FALSE.
    if(!identical(reencode, FALSE)) {
        cp <- "unknown"
        if(is.character(reencode)) {
            cp <- reencode
            reencode <- TRUE
        } else if(codepage == 20127) {
            reencode <- FALSE # ASCII
        } else if(m <- match(codepage, knownCP, 0L)) {
            cp <- names(knownCP)[m]
        } else if (codepage < 200) {
            ## small numbers are not codepages, and real codepages are large
            attr(rval, "codepage") <- NULL
            reencode <- FALSE
        } else cp <- paste("CP", codepage, sep="")
        ## reencode = NA means "only in a UTF-8 locale"
        if(is.na(reencode)) reencode <- l10n_info()[["UTF-8"]]

        if(reencode) {
            message("re-encoding from ", cp)
            names(rval) <- iconv(names(rval), cp, "")
            vl <- attr(rval, "variable.labels")
            nm <- names(vl)
            vl <- iconv(vl, cp, "")
            names(vl) <- iconv(nm, cp, "")
            attr(rval, "variable.labels") <- vl
            for(i in seq_along(rval)) {
                xi <- rval[[i]]
                if(is.character(xi)) rval[[i]] <- iconv(xi, cp, "")
            }
        }
    }

    miss <- attr(rval, "missings")
    if(!is.null(miss)) {
        if(reencode) {
            ## convert exactly once: a second pass would corrupt non-ASCII
            ## names so that they no longer match names(rval)
            names(miss) <- iconv(names(miss), cp, "")
            for(i in seq_along(miss))
                if(is.character(miss[[i]]$value))
                    miss[[i]]$value <- iconv(miss[[i]]$value, cp, "")
            attr(rval, "missings") <- miss
        }
        if(use.missings)
            for(v in names(rval)) {
                tp <- miss[[v]]$type
                ## a variable without a missings entry gives tp = NULL,
                ## which would make the tests below fail
                if(is.null(tp) || tp %in% "none") next
                if(tp %in% c("one", "two", "three")) {
                    xi <- rval[[v]]
                    other <- miss[[v]]$value
                    ## FIXME: do we need to worry about padding for string vals?
                    xi[xi %in% other] <- NA
                    rval[[v]] <- xi
                    ## NB: not much tested from here down
                } else if(tp == "low" || tp == "low+1") {
                    xi <- rval[[v]]
                    z <- miss[[v]]$value
                    if(tp == "low+1") xi[ xi <= z[1L] | xi == z[2L] ] <- NA
                    else xi[xi <= z[1L]] <- NA
                    rval[[v]] <- xi
                } else if(tp == "high" || tp == "high+1") {
                    xi <- rval[[v]]
                    z <- miss[[v]]$value
                    if(tp == "high+1") xi[ xi >= z[1L] | xi == z[2L] ] <- NA
                    else xi[ xi >= z[1L] ] <- NA
                    rval[[v]] <- xi
                } else if(tp == "range" || tp == "range+1") {
                    xi <- rval[[v]]
                    z <- miss[[v]]$value
                    ## A value is missing when it lies inside the closed
                    ## interval, i.e. both bounds must hold ('&'): with '|'
                    ## every non-NA value would be set to NA.
                    ## NOTE(review): assumes z[1L] is the lower and z[2L]
                    ## the upper bound -- confirm against the C reader.
                    in.range <- xi >= z[1L] & xi <= z[2L]
                    if(tp == "range+1")
                        xi[ in.range | xi == z[3L] ] <- NA
                    else
                        xi[ in.range ] <- NA
                    rval[[v]] <- xi
                } else
                    warning(gettextf("missingness type %s is not handled", tp),
                            domain = NA)
            }
    } else use.missings <- FALSE

    vl <- attr(rval,"label.table")
    if(reencode) names(vl) <- iconv(names(vl), cp, "")
    ## vapply() keeps a logical result even for an empty label table,
    ## where sapply() would return list() and '!' would fail
    has.vl <- which(!vapply(vl, is.null, logical(1L)))
    for(v in has.vl) {
        nm <- names(vl)[[v]]
        nvalues <- length(na.omit(unique(rval[[nm]])))
        ## NOTE(review): counted before labels of user-missing values are
        ## dropped below, so it may overstate the labels actually used.
        nlabels <- length(vl[[v]])
        if(reencode && nlabels) {
            nm2 <- names(vl[[v]])
            vl[[v]] <- iconv(vl[[v]], cp, "")
            names(vl[[v]]) <- iconv(nm2, cp, "")
        }
        ## NOTE(review): 'miss' is indexed by position here, which presumes
        ## it is ordered like the label table -- confirm.
        if(use.missings && !is.null(mv <- miss[[v]]$value))
            vl[[v]] <- vl[[v]][! vl[[v]] %in% mv]
        if (use.value.labels &&
            (!is.finite(max.value.labels) || nvalues <= max.value.labels) &&
            nlabels >= nvalues) {
            rval[[nm]] <- factor(trim(rval[[nm]], trim_values),
                                 levels = rev(trim(vl[[v]], trim_values)),
                                 labels = rev(trim(names(vl[[v]]), trim.factor.names)))
        } else
            attr(rval[[nm]], "value.labels") <- vl[[v]]
    }
    if(reencode) attr(rval, "label.table") <- vl

    if (to.data.frame) {
        ## as.data.frame() does not carry these attributes over
        varlab <- attr(rval, "variable.labels")
        rval <- as.data.frame(rval)
        attr(rval, "variable.labels") <- varlab
        if(codepage > 500) attr(rval, "codepage") <- codepage
    }
    rval
}
|