File: importers.Rd

package info (click to toggle)
r-cran-memisc 0.99.31.8.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,136 kB
  • sloc: ansic: 5,117; makefile: 2
file content (310 lines) | stat: -rw-r--r-- 12,276 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
\name{importers}
\alias{importer}
\alias{importer-class}
\alias{spss.file}
\alias{spss.portable.file}
\alias{spss.portable.importer-class}
\alias{show,spss.portable.importer-method}
\alias{spss.system.file}
\alias{spss.system.importer-class}
\alias{show,spss.system.importer-method}
\alias{spss.fixed.file}
\alias{spss.fixed.importer-class}
\alias{show,spss.fixed.importer-method}
\alias{Stata.file}
\alias{Stata.importer-class}
\alias{show,Stata.importer-method}
\alias{Stata_new.importer-class}
\alias{show,Stata_new.importer-method}
\alias{as.data.set,importer-method}
\alias{dim,importer-method}
\alias{names,importer-method}
\alias{[,importer,atomic,atomic,ANY-method}
\alias{[,importer,atomic,missing,ANY-method}
\alias{[,importer,missing,atomic,ANY-method}
\alias{[,importer,missing,missing,ANY-method}
\alias{[[,importer-method}
\alias{$,importer-method}
\alias{head,importer-method}
\alias{tail,importer-method}
\alias{initialize,spss.portable.importer-method}
\alias{initialize,spss.system.importer-method}
\alias{initialize,spss.fixed.importer-method}
\alias{initialize,Stata.importer-method}
\alias{initialize,Stata_new.importer-method}
\alias{subset.spss.portable.importer}
\alias{subset.spss.system.importer}
\alias{subset.spss.fixed.importer}
\alias{subset.Stata.importer}
\alias{subset.Stata_new.importer}

\title{Object Oriented Interface to Foreign Files}
\description{
Importer objects are objects that refer to an external
data file. Currently only Stata files,
SPSS system, portable, and fixed-column files are supported.

Data are actually imported by `translating' an
importer object into a \code{\link{data.set}} using
\code{as.data.set} or \code{subset}.

The \code{importer} mechanism is more flexible and extensible
than \code{\link[foreign]{read.spss}} and  \code{\link[foreign]{read.dta}}
of package "foreign", as most of the parsing of the file headers is done in R.
It is also adapted to efficiently load large data sets.
Most importantly, importer objects support the
\code{\link{labels}}, \code{\link{missing.values}},
and \code{\link{description}}s, provided by this package.
}
\usage{
spss.file(file,\dots)

spss.fixed.file(file,
  columns.file,
  varlab.file=NULL,
  codes.file=NULL,
  missval.file=NULL,
  count.cases=TRUE,
  to.lower=getOption("spss.fixed.to.lower",FALSE),
  iconv=TRUE,
  encoded=getOption("spss.fixed.encoding","cp1252"),
  negative2missing = FALSE)

spss.portable.file(file,
  varlab.file=NULL,
  codes.file=NULL,
  missval.file=NULL,
  count.cases=TRUE,
  to.lower=getOption("spss.por.to.lower",FALSE),
  iconv=TRUE,
  encoded=getOption("spss.por.encoding","cp1252"),
  negative2missing = FALSE)

spss.system.file(file,
  varlab.file=NULL,
  codes.file=NULL,
  missval.file=NULL,
  count.cases=TRUE,
  to.lower=getOption("spss.sav.to.lower",FALSE),
  iconv=TRUE,
  encoded=getOption("spss.sav.encoding","cp1252"),
  ignore.scale.info = FALSE,
  negative2missing = FALSE)

Stata.file(file,
           iconv=TRUE,
           encoded=if(new_format)
                        getOption("Stata.new.encoding","utf-8")
                   else getOption("Stata.old.encoding","cp1252"),
           negative2missing = FALSE)

## The most important methods for "importer" objects are:
\method{subset}{spss.system.importer}(x, subset, select, drop = FALSE, \dots)
\method{subset}{spss.portable.importer}(x, subset, select, drop = FALSE, \dots)
\method{subset}{spss.fixed.importer}(x, subset, select, drop = FALSE, \dots)
\method{subset}{Stata.importer}(x, subset, select, drop = FALSE, \dots)
\method{subset}{Stata_new.importer}(x, subset, select, drop = FALSE, \dots)

\S4method{as.data.set}{importer}(x,row.names=NULL,optional=NULL,
                    compress.storage.modes=FALSE,\dots)

\S4method{head}{importer}(x,n=20,\dots)
\S4method{tail}{importer}(x,n=20,\dots)
}
\arguments{
    \item{file}{character string; the path to the file containing
      the data}
    \item{\dots}{Other arguments. \code{spss.file()} passes them on to
        \code{spss.portable.file()} or \code{spss.system.file()}. Other
        functions ignore further arguments.}
    \item{columns.file}{character string; the path to an
        SPSS/PSPP syntax file with a \code{DATA LIST FIXED} statement}
    \item{varlab.file}{character string; the path to an
        SPSS/PSPP syntax file with a \code{VARIABLE LABELS} statement}
    \item{codes.file}{character string; the path to an
        SPSS/PSPP syntax file with a \code{VALUE LABELS} statement}
    \item{missval.file}{character string; the path to an
        SPSS/PSPP syntax file with a \code{MISSING VALUES} statement}
    \item{count.cases}{logical; should cases in file be counted? This
      takes effect only if the data file does not already contain information
      about the number of cases.}
    \item{to.lower}{logical; should variable names be changed to lower
      case?}
    \item{iconv}{logical; should strings (in labels and
      variables) be converted into the encoding of the platform?}
    \item{encoded}{a character string; the way characters are encoded
      in the imported file. For the available encoding options
      see \code{?iconvlist}. For
      \code{spss.system.file} this argument is only a fallback, as the
      function uses the encoding information in the file if it is
      present.}
    \item{negative2missing}{logical; should negative values be marked
      as missing values? This is the convention of some newer data sets that
      are available e.g. from the GESIS data archive.}
    \item{ignore.scale.info}{logical; should information about measurement
      scale levels provided in the file be ignored?}
    \item{x}{an object that inherits from class \code{"importer"}.}
    \item{subset}{a logical vector or an expression containing variables
        from the external data file that evaluates to logical. }
    \item{select}{a vector of variable names from the external data file.
          This may also be a named vector, where the names give
          the names into which the variables from the external data
          file are renamed.}
    \item{drop}{a logical value, that determines what happens if
      only one column is selected. If TRUE and only one column
      is selected, \code{subset} returns only a single \code{item}
      object and not a \code{data.set}.}
    \item{row.names}{ignored, present only for compatibility.}
    \item{optional}{ignored, present only for compatibility.}
    \item{compress.storage.modes}{logical value; if TRUE floating point values
      are converted to integers if possible without loss of information.}
    \item{n}{integer; the number of rows to be shown by \code{head} or \code{tail}}
}
\value{
\code{spss.fixed.file}, \code{spss.portable.file},
\code{spss.system.file}, and \code{Stata.file}
return, respectively, objects of class
\code{"spss.fixed.importer"}, \code{"spss.portable.importer"},
\code{"spss.system.importer"}, \code{"Stata.importer"}, or \code{"Stata_new.importer"},
which, by inheritance, are also objects of class \code{"importer"}.
\code{"Stata.importer"} is for files in the format of Stata versions up
    to 12, while \code{"Stata_new.importer"} is for files in the newer
    format of Stata versions from 13.

Objects of class \code{"importer"} have at least the following two slots:

  \item{ptr}{an external pointer}
  \item{variables}{a list of objects of class \code{"item.vector"} which
    provides a `prototype' for the \code{"data.set"} objects returned
    by the \code{as.data.set} and \code{subset} methods for objects of
    class \code{"importer"} }


The \code{as.data.frame} method for \code{importer} objects does
the actual data import and returns a data frame. Note that in contrast
to \code{\link[foreign]{read.spss}}, the variable names of the
resulting data frame will be lower case, unless the importer function
is called with \code{to.lower=FALSE}. If long variable names
are defined (in case of a PSPP/SPSS system file), they take
precedence and are \emph{not} coerced to lower case.
}
\seealso{ \code{\link{codebook}}, \code{\link{description}},
\code{\link[foreign]{read.spss}}
}
\details{
    A call to a `constructor' for an importer object, that is,
    \code{spss.fixed.file}, \code{spss.portable.file}, \code{spss.system.file},
    or \code{Stata.file},
    causes R to read in the header of the data file and/or
    the syntax files that contain information about
    the variables, such as the columns that they occupy
    (in case of \code{spss.fixed.file}), variable labels,
    value labels and missing values.

    The information in the file header and/or the accompanying
    files is then processed to prepare the file for importing.
    Thus the inner structure of an \code{importer} object may
    well vary according to what type of file is to be imported and
    what additional information is given.

    The \code{as.data.set} and \code{subset} methods
    for \code{"importer"} objects internally use the
    generic functions \code{seekData}, \code{readData}, \code{readSlice},
    and \code{readChunk}, which have methods for the
    subclasses of \code{"importer"}.
    These functions are not callable
    from outside the package, however.

    The \code{subset} method for \code{"importer"} objects reads in
    the data `chunk-wise' to create the subset of observations if
    the option \code{"subset.chunk.size"} is set to a non-\code{NULL}
    value, e.g. by \code{options(subset.chunk.size=1000)}. This may be
    useful in case of very large data sets from which only a tiny subset
    of observations is needed for analysis.
    
    Since the functions described here are a more or less complete rewrite
    based on the description of the file structure provided
    by the documentation for PSPP, they are perhaps not as thoroughly tested
    as the functions in the \code{foreign} package, apart from the frequent
    use by the author of this package.
}
\examples{
# Extract the American National Election Study of 1948 from the zip
# archive shipped with the package. The portable file is unpacked into
# a fresh temporary directory, and unzip() returns its path.
nes1948.por <- unzip(system.file("anes/NES1948.ZIP",package="memisc"),
                     "NES1948.POR",exdir=tempfile())

# Create an importer object for the SPSS portable file. This gathers
# the information about the variables contained in the file.
nes1948 <- spss.portable.file(nes1948.por)

# The data are not yet loaded; show() only displays the importer object:
show(nes1948)

# ... but one can already see what variables are present:
description(nes1948)

# Now a subset of the data is loaded. Only the variables listed
# in 'select' are imported from the file:
vote.socdem.48 <- subset(nes1948,
              select=c(
                  V480018,
                  V480029,
                  V480030,
                  V480045,
                  V480046,
                  V480047,
                  V480048,
                  V480049,
                  V480050
                  ))

# Make the names more descriptive. Each argument has the form
# old_name = "new_name":
vote.socdem.48 <- rename(vote.socdem.48,
                  V480018 = "vote",
                  V480029 = "occupation.hh",
                  V480030 = "unionized.hh",
                  V480045 = "gender",
                  V480046 = "race",
                  V480047 = "age",
                  V480048 = "education",
                  V480049 = "total.income",
                  V480050 = "religious.pref"
        )

# It is also possible to select and rename in one step by passing
# a named vector as 'select', where the names are the new variable
# names (note the form new_name = old_name here):
# vote.socdem.48 <- subset(nes1948,
#              select=c(
#                  vote           = V480018,
#                  occupation.hh  = V480029,
#                  unionized.hh   = V480030,
#                  gender         = V480045,
#                  race           = V480046,
#                  age            = V480047,
#                  education      = V480048,
#                  total.income   = V480049,
#                  religious.pref = V480050
#                  ))



# We examine the imported data more closely:
codebook(vote.socdem.48)

# ... and conduct some analyses.
# Percentages of 'vote' by 'occupation.hh'; t() transposes the table.
t(genTable(percent(vote)~occupation.hh,data=vote.socdem.48))

# We consider only the two main candidates: a copy of 'vote' is made
# in which only the codes 1 and 2 remain valid values, and the two
# corresponding value labels are shortened.
vote.socdem.48 <- within(vote.socdem.48,{
  truman.dewey <- vote
  valid.values(truman.dewey) <- 1:2
  truman.dewey <- relabel(truman.dewey,
              "VOTED - FOR TRUMAN" = "Truman",
              "VOTED - FOR DEWEY"  = "Dewey")
  })

# Logistic regression of voting for Truman (as opposed to Dewey) on
# religious preference. The fitted model is kept in 'truman.relig.glm'
# and its summary is printed. (The call no longer ends in a stray
# trailing comma, which would pass an empty argument to glm.)
summary(truman.relig.glm <- glm((truman.dewey=="Truman")~religious.pref,
    data=vote.socdem.48,
    family="binomial"
))
}
\keyword{file}