File: mice.impute.pmm.Rd

package info (click to toggle)
r-cran-mice 3.17.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,380 kB
  • sloc: cpp: 121; sh: 25; makefile: 2
file content (216 lines) | stat: -rw-r--r-- 7,907 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mice.impute.pmm.R
\name{mice.impute.pmm}
\alias{mice.impute.pmm}
\alias{pmm}
\title{Imputation by predictive mean matching}
\usage{
mice.impute.pmm(
  y,
  ry,
  x,
  wy = NULL,
  donors = 5L,
  matchtype = 1L,
  exclude = NULL,
  quantify = TRUE,
  trim = 1L,
  ridge = 1e-05,
  use.matcher = FALSE,
  ...
)
}
\arguments{
\item{y}{Vector to be imputed}

\item{ry}{Logical vector of length \code{length(y)} indicating
the subset \code{y[ry]} of elements in \code{y} to which the imputation
model is fitted. The \code{ry} generally distinguishes the observed
(\code{TRUE}) and missing values (\code{FALSE}) in \code{y}.}

\item{x}{Numeric design matrix with \code{length(y)} rows with predictors for
\code{y}. Matrix \code{x} must not contain missing values.}

\item{wy}{Logical vector of length \code{length(y)}. A \code{TRUE} value
indicates locations in \code{y} for which imputations are created.}

\item{donors}{The size of the donor pool among which a draw is made.
The default is \code{donors = 5L}. Setting \code{donors = 1L} always selects
the closest match, but is not recommended. Values between 3L and 10L
provide the best results in most cases (Morris et al., 2015).}

\item{matchtype}{Type of matching distance. The default choice
(\code{matchtype = 1L}) calculates the distance between
the \emph{predicted} value of \code{yobs} and
the \emph{drawn} values of \code{ymis} (called type-1 matching).
Other choices are \code{matchtype = 0L}
(distance between predicted values) and \code{matchtype = 2L}
(distance between drawn values).}

\item{exclude}{Dependent values to exclude from the imputation model
and the collection of donor values}

\item{quantify}{Logical. If \code{TRUE}, factor levels are replaced
by the first canonical variate before fitting the imputation model.
If \code{FALSE}, the procedure reverts to the old behaviour and takes the
integer codes (which may lack a sensible interpretation).
Relevant only if \code{y} is a factor.}

\item{trim}{Scalar integer. Minimum number of observations required in a
category in order to be considered as a potential donor value.
Relevant only if \code{y} is a factor.}

\item{ridge}{The ridge penalty used in \code{.norm.draw()} to prevent
problems with multicollinearity. The default is \code{ridge = 1e-05},
which means that 0.001 percent of the diagonal is added to the cross-product.
Larger ridges may result in more biased estimates. For highly noisy data
(e.g. many junk variables), set \code{ridge = 1e-06} or even lower to
reduce bias. For highly collinear data, set \code{ridge = 1e-04} or higher.}

\item{use.matcher}{Logical. Set \code{use.matcher = TRUE} to specify
the C function \code{matcher()}, the now deprecated matching function that
was default in versions
\code{2.22} (June 2014) to \code{3.11.7} (Oct 2020). Since version \code{3.12.0}
\code{mice()} uses the much faster \code{matchindex} C function. Use
the deprecated \code{matcher} function only for exact reproduction.}

\item{\dots}{Other named arguments.}
}
\value{
Vector with imputed data, same type as \code{y}, and of length
\code{sum(wy)}
}
\description{
Imputation by predictive mean matching
}
\details{
Imputation of \code{y} by predictive mean matching, based on
van Buuren (2012, p. 73). The procedure is as follows:

\enumerate{
\item{Calculate the cross-product matrix \eqn{S=X_{obs}'X_{obs}}.}
\item{Calculate \eqn{V = (S+{diag}(S)\kappa)^{-1}}, with some small ridge
parameter \eqn{\kappa}.}
\item{Calculate regression weights \eqn{\hat\beta = VX_{obs}'y_{obs}}.}
\item{Draw \eqn{q} independent \eqn{N(0,1)} variates in vector \eqn{\dot z_1}.}
\item{Calculate \eqn{V^{1/2}} by Cholesky decomposition.}
\item{Calculate \eqn{\dot\beta = \hat\beta + \dot\sigma\dot z_1 V^{1/2}}.}
\item{Calculate \eqn{\dot\eta(i,j)=|X_{{obs},[i]}\hat\beta-X_{{mis},[j]}\dot\beta|}
with \eqn{i=1,\dots,n_1} and \eqn{j=1,\dots,n_0}.}
\item{Construct \eqn{n_0} sets \eqn{Z_j}, each containing \eqn{d}
candidate donors, from \eqn{y_{obs}} such that \eqn{\sum_d\dot\eta(i,j)} is
minimum for all \eqn{j=1,\dots,n_0}. Break ties randomly.}
\item{Draw one donor \eqn{i_j} from \eqn{Z_j} randomly for \eqn{j=1,\dots,n_0}.}
\item{Calculate imputations \eqn{\dot y_j = y_{i_j}} for \eqn{j=1,\dots,n_0}.}
}

The name \emph{predictive mean matching} was proposed by Little (1988).
}
\examples{
# We normally call mice.impute.pmm() from within mice()
# But we may call it directly as follows (not recommended)

set.seed(53177)
xname <- c("age", "hgt", "wgt")
r <- stats::complete.cases(boys[, xname])
x <- boys[r, xname]
y <- boys[r, "tv"]
ry <- !is.na(y)
table(ry)

# proportion of missing data in tv
sum(!ry) / length(ry)

# Impute missing tv data
yimp <- mice.impute.pmm(y, ry, x)
length(yimp)
hist(yimp, xlab = "Imputed missing tv")

# Impute all tv data
yimp <- mice.impute.pmm(y, ry, x, wy = rep(TRUE, length(y)))
length(yimp)
hist(yimp, xlab = "Imputed missing and observed tv")
plot(jitter(y), jitter(yimp),
  main = "Predictive mean matching on age, height and weight",
  xlab = "Observed tv (n = 224)",
  ylab = "Imputed tv (n = 224)"
)
abline(0, 1)
cor(y, yimp, use = "pair")

# Use blots to exclude different values per column
# Create blots object
blots <- make.blots(boys)
# Exclude ml 1 through 5 from tv donor pool
blots$tv$exclude <- c(1:5)
# Exclude 100 random observed heights from tv donor pool
blots$hgt$exclude <- sample(unique(boys$hgt), 100)
imp <- mice(boys, method = "pmm", print = FALSE, blots = blots, seed=123)
blots$hgt$exclude \%in\% unlist(c(imp$imp$hgt)) # MUST be all FALSE
blots$tv$exclude \%in\% unlist(c(imp$imp$tv)) # MUST be all FALSE

# Factor quantification
xname <- c("age", "hgt", "wgt")
br <- boys[c(1:10, 101:110, 501:510, 601:620, 701:710), ]
r <- stats::complete.cases(br[, xname])
x <- br[r, xname]
y <- factor(br[r, "tv"])
ry <- !is.na(y)
table(y)

# impute factor by optimizing canonical correlation y, x
mice.impute.pmm(y, ry, x)

# only categories with at least 2 cases can be donor
mice.impute.pmm(y, ry, x, trim = 2L)

# in addition, eliminate category 20
mice.impute.pmm(y, ry, x, trim = 2L, exclude = 20)

# to get old behavior: as.integer(y)
mice.impute.pmm(y, ry, x, quantify = FALSE)
}
\references{
Little, R.J.A. (1988), Missing data adjustments in large surveys
(with discussion), Journal of Business & Economic Statistics, 6, 287--301.

Morris TP, White IR, Royston P (2015). Tuning multiple imputation by predictive
mean matching and local residual draws. BMC Med Res Methodol. 14:75.

Van Buuren, S. (2018).
\href{https://stefvanbuuren.name/fimd/sec-pmm.html}{\emph{Flexible Imputation of Missing Data. Second Edition.}}
Chapman & Hall/CRC. Boca Raton, FL.

Van Buuren, S., Groothuis-Oudshoorn, K. (2011). \code{mice}: Multivariate
Imputation by Chained Equations in \code{R}. \emph{Journal of Statistical
Software}, \bold{45}(3), 1--67. \doi{10.18637/jss.v045.i03}
}
\seealso{
Other univariate imputation functions: 
\code{\link{mice.impute.cart}()},
\code{\link{mice.impute.lasso.logreg}()},
\code{\link{mice.impute.lasso.norm}()},
\code{\link{mice.impute.lasso.select.logreg}()},
\code{\link{mice.impute.lasso.select.norm}()},
\code{\link{mice.impute.lda}()},
\code{\link{mice.impute.logreg}()},
\code{\link{mice.impute.logreg.boot}()},
\code{\link{mice.impute.mean}()},
\code{\link{mice.impute.midastouch}()},
\code{\link{mice.impute.mnar.logreg}()},
\code{\link{mice.impute.mpmm}()},
\code{\link{mice.impute.norm}()},
\code{\link{mice.impute.norm.boot}()},
\code{\link{mice.impute.norm.nob}()},
\code{\link{mice.impute.norm.predict}()},
\code{\link{mice.impute.polr}()},
\code{\link{mice.impute.polyreg}()},
\code{\link{mice.impute.quadratic}()},
\code{\link{mice.impute.rf}()},
\code{\link{mice.impute.ri}()}
}
\author{
Gerko Vink, Stef van Buuren, Karin Groothuis-Oudshoorn
}
\concept{univariate imputation functions}
\keyword{datagen}