1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
|
\title{MM-type Estimators for Linear Regression}
\name{lmrob}
\encoding{utf8}
\alias{lmrob}
% "Link to here", even those are not exported:
\alias{.vcov.avar1}
\alias{.vcov.w}
\description{
Computes fast MM-type estimators for linear (regression) models.
}
\usage{
lmrob(formula, data, subset, weights, na.action, method = "MM",
model = TRUE, x = !control$compute.rd, y = FALSE,
singular.ok = TRUE, contrasts = NULL, offset = NULL,
control = NULL, init = NULL, ...)
}
\arguments{
\item{formula}{a symbolic description of the model to be fit. See
\code{\link{lm}} and \code{\link{formula}} for more details.}
\item{data}{an optional data frame, list or environment (or object
coercible by \code{\link{as.data.frame}} to a data frame) containing
the variables in the model. If not found in \code{data}, the
variables are taken from \code{environment(formula)},
typically the environment from which \code{lmrob} is called.}
\item{subset}{an optional vector specifying a subset of observations
to be used in the fitting process.}
\item{weights}{an optional vector of weights to be used in the fitting
process (in addition to the robustness weights computed in the
fitting process).}
\item{na.action}{a function which indicates what should happen
when the data contain \code{NA}s. The default is set by
the \code{na.action} setting of \code{\link{options}}, and is
\code{\link{na.fail}} if that is unset. The \dQuote{factory-fresh}
default is \code{\link{na.omit}}. Another possible value is
\code{NULL}, no action. Value \code{\link{na.exclude}} can be useful.}
\item{method}{string specifying the estimator-chain. \code{MM}
is interpreted as \code{SM}. See \emph{Details}, notably the
currently recommended \code{setting = "KS2014"}.}
\item{model, x, y}{logicals. If \code{TRUE} the corresponding
components of the fit (the model frame, the model matrix, the
response) are returned.}
\item{singular.ok}{logical. If \code{FALSE} (the default in S but
not in \R) a singular fit is an error.}
\item{contrasts}{an optional list. See the \code{contrasts.arg}
of \code{\link{model.matrix.default}}.}
\item{offset}{this can be used to specify an \emph{a priori}
known component to be included in the linear predictor
during fitting. An \code{\link{offset}} term can be included in the
formula instead or as well, and if both are specified their sum is used.}
\item{control}{a \code{\link{list}} specifying control parameters; use
the function \code{\link{lmrob.control}(.)} and see its help page.}
\item{init}{an optional argument to specify or supply the initial
estimate. See \emph{Details}.}
\item{\dots}{additional arguments can be used to specify control
parameters directly instead of (but not in addition to!) via \code{control}.}
}
\details{
\describe{
\item{Overview:}{
This function computes an MM-type regression estimator as
described in Yohai (1987) and Koller and Stahel (2011). By
default it uses a bi-square redescending score function, and it
returns a highly robust and highly efficient estimator (with 50\%
breakdown point and 95\% asymptotic efficiency for normal errors).
The computation is carried out by a call to \code{\link{lmrob.fit}()}.
The argument \code{setting} of \code{\link{lmrob.control}} is provided
to set alternative defaults as suggested in Koller and Stahel (2011)
(\code{setting="KS2011"}; now do use its extension
\code{setting="KS2014"}). For further details, see \code{\link{lmrob.control}}.
}
\item{Initial Estimator \code{init}:}{
The initial estimator may be specified using the argument
\code{init}. This can either be
\itemize{
\item a \emph{string} used to specify built in internal
estimators (currently \code{"S"} and \code{"M-S"}, see \emph{See also}
below);
\item a \code{\link{function}} taking arguments \code{x, y,
control, mf} (where \code{mf} stands for \code{model.frame}) and
returning a \code{\link{list}} containing at least the initial coefficients as
component \code{"coefficients"} and the initial scale estimate as
\code{"scale"}.
\item Or a \code{\link{list}} giving the initial coefficients and
scale as components \code{"coefficients"} and \code{"scale"}. See also
\emph{Examples}.
}
Note that when \code{init} is a function or list, the
\code{method} argument must \emph{not} contain the initial estimator, e.g.,
use \code{MDM} instead of \code{SMDM}.
The default, equivalent to \code{init = "S"}, uses as initial
estimator an S-estimator (Rousseeuw and Yohai, 1984) which is
computed using the Fast-S algorithm of Salibian-Barrera and Yohai
(2006), calling \code{\link{lmrob.S}()}. That function, since
March 2012, by default uses \emph{nonsingular} subsampling which
makes the Fast-S algorithm feasible for categorical data as well,
see Koller (2012). Note that convergence problems may still show
up as warnings, e.g., \preformatted{
S refinements did not converge (to refine.tol=1e-07) in 200 (= k.max) steps
}
and often can simply be remedied by increasing (i.e. weakening)
\code{refine.tol} or increasing the allowed number of iterations
\code{k.max}, see \code{\link{lmrob.control}}.
}
\item{Method \code{method}:}{
The following chain of estimates is customizable via the
\code{method} argument. % of \code{\link{lmrob.control}}.
There are currently two types of estimates available,
\describe{
\item{\code{"M"}:}{corresponds to the standard M-regression
estimate.}
\item{\code{"D"}:}{stands for the Design Adaptive Scale estimate
as proposed in Koller and Stahel (2011).}
}
The \code{method} argument takes a string that specifies the
estimates to be calculated as a chain. Setting
\code{method='SMDM'} will result in an intial S-estimate, followed
by an M-estimate, a Design Adaptive Scale estimate and a final
M-step. For methods involving a \code{D}-step, the default value
of \code{psi} (see \code{\link{lmrob.control}}) is changed to
\code{"lqq"}.
By default, standard errors are computed using the formulas of
Croux, Dhaene and Hoorelbeke (2003) (\code{\link{lmrob.control}}
option \code{cov=".vcov.avar1"}). This method, however, works only
for MM-estimates (i.e., \code{method = "MM"} or \code{ = "SM"}). For other
\code{method} arguments, the covariance matrix estimate used is based on the asymptotic
normality of the estimated coefficients (\code{cov=".vcov.w"}) as
described in Koller and Stahel (2011).
The var-cov computation can be skipped by \code{cov = "none"} and
(re)done later by e.g., \code{vcov(<obj>, cov = ".vcov.w")}.
As of robustbase version 0.91-0 (April 2014), the computation of
robust standard errors for \code{method="SMDM"} has been changed.
The old behaviour can be restored by setting the control parameter
\code{cov.corrfact = "tauold"}.%% FIXME: regr.test for that
}
}%end {describe}
}
\value{
An object of class \code{lmrob}; a list including the following
components:
\item{coefficients}{The estimate of the coefficient vector}
\item{scale}{The scale as used in the M estimator.}
\item{residuals}{Residuals associated with the estimator.}
%loss
\item{converged}{\code{TRUE} if the IRWLS iterations have converged.}
\item{iter}{number of IRWLS iterations}
\item{rweights}{the \dQuote{robustness weights} \eqn{\psi(r_i/S) / (r_i/S)}.}
\item{fitted.values}{Fitted values associated with the estimator.}
%control
\item{init.S}{The \code{\link{list}} returned by \code{\link{lmrob.S}()} or
\code{\link{lmrob.M.S}()} (for MM-estimates, i.e., \code{method="MM"} or \code{"SM"} only)}
\item{init}{A similar list that contains the results of intermediate
estimates (\emph{not} for MM-estimates).}
%qr
\item{rank}{the numeric rank of the fitted linear model.}
\item{cov}{The estimated covariance matrix of the regression
coefficients}
\item{df.residual}{the residual degrees of freedom.}
%degree.freedom
\item{weights}{the specified weights (missing if none were used).}
\item{na.action}{(where relevant) information returned by
\code{\link{model.frame}} on the special handling of \code{NA}s.}
\item{offset}{the offset used (missing if none were used).}
\item{contrasts}{(only where relevant) the contrasts used.}
\item{xlevels}{(only where relevant) a record of the levels of the factors
used in fitting.}
\item{call}{the matched call.}
\item{terms}{the \code{terms} object used.}
%assign
\item{model}{if requested (the default), the model frame used.}
\item{x}{if requested, the model matrix used.}
\item{y}{if requested, the response used.}
In addition, non-null fits will have components \code{assign},
and \code{qr} relating to the linear fit, for use by extractor
functions such as \code{summary}.
}
\references{
Croux, C., Dhaene, G. and Hoorelbeke, D. (2003)
\emph{Robust standard errors for robust estimators},
Discussion Papers Series 03.16, K.U. Leuven, CES.
Koller, M. (2012)
Nonsingular subsampling for S-estimators with categorical predictors,
\emph{ArXiv e-prints} \url{https://arxiv.org/abs/1208.5595};
extended version published as Koller and Stahel (2017), see
\code{\link{lmrob.control}}.
Koller, M. and Stahel, W.A. (2011)
Sharpening Wald-type inference in robust regression for small samples.
\emph{Computational Statistics & Data Analysis} \bold{55}(8), 2504--2515.
Maronna, R. A., and Yohai, V. J. (2000)
Robust regression with both continuous and categorical predictors.
\emph{Journal of Statistical Planning and Inference} \bold{89}, 197--214.
Rousseeuw, P.J. and Yohai, V.J. (1984)
Robust regression by means of S-estimators,
In \emph{Robust and Nonlinear Time Series},
J. Franke, W. Härdle and R. D. Martin (eds.).
Lectures Notes in Statistics 26, 256--272,
Springer Verlag, New York.
Salibian-Barrera, M. and Yohai, V.J. (2006)
A fast algorithm for S-regression estimates,
\emph{Journal of Computational and Graphical Statistics} \bold{15}(2), 414--427.
\doi{10.1198/106186006X113629}
Yohai, V.J. (1987)
High breakdown-point and high efficiency estimates for regression.
\emph{The Annals of Statistics} \bold{15}, 642--65.
Yohai, V., Stahel, W.~A. and Zamar, R. (1991)
A procedure for robust estimation and inference in linear regression;
in Stahel and Weisberg (eds), \emph{Directions in Robust Statistics
and Diagnostics}, Part II, Springer, New York, 365--374;
\doi{10.1007/978-1-4612-4444-8_20}.
}
\author{(mainly:) Matias Salibian-Barrera and Manuel Koller}
\seealso{
\code{\link{lmrob.control}};
for the algorithms \code{\link{lmrob.S}}, \code{\link{lmrob.M.S}} and
\code{\link{lmrob.fit}};
and for methods,
\code{\link{summary.lmrob}}, for the extra \dQuote{statistics},
notably \eqn{R^2} (\dQuote{R squared});
\code{\link{predict.lmrob}},
\code{\link{print.lmrob}}, \code{\link{plot.lmrob}}, and
\code{\link{weights.lmrob}}.
}
\examples{
data(coleman)
set.seed(0)
## Default for a very long time:
summary( m1 <- lmrob(Y ~ ., data=coleman) )
## Nowadays **strongly recommended** for routine use:
summary(m2 <- lmrob(Y ~ ., data=coleman, setting = "KS2014") )
## ------------------
plot(residuals(m2) ~ weights(m2, type="robustness")) ##-> weights.lmrob()
abline(h=0, lty=3)
data(starsCYG, package = "robustbase")
## Plot simple data and fitted lines
plot(starsCYG)
lmST <- lm(log.light ~ log.Te, data = starsCYG)
(RlmST <- lmrob(log.light ~ log.Te, data = starsCYG))
abline(lmST, col = "red")
abline(RlmST, col = "blue")
## --> Least Sq.:/ negative slope \\ robust: slope ~= 2.2 % checked in ../tests/lmrob-data.R
summary(RlmST) # -> 4 outliers; rest perfect
vcov(RlmST)
stopifnot(all.equal(fitted(RlmST),
predict(RlmST, newdata = starsCYG), tol = 1e-14))
## FIXME: setting = "KS2011" or setting = "KS2014" **FAIL** here
##--- 'init' argument -----------------------------------
## 1) string
set.seed(0)
m3 <- lmrob(Y ~ ., data=coleman, init = "S")
stopifnot(all.equal(m1[-18], m3[-18]))
## 2) function
initFun <- function(x, y, control, ...) { # no 'mf' needed
init.S <- lmrob.S(x, y, control)
list(coefficients=init.S$coef, scale = init.S$scale)
}
set.seed(0)
m4 <- lmrob(Y ~ ., data=coleman, method = "M", init = initFun)
## list
m5 <- lmrob(Y ~ ., data=coleman, method = "M",
init = list(coefficients = m3$init$coef, scale = m3$scale))
stopifnot(all.equal(m4[-17], m5[-17]))
}
\keyword{robust}
\keyword{regression}
|