1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
# linmod.R: Example S3 linear model.
#
# See www.milbo.org/doc/modguide.pdf.
# This software may be freely used.
linmod <- function(...) UseMethod("linmod")
linmod.default <- function(x = stop("no 'x' argument"),
y = stop("no 'y' argument"),
keep = FALSE,
...)
{
stop.if.dot.arg.used(...)
xmat <- as.matrix(x)
# use name "(Intercept)" here so coef names match linmod.formula
x <- cbind("(Intercept)" = 1, xmat)
fit <- linmod.fit(x, y)
fit$call <- match.call()
if(keep) {
fit$x <- xmat
# save y as a one-column matrix, so can use colname to save response name
colname <- deparse(substitute(y))[1]
colname <- gsub(" ", "", substr(colname, 1, 100)) # strip spaces, truncate
fit$y <- as.matrix(y, ncol = 1)
colnames(fit$y) <- colname
}
fit
}
linmod.formula <- function(formula = stop("no 'formula' argument"),
data = parent.frame(),
keep = FALSE,
...)
{
stop.if.dot.arg.used(...)
if(is.matrix(data)) # allow data to be a matrix
data <- as.data.frame(data) # will create colnames V1 V2 V3 if necessary
# note that na.action=na.pass because we will catch NAs later
# in linmod.fit, for uniformity with linmod.default
mf <- model.frame(formula = formula, data = data, na.action = na.pass)
terms <- attr(mf, "terms")
x <- model.matrix(terms, mf)
y <- model.response(mf)
fit <- linmod.fit(x, y)
fit$call <- match.call()
fit$terms <- terms
fit$xlevels <- .getXlevels(terms, mf) # for use by predict.linmod
if(keep)
fit$data <- data
fit
}
linmod.fit <- function(x = stop("no 'x' argument"),
y = stop("no 'y' argument"),
...)
{
# internal function, not for the casual user
# if model has an intercept, the first col of x must be intercept (all 1s)
stop.if.dot.arg.used(...)
x <- check.linmod.x(x)
y <- check.linmod.y(x, y)
fit <- do.linmod.fit(x, y)
class(fit) <- "linmod"
fit
}
check.linmod.x <- function(x)
{
if(!is.matrix(x))
stop("'x' is not a matrix or could not be converted to a matrix")
if(NROW(x) == 0 || NCOL(x) == 0)
stop("'x' is empty")
if(anyNA(x))
stop("NA in 'x'")
# checking just the first column of x suffices because all columns
# of a matrix have the same type
# we allow is.logical because qr etc. treat logical vars as numeric
if(!is.numeric(x[,1]) && !is.logical(x[,1]))
stop("non-numeric column in 'x'")
# ensure all columns in x are named (needed for names in vcov etc.)
# use the same naming convention as lm (prefix for unnamed cols is "V")
missing.colnames <-
if(is.null(colnames(x))) 1:NCOL(x)
else nchar(colnames(x)) == 0
colnames(x)[missing.colnames] <-
c("(Intercept)",
paste("V", seq_len(NCOL(x) - 1), sep = ""))[missing.colnames]
duplicated <- which(duplicated(colnames(x)))
if(length(duplicated))
stop("column name \"", colnames(x)[duplicated[1]],
"\" in 'x' is duplicated")
x
}
check.linmod.y <- function(x, y)
{
# as.vector(as.matrix(y)) is necessary when y is a data.frame
# (because as.vector alone on a data.frame returns a data.frame)
y <- as.vector(as.matrix(y))
if(length(y) == 0)
stop("'y' is empty")
if(anyNA(y))
stop("NA in 'y'")
if(!is.numeric(y) && !is.logical(y))
stop("'y' is not numeric or logical")
if(length(y) != nrow(x))
stop("nrow(x) is ", nrow(x), " but length(y) is ", length(y))
y
}
do.linmod.fit <- function(x, y)
{
# workhorse function for fitting linear models
# essential processing and sanity checks on x and y are already completed
# x is a numeric matrix, y is a numeric vector
qx <- qr(x) # QR-decomposition of x
if(qx$rank < ncol(x))
stop("'x' is singular (it has ", ncol(x),
" columns but its rank is ", qx$rank, ")\n colnames(x): ",
paste0(colnames(x), collapse=' '))
coef <- solve.qr(qx, y) # compute (x'x)^(-1) x'y
stopifnot(!anyNA(coef)) # NA impossible after rank check above
df.residual <- max(0, nrow(x) - ncol(x)) # degrees of freedom
sigma2 <- sum((y - x %*% coef)^2) / df.residual # variance of residuals
vcov <- sigma2 * chol2inv(qx$qr) # covar mat is sigma^2 * (x'x)^(-1)
fitted.values <- qr.fitted(qx, y)
colnames(vcov) <- rownames(vcov) <- colnames(x)
names(fitted.values) <- rownames(x)
colnames(coef) <- colnames(y)
# returned fields match lm's fields
list(coefficients = coef,
residuals = y - fitted.values,
rank = qx$rank,
fitted.values = fitted.values,
vcov = vcov,
sigma = sqrt(sigma2),
df.residual = df.residual)
}
predict.linmod <- function(object = stop("no 'object' argument"),
newdata = NULL,
type = "response",
...)
{
stopifnot(inherits(object, "linmod"))
stop.if.dot.arg.used(...)
match.arg(type, "response") # the type argument is not yet supported
if(is.null(newdata))
yhat <- fitted(object)
else {
if(NROW(newdata) == 0)
stop("'newdata' is empty") # preempt obscure message later
x <- if(is.null(object$terms)) # model built with linmod.default?
process.newdata(object, newdata)
else # model built with linmod.formula
process.newdata.formula(object, newdata)
# The following tests suffice to catch all illegal input. However
# they aren't ideal in that they don't always direct you to the root
# cause of the problem (i.e. the error messages aren't always optimal).
nvar <- length(object$coefficients) - 1 # nbr vars, -1 for intercept
if(ncol(x) - 1 != nvar)
stop("ncol(newdata) is ", ncol(x) - 1, " but should be ", nvar)
if(anyNA(x))
stop("NA in 'newdata'")
if(!is.numeric(x[,1]) && !is.logical(x[,1]))
stop("non-numeric column in 'newdata' (after processing)")
yhat <- as.vector(do.predict.linmod(object, x))
names(yhat) <- rownames(x)
}
yhat
}
process.newdata <- function(object, newdata)
{
# process newdata for models built with linmod.default
x <- if(is.vector(newdata)) # allow newdata to be a vector
matrix(newdata, ncol = length(object$coefficients) - 1)
else
as.matrix(newdata) # allow newdata to be a data.frame
cbind(1, x) # return data with an intercept column
}
process.newdata.formula <- function(object, newdata)
{
# process newdata for models built with linmod.formula
newdata <- as.data.frame(newdata) # allows newdata to be a matrix
terms <- object$terms
dataClasses <- attr(terms, "dataClasses")
iresp <- attr(terms, "response")
terms <- delete.response(terms)
# na.action=na.pass because we will catch NAs after (for clearer error msg)
# xlevels is needed to convert strings to factor levels, for example:
# a <- linmod(Sepal.Length~Species,data=iris)
# predict(a,newdata=data.frame(Species="setosa"))
mf <- model.frame(terms, newdata, na.action = na.pass, xlev = object$xlevels)
if(anyNA(mf))
stop("NA in 'newdata'")
if(NROW(mf) != NROW(newdata)) {
# Get here when model.frame() issues
# Warning: 'newdata' had M rows but variables found have N rows
# Must stop, else the call to model.matrix() below would silently return bad data.
# If a variable is missing, print its name to help the user.
# TODO This will erroneously identify "sqrt(x)" as a missing var in the
# formula "y ~ sqrt(x)" (because the var is wrapped in a func call).
varnames <- names(dataClasses)
varnames <- varnames[-iresp]
missing <- which(!(varnames %in% colnames(newdata)))
missing.msg <- ""
if(length(missing))
missing.msg <- paste0(" (variable '", varnames[missing[1]],
"' may be missing from newdata)")
stop("newdata has ", NROW(newdata),
" rows but model.frame returned ", NROW(mf), " rows", missing.msg)
}
.checkMFClasses(dataClasses, mf) # check types in newdata match original data
model.matrix(terms, mf)
}
do.predict.linmod <- function(object, x)
{
# workhorse function for linear model predictions
# processing by model.matrix etc. and sanity checks on x already completed
# x is a numeric matrix (if model has intercept, first col of x is all 1s)
x %*% coef(object)
}
summary.linmod <- function(object = stop("no 'object' argument"), ...)
{
stop.if.dot.arg.used(...)
se <- sqrt(diag(object$vcov))
t.value <- coef(object) / se
p.value <-
if(object$df.residual == 0) # avoid warning from pt()
rep_len(0, length.out=length(t.value))
else
2 * pt(-abs(t.value), df = object$df.residual)
coefficients <- cbind(Estimate = coef(object),
StdErr = se,
t.value = t.value,
p.value = p.value)
retval <- list(call = object$call,
coefficients = coefficients)
class(retval) <- "summary.linmod"
retval
}
print.linmod <- function(x = stop("no 'x' argument"), ...)
{
stop.if.dot.arg.used(...)
print.model.call(x)
print(x$coefficients)
invisible(x)
}
print.summary.linmod <- function(x = stop("no 'x' argument"), ...)
{
stop.if.dot.arg.used(...)
print.model.call(x)
print(x$coefficients)
invisible(x)
}
print.model.call <- function(x)
{
cat("Call: ") # print.lm has a newline here, but a space is more compact
# use paste0 to convert vector of strings to single string if necessary
cat(strwrap(paste0(deparse(x$call, control = NULL, nlines = 5),
sep = " ", collapse = " "), exdent = 6), sep = "\n")
cat("\n")
}
# stop.if.dot.arg.used will cause an error message if any args are passed to it.
# We use it to test if any dots arg of the calling function was used, for
# functions that must have a dots arg (to match the generic method) but don't
# actually use the dots. This helps the user catch mistyped or illegal args.
# R version 3.3-0 or higher has a function chkDots which could be used instead.
stop.if.dot.arg.used <- function()
{
NULL
}
|