File: predictrms.Rd

package info (click to toggle)
r-cran-rms 5.1-3-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,160 kB
  • sloc: asm: 18,851; fortran: 823; ansic: 19; makefile: 2
file content (393 lines) | stat: -rw-r--r-- 15,427 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
\name{predictrms}
\alias{predictrms}
\alias{predict.rms}
\alias{predict.bj}
\alias{predict.cph}
\alias{predict.Glm}
\alias{predict.Gls}
\alias{predict.ols}
\alias{predict.psm}

\title{Predicted Values from Model Fit}
\description{
The \code{predict} function is used to obtain a variety of values or
predicted values from either the data used to fit the model (if
\code{type="adjto"} or \code{"adjto.data.frame"} or if \code{x=TRUE} or
\code{linear.predictors=TRUE} were specified to the modeling function), or from
a new dataset. Parameters such as knots and factor levels used in creating 
the design matrix in the original fit are "remembered".
See the \code{Function} function for another method for computing the
linear predictors.
}
\usage{
\method{predict}{bj}(object, newdata,
        type=c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
               "adjto", "adjto.data.frame", "model.frame"), 
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1,
        na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # for bj

\method{predict}{cph}(object, newdata=NULL,
        type=c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
               "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # cph

\method{predict}{Glm}(object, newdata,
        type= c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
                "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # Glm

\method{predict}{Gls}(object, newdata,
        type=c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
               "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # Gls

\method{predict}{ols}(object, newdata,
        type=c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
               "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # ols

\method{predict}{psm}(object, newdata,
        type=c("lp", "x", "data.frame", "terms", "cterms", "ccterms",
               "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE,
        conf.type=c('mean','individual','simultaneous'),
        kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=type=="terms", \dots) # psm
}
\arguments{
\item{object}{a fit object with an \code{rms} fitting function}
\item{newdata}{
An S data frame, list or a matrix specifying new data for which predictions
are desired.  If \code{newdata} is a list, it is converted to a matrix first.
A matrix is converted to a data frame.  For the matrix form, categorical
variables (\code{catg} or \code{strat}) must be coded as integer category
numbers corresponding to the order in which value labels were stored.
For list or matrix forms, \code{matrx} factors must be given a single
value.  If this single value is the S missing value \code{NA}, the adjustment
values of matrx (the column medians) will later replace this value.
If the single value is not \code{NA}, it is propagated throughout the columns
of the \code{matrx} factor.  For \code{factor} variables having numeric levels,
you can specify the numeric values in \code{newdata} without first converting
the variables to factors.  These numeric values are checked to make sure
they match a level, then the variable is converted internally to a \code{factor}.
It is most typical to use a data frame
for newdata, and the S function \code{expand.grid} is very handy here.
For example, one may specify 
\cr
\code{newdata=expand.grid(age=c(10,20,30),}
\cr
   \code{race=c("black","white","other"),}
\cr
   \code{chol=seq(100,300,by=25))}.
}
\item{type}{
Type of output desired.  The default is \code{"lp"} to get the linear predictors -
predicted \eqn{X\beta}{X beta}.  For Cox models, these predictions are centered.
You may specify \code{"x"} to get an expanded design matrix
at the desired combinations of values, \code{"data.frame"} to get an
S data frame of the combinations, \code{"model.frame"} to get a data frame
of the transformed predictors, \code{"terms"} to get a matrix with
each column being the linear combination of variables making up
a factor (with separate terms for interactions), \code{"cterms"}
("combined terms") to not create separate terms for interactions 
but to add all interaction terms involving each predictor to the
main terms for each predictor, \code{"ccterms"} to combine all related
terms (related through interactions) and their interactions into a
single column, \code{"adjto"} to return a vector of 
\code{limits[2]} (see \code{datadist}) in coded 
form, and \code{"adjto.data.frame"} to return a data frame version of these
central adjustment values.  Use of \code{type="cterms"} does not make
 sense for a \code{strat} variable that does not interact with
another variable.  If \code{newdata} is not given, \code{predict}
will attempt to return information stored with the fit object if the
appropriate options were used with the modeling function (e.g., \code{x, y, linear.predictors, se.fit}).
}
\item{se.fit}{
Defaults to \code{FALSE}.  If \code{type="linear.predictors"}, set
\code{se.fit=TRUE} to return a list with components
\code{linear.predictors} and \code{se.fit} instead of just a vector of
fitted values.   For Cox model fits, standard errors of linear
predictors are computed after subtracting the original column means from
the new design matrix.
}
\item{conf.int}{
Specify \code{conf.int} as a positive fraction to obtain upper and lower
confidence intervals (e.g., \code{conf.int=0.95}).  The \eqn{t}-distribution is
used in the calculation for \code{ols} fits.  Otherwise, the normal
critical value is used.
}
\item{conf.type}{
specifies the type of confidence interval.  Default is for the mean.
For \code{ols} fits there is the option of obtaining confidence limits for
individual predicted values by specifying \code{conf.type="individual"}.
}
\item{kint}{
a single integer specifying the number of the intercept to use in
multiple-intercept models.  The default is 1 for \code{lrm} and the
        reference median intercept for \code{orm}.
}
\item{na.action}{
Function to handle missing values in \code{newdata}.  For predictions
"in data", the same \code{na.action} that was used during model fitting is
used to define an \code{naresid} function to possibly restore rows of the data matrix
that were deleted due to NAs.  For predictions "out of data", the default
\code{na.action} is \code{na.keep}, resulting in NA predictions when a row of
\code{newdata} has an NA.  Whatever \code{na.action} is in effect at the time
for "out of data" predictions, the corresponding \code{naresid} is used also.
}
\item{expand.na}{
set to \code{FALSE} to keep the \code{naresid} from having any effect, i.e., to keep
from adding back observations removed because of NAs in the returned object.
If \code{expand.na=FALSE}, the \code{na.action} attribute will be added to the returned
object.
}
\item{center.terms}{
set to \code{FALSE} to suppress subtracting adjust-to values from
columns of the design matrix before computing terms with \code{type="terms"}.
}
\item{\dots}{ignored}
}
\details{
\code{datadist} and \code{options(datadist=)} should be run before \code{predictrms}
if using \code{type="adjto"}, \code{type="adjto.data.frame"}, or \code{type="terms"},
or if the fit is a Cox model fit and you are requesting \code{se.fit=TRUE}.
For these cases, the adjustment values are needed (either for the
returned result or for the correct covariance matrix computation).
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
	\code{\link{plot.Predict}}, \code{\link{ggplot.Predict}},
	\code{\link{summary.rms}},
	\code{\link{rms}}, \code{\link{rms.trans}}, \code{\link{predict.lrm}},
	\code{\link{predict.orm}},
	\code{\link{residuals.cph}}, \code{\link{datadist}},
	\code{\link{gendata}}, \code{\link{gIndex}},
	\code{\link{Function.rms}}, \code{\link[Hmisc]{reShape}}, 
	\code{\link[Hmisc]{xYplot}}, \code{\link{contrast.rms}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
treat          <- factor(sample(c('a','b','c'), n,TRUE))


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male')) +
  .3*sqrt(blood.pressure-60)-2.3 + 1*(treat=='b')
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


ddist <- datadist(age, blood.pressure, cholesterol, sex, treat)
options(datadist='ddist')


fit <- lrm(y ~ rcs(blood.pressure,4) + 
           sex * (age + rcs(cholesterol,4)) + sex*treat*age)


# Use xYplot to display predictions in 9 panels, with error bars,
# with superposition of two treatments


dat <- expand.grid(treat=levels(treat),sex=levels(sex),
                   age=c(20,40,60),blood.pressure=120,
                   cholesterol=seq(100,300,length=10))
# Add variables linear.predictors and se.fit to dat
dat <- cbind(dat, predict(fit, dat, se.fit=TRUE))
# This is much easier with Predict
# xYplot in Hmisc extends xyplot to allow error bars
xYplot(Cbind(linear.predictors,linear.predictors-1.96*se.fit,
             linear.predictors+1.96*se.fit) ~ cholesterol | sex*age,
       groups=treat, data=dat, type='b')




# Since blood.pressure doesn't interact with anything, we can quickly and
# interactively try various transformations of blood.pressure, taking
# the fitted spline function as the gold standard. We are seeking a
# linearizing transformation even though this may lead to falsely
# narrow confidence intervals if we use this data-dredging-based transformation


bp <- 70:160
logit <- predict(fit, expand.grid(treat="a", sex='male', age=median(age),
                 cholesterol=median(cholesterol),
                 blood.pressure=bp), type="terms")[,"blood.pressure"]
#Note: if age interacted with anything, this would be the age
#      "main effect" ignoring interaction terms
#Could also use Predict(f, age=ag)$yhat
#which allows evaluation of the shape for any level of interacting
#factors.  When age does not interact with anything, the result from
#predict(f, \dots, type="terms") would equal the result from
#plot if all other terms were ignored


plot(bp^.5, logit)               # try square root vs. spline transform.
plot(bp^1.5, logit)              # try 1.5 power
plot(sqrt(bp-60), logit)


#Some approaches to making a plot showing how predicted values
#vary with a continuous predictor on the x-axis, with two other
#predictors varying


combos <- gendata(fit, age=seq(10,100,by=10), cholesterol=c(170,200,230),
                  blood.pressure=c(80,120,160))
#treat, sex not specified -> set to mode
#can also used expand.grid


combos$pred <- predict(fit, combos)
xyplot(pred ~ age | cholesterol*blood.pressure, data=combos, type='l')
xYplot(pred ~ age | cholesterol, groups=blood.pressure, data=combos, type='l')
Key()   # Key created by xYplot
xYplot(pred ~ age, groups=interaction(cholesterol,blood.pressure),
       data=combos, type='l', lty=1:9)
Key()


# Add upper and lower 0.95 confidence limits for individuals
combos <- cbind(combos, predict(fit, combos, conf.int=.95))
xYplot(Cbind(linear.predictors, lower, upper) ~ age | cholesterol,
       groups=blood.pressure, data=combos, type='b')
Key()


# Plot effects of treatments (all pairwise comparisons) vs.
# levels of interacting factors (age, sex)


d <- gendata(fit, treat=levels(treat), sex=levels(sex), age=seq(30,80,by=10))
x <- predict(fit, d, type="x")
betas <- fit$coef
cov   <- vcov(fit, intercepts='none')


i <- d$treat=="a"; xa <- x[i,]; Sex <- d$sex[i]; Age <- d$age[i]
i <- d$treat=="b"; xb <- x[i,]
i <- d$treat=="c"; xc <- x[i,]


doit <- function(xd, lab) {
  xb <- matxv(xd, betas)
  se <- apply((xd \%*\% cov) * xd, 1, sum)^.5
  q <- qnorm(1-.01/2)   # 0.99 confidence limits
  lower <- xb - q * se; upper <- xb + q * se
  #Get odds ratios instead of linear effects
  xb <- exp(xb); lower <- exp(lower); upper <- exp(upper)
  #First elements of these agree with 
  #summary(fit, age=30, sex='female',conf.int=.99))
  for(sx in levels(Sex)) {
    j <- Sex==sx
    errbar(Age[j], xb[j], upper[j], lower[j], xlab="Age", 
           ylab=paste(lab, "Odds Ratio"), ylim=c(.1, 20), log='y')
    title(paste("Sex:", sx))
    abline(h=1, lty=2)
  }
}


par(mfrow=c(3,2), oma=c(3,0,3,0))
doit(xb - xa, "b:a")
doit(xc - xa, "c:a")
doit(xb - xa, "c:b")

# NOTE: This is much easier to do using contrast.rms

# Demonstrate type="terms", "cterms", "ccterms"
set.seed(1)
n <- 40
x <- 1:n
w <- factor(sample(c('a', 'b'), n, TRUE))
u <- factor(sample(c('A', 'B'), n, TRUE))
y <- .01*x + .2*(w=='b') + .3*(u=='B') + .2*(w=='b' & u=='B') + rnorm(n)/5
ddist <- datadist(x, w, u)
f <- ols(y ~ x*w*u, x=TRUE, y=TRUE)
f
anova(f)
z <- predict(f, type='terms', center.terms=FALSE)
z[1:5,]
k <- coef(f)
## Manually compute combined terms
wb <- w=='b'
uB <- u=='B'
h  <- k['x * w=b * u=B']*x*wb*uB
tx <- k['x']  *x  + k['x * w=b']*x*wb + k['x * u=B']  *x*uB  + h
tw <- k['w=b']*wb + k['x * w=b']*x*wb + k['w=b * u=B']*wb*uB + h
tu <- k['u=B']*uB + k['x * u=B']*x*uB + k['w=b * u=B']*wb*uB + h
h   <- z[,'x * w * u'] # highest order term is present in all cterms
tx2 <- z[,'x']+z[,'x * w']+z[,'x * u']+h
tw2 <- z[,'w']+z[,'x * w']+z[,'w * u']+h
tu2 <- z[,'u']+z[,'x * u']+z[,'w * u']+h
ae <- function(a, b) all.equal(a, b, check.attributes=FALSE)
ae(tx, tx2)
ae(tw, tw2)
ae(tu, tu2)

zc <- predict(f, type='cterms')
zc[1:5,]
ae(tx, zc[,'x'])
ae(tw, zc[,'w'])
ae(tu, zc[,'u'])

zc <- predict(f, type='ccterms')
# As all factors are indirectly related, ccterms gives overall linear
# predictor except for the intercept
zc[1:5,]
ae(as.vector(zc + coef(f)[1]), f$linear.predictors)

\dontrun{
#A variable state.code has levels "1", "5","13"
#Get predictions with or without converting variable in newdata to factor
predict(fit, data.frame(state.code=c(5,13)))
predict(fit, data.frame(state.code=factor(c(5,13))))


#Use gendata function (gendata.rms) for interactive specification of
#predictor variable settings (for 10 observations)
df <- gendata(fit, nobs=10, viewvals=TRUE)
df$predicted <- predict(fit, df)  # add variable to data frame
df


df <- gendata(fit, age=c(10,20,30))  # leave other variables at ref. vals.
predict(fit, df, type="fitted")


# See reShape (in Hmisc) for an example where predictions corresponding to 
# values of one of the varying predictors are reformatted into multiple
# columns of a matrix
}
options(datadist=NULL)
}
\keyword{models}
\keyword{regression}