\name{curveRep}
\alias{curveRep}
\alias{print.curveRep}
\alias{plot.curveRep}
\alias{curveSmooth}
\title{Representative Curves}
\description{\code{curveRep} finds representative curves from a
relatively large collection of curves. The curves usually represent
time-response profiles as in serial (longitudinal or repeated) data
with possibly unequal time points and greatly varying sample sizes per
subject. After excluding records containing missing \code{x} or
\code{y}, records are first stratified into \code{kn} groups having similar
sample sizes per curve (subject). Within these strata, curves are
next stratified according to the distribution of \code{x} points per
curve (typically measurement times per subject). The
\code{\link[cluster]{clara}} clustering/partitioning function is used
to do this, clustering on one, two, or three \code{x} characteristics
depending on the minimum sample size in the current interval of sample
size. If the interval has a minimum number of unique \code{x} values of
one, clustering is done on the single \code{x} values. If the minimum
number of unique \code{x} values is two, clustering is done to create
groups that are similar on both \code{min(x)} and \code{max(x)}. For
groups containing no fewer than three unique \code{x} values,
clustering is done on the trio of values \code{min(x)}, \code{max(x)},
and the longest gap between any successive \code{x}. Then within
sample size and \code{x} distribution strata, clustering of
time-response profiles is based on \code{p} values of \code{y} all
evaluated at the same \code{p} equally-spaced \code{x}'s within the
stratum. An option allows per-curve data to be smoothed with
\code{\link{lowess}} before proceeding. Outer \code{x} values are
taken as extremes of \code{x} across all curves within the stratum.
Linear interpolation within curves is used to estimate \code{y} at the
grid of \code{x}'s. For curves within the stratum that do not extend
to the most extreme \code{x} values in that stratum, extrapolation
uses flat lines from the observed extremes in the curve unless
\code{extrap=TRUE}. The \code{p} \code{y} values are clustered using
\code{\link[cluster]{clara}}.
\code{print} and \code{plot} methods show results. By specifying an
auxiliary \code{idcol} variable to \code{plot}, other variables such
as treatment may be depicted to allow the analyst to determine for
example whether subjects on different treatments are assigned to
different time-response profiles. To write the frequencies of a
variable such as treatment in the upper left corner of each panel
(instead of the grand total number of clusters in that panel), specify
\code{freq}.
\code{curveSmooth} takes a set of curves and smooths them using
\code{\link{lowess}}. If the number of unique \code{x} points in a curve is
less than \code{p}, the smooth is evaluated at the unique \code{x}
values. Otherwise it is evaluated at an equally spaced set of
\code{x} points over the observed range. If fewer than 3 unique
\code{x} values are in a curve, those points are used and smoothing is not done.
}
\usage{
curveRep(x, y, id, kn = 5, kxdist = 5, k = 5, p = 5,
force1 = TRUE, metric = c("euclidean", "manhattan"),
smooth=FALSE, extrap=FALSE, pr=FALSE)
\method{print}{curveRep}(x, \dots)
\method{plot}{curveRep}(x, which=1:length(res), method=c('all','lattice'),
m=NULL, probs=c(.5, .25, .75), nx=NULL, fill=TRUE,
idcol=NULL, freq=NULL, plotfreq=FALSE,
xlim=range(x), ylim=range(y),
xlab='x', ylab='y', colorfreq=FALSE, \dots)
curveSmooth(x, y, id, p=NULL, pr=TRUE)
}
\arguments{
\item{x}{a numeric vector, typically measurement times.
For \code{print.curveRep} and \code{plot.curveRep}, \code{x} is an
object created by \code{curveRep}.}
\item{y}{a numeric vector of response values}
\item{id}{a vector of curve (subject) identifiers, the same length as
\code{x} and \code{y}}
\item{kn}{number of curve sample size groups to construct.
\code{curveRep} tries to divide the data into equal numbers of
curves across sample size intervals.}
\item{kxdist}{maximum number of x-distribution clusters to derive
using \code{clara}}
\item{k}{maximum number of x-y profile clusters to derive using \code{clara}}
\item{p}{number of \code{x} points at which to interpolate \code{y}
for profile clustering. For \code{curveSmooth}, \code{p} is the number of
equally spaced points at which to evaluate the lowess smooth, and if
\code{p} is omitted the smooth is evaluated at the original \code{x}
values (which will allow \code{curveRep} to still know the \code{x}
distribution).}
\item{force1}{By default if any curves have only one point, all curves
consisting of one point will be placed in a separate stratum. To
prevent this separation, set \code{force1 = FALSE}.}
\item{metric}{see \code{\link[cluster]{clara}}}
\item{smooth}{By default, linear interpolation is used on raw data to
obtain \code{y} values to cluster to determine x-y profiles.
Specify \code{smooth = TRUE} to replace observed points with
\code{\link{lowess}}-smoothed values before computing \code{y} points
on the grid.
Also, when \code{smooth} is used, it may be desirable to use
\code{extrap=TRUE}.}
\item{extrap}{set to \code{TRUE} to use linear extrapolation to
evaluate \code{y} points for x-y clustering. Not recommended unless
smoothing has been or is being done.}
\item{pr}{set to \code{TRUE} to print progress notes}
\item{which}{an integer vector specifying which sample size intervals
to plot. Must be specified if \code{method='lattice'} and must be a
single number in that case.}
\item{method}{The default makes individual plots of possibly all
x-distribution by sample size by cluster combinations. Fewer may be
plotted by specifying \code{which}. Specify \code{method='lattice'}
to show a lattice \code{xyplot} of a single sample size interval,
with x distributions going across and clusters going down.}
\item{m}{the number of curves in a cluster to randomly sample if there
are more than \code{m} in a cluster. Default is to draw all curves
in a cluster. For \code{method = "lattice"} you can specify
\code{m = "quantiles"} to use the \code{xYplot} function to show
quantiles of \code{y} as a function of \code{x}, with the quantiles
specified by the \code{probs} argument. This cannot be used to draw
a group containing \code{n = 1}.}
\item{nx}{applies if \code{m = "quantiles"}. See \code{\link{xYplot}}.}
\item{probs}{3-vector of probabilities with the central quantile
first. Default uses quartiles.}
\item{fill}{for \code{method = "all"}, by default if a sample size
x-distribution stratum did not have enough curves to stratify into
\code{k} x-y profiles, empty graphs are drawn so that a matrix of
graphs will have the next row starting with a different sample size
range or x-distribution. See the example below.}
\item{idcol}{a named vector to be used as a table lookup for color
assignments (does not apply when \code{m = "quantiles"}). The names of
this vector are curve \code{id}s and the values are color names or
numbers.}
\item{freq}{a named vector to be used as a table lookup for a grouping
variable such as treatment. The names are curve \code{id}s and
values are any values useful for grouping in a frequency tabulation.}
\item{plotfreq}{set to \code{TRUE} to plot the frequencies from the
\code{freq} variable as horizontal bars instead of printing them.
Applies only to \code{method = "lattice"}. By default the largest bar
is 0.1 times the length of a panel's x-axis. Specify
\code{plotfreq = 0.5} for example to make the longest bar half this long.}
\item{colorfreq}{set to \code{TRUE} to color the frequencies printed by
\code{plotfreq} using the colors provided by \code{idcol}.}
\item{xlim, ylim, xlab, ylab}{plotting parameters. Default ranges are
the ranges in the entire set of raw data given to \code{curveRep}.}
\item{\dots}{arguments passed to other functions.}
}
\value{a list of class \code{"curveRep"} with the following elements
\item{res}{a hierarchical list first split by sample size intervals,
then by x distribution clusters, then containing a vector of cluster
numbers with \code{id} values as a names attribute}
\item{ns}{a table of frequencies of sample sizes per curve after
removing \code{NA}s}
\item{nomit}{total number of records excluded due to \code{NA}s}
\item{missfreq}{a table of frequencies of number of \code{NA}s
excluded per curve}
\item{ncuts}{cut points for sample size intervals}
\item{kn}{number of sample size intervals}
\item{kxdist}{number of clusters on x distribution}
\item{k}{number of clusters of curves within sample size and
distribution groups}
\item{p}{number of points at which to evaluate each curve for clustering}
\item{x}{}
\item{y}{}
\item{id}{input data after removing \code{NA}s}
\code{curveSmooth} returns a list with elements \code{x,y,id}.
}
\details{
In the graph titles for the default graphic output, \code{n} refers to the
minimum sample size, \code{x} refers to the sequential x-distribution
cluster, and \code{c} refers to the sequential x-y profile cluster. Graphs
from \code{method = "lattice"} are produced by
\code{\link[lattice]{xyplot}} and in the panel titles
\code{distribution} refers to the x-distribution stratum and
\code{cluster} refers to the x-y profile cluster.
}
\references{
Segal M. (1994): Representative curves for longitudinal data via
regression trees. J Comp Graph Stat 3:214-233.

Jones MC, Rice JA (1992): Displaying the important features of large
collections of similar curves. Am Statistician 46:140-145.

Zheng X, Simpson JA, et al (2005): Data from a study of effectiveness
suggested potential prognostic factors related to the patterns of
shoulder pain. J Clin Epi 58:823-830.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
\email{f.harrell@vanderbilt.edu}
}
\note{The references describe other methods for deriving
representative curves, but those methods were not used here. The last
reference, which used a cluster analysis on principal components,
motivated \code{curveRep}, however. The \code{kml} package does
k-means clustering of longitudinal data with imputation.}
\seealso{\code{\link[cluster]{clara}},\code{\link[Hmisc]{dataRep}}}
\examples{
\dontrun{
# ---------------------------------------------------------------------
# Example 1: noisy simulated longitudinal data
# ---------------------------------------------------------------------
# Simulate 200 curves with per-curve sample sizes ranging from 1 to 10
# Make curves with odd-numbered IDs have an x-distribution that is random
# uniform [0,1] and those with even-numbered IDs have an x-dist. that is
# half as wide but still centered at 0.5. Shift y values higher with
# increasing IDs
set.seed(1)
N  <- 200
nc <- sample(1:10, N, TRUE)   # number of observations for each curve
id <- rep(1:N, nc)            # curve identifier, one entry per record
x  <- y <- id                 # placeholders of the right length
for(i in 1:N) {
  # odd IDs: uniform on [0,1]; even IDs: uniform on [.25,.75]
  x[id==i] <- if(i \%\% 2) runif(nc[i]) else runif(nc[i], .25, .75)
  # linear trend in x, shifted by the ID, plus uniform(-10, 10) noise
  y[id==i] <- i + 10*(x[id==i] - .5) + runif(nc[i], -10, 10)
}

w <- curveRep(x, y, id, kxdist=2, p=10)
w
par(ask=TRUE, mfrow=c(4,5))
plot(w)                # show everything, profiles going across
par(mfrow=c(2,5))
plot(w,1)              # show n=1 results
# Use a color assignment table, assigning low curves to green and
# high to red. Unique curve (subject) IDs are the names of the vector.
cols <- c(rep('green', N/2), rep('red', N/2))
names(cols) <- as.character(1:N)
plot(w, 3, idcol=cols)
par(ask=FALSE, mfrow=c(1,1))

plot(w, 1, 'lattice')  # show n=1 results
plot(w, 3, 'lattice')  # show n=4-5 results
plot(w, 3, 'lattice', idcol=cols)  # same but different color mapping
plot(w, 3, 'lattice', m=1)  # show a single "representative" curve
# Show median, 10th, and 90th percentiles of supposedly representative curves
plot(w, 3, 'lattice', m='quantiles', probs=c(.5,.1,.9))
# Same plot but with much less grouping of x variable
plot(w, 3, 'lattice', m='quantiles', probs=c(.5,.1,.9), nx=2)

# Smooth data before profiling. This allows later plotting to plot
# smoothed representative curves rather than raw curves (which
# specifying smooth=TRUE to curveRep would do, if curveSmooth was not used)
d <- curveSmooth(x, y, id)
w <- with(d, curveRep(x, y, id))

# ---------------------------------------------------------------------
# Example 2: noise-free profiles
# ---------------------------------------------------------------------
# Example to show that curveRep can cluster profiles correctly when
# there is no noise. In the data there are four profiles - flat, flat
# at a higher mean y, linearly increasing then flat, and flat at the
# first height except for a sharp triangular peak
set.seed(1)
x <- 0:100
m <- length(x)
profile <- matrix(NA, nrow=m, ncol=4)
profile[,1] <- rep(0, m)
profile[,2] <- rep(3, m)
profile[,3] <- c(0:3, rep(3, m-4))
profile[,4] <- c(0,1,3,1,rep(0,m-4))
col <- c('black','blue','green','red')
matplot(x, profile, type='l', col=col)
# The default p=5 evaluates curves at 5 equally spaced x values; show
# what the four profiles look like when seen only at those points
xeval <- seq(0, 100, length.out=5)
s <- x \%in\% xeval
matplot(x[s], profile[s,], type='l', col=col)

# Build 100 curves, each a randomly chosen copy of one of the 4 profiles,
# remembering the profile's color so the true membership can be displayed
id <- rep(1:100, each=m)
X <- Y <- id
cols <- character(100)
names(cols) <- as.character(1:100)
for(i in 1:100) {
  s <- id==i
  X[s] <- x
  j <- sample(1:4,1)
  Y[s] <- profile[,j]
  cols[i] <- col[j]
}
table(cols)
yl <- c(-1,4)
w <- curveRep(X, Y, id, kn=1, kxdist=1, k=4)
plot(w, 1, 'lattice', idcol=cols, ylim=yl)
# Found 4 clusters but two have same profile
w <- curveRep(X, Y, id, kn=1, kxdist=1, k=3)
plot(w, 1, 'lattice', idcol=cols, freq=cols, plotfreq=TRUE, ylim=yl)
# Incorrectly combined black and red because default value p=5 did
# not result in different profiles at x=xeval
w <- curveRep(X, Y, id, kn=1, kxdist=1, k=4, p=40)
plot(w, 1, 'lattice', idcol=cols, ylim=yl)
# Found correct clusters because evaluated curves at 40 equally
# spaced points and could find the sharp triangular peak in profile 4
}
}
\keyword{multivariate}
\keyword{hplot}
\concept{repeated measures}
\concept{longitudinal data}
\concept{serial data}
\concept{representative curves}
\concept{descriptive statistics}
\concept{exploratory data analysis}
