\name{Ecdf}
\alias{Ecdf}
\alias{Ecdf.default}
\alias{Ecdf.data.frame}
\alias{Ecdf.formula}
\alias{panel.Ecdf}
\alias{prepanel.Ecdf}
\title{Empirical Cumulative Distribution Plot}
\description{
Computes coordinates of the cumulative distribution function of \code{x}, and by default plots it as a step function. A grouping variable may be specified so that stratified estimates are computed and (by default) plotted. If there is more than one group, the \code{labcurve} function is used (by default) to label the multiple step functions or to draw a legend defining line types, colors, or symbols by linking them with group labels. A \code{weights} vector may be specified to get weighted estimates. Specify \code{normwt} to make \code{weights} sum to the length of \code{x} (after removing NAs). Otherwise the total sample size is taken to be the sum of the weights. \code{Ecdf} is actually a method, and \code{Ecdf.default} is what's called for a vector argument. \code{Ecdf.data.frame} is called when the first argument is a data frame. This function can automatically set up a matrix of ECDFs and wait for a mouse click if the matrix requires more than one page. Categorical variables, character variables, and variables having fewer than a set number of unique values are ignored.
If \code{par(mfrow=..)} is not set up before \code{Ecdf.data.frame} is called, the function will try to figure out the best layout depending on the number of variables in the data frame. Upon return the original \code{mfrow} is left intact. When the first argument to \code{Ecdf} is a formula, a Trellis/Lattice function \code{Ecdf.formula} is called. This allows for multi-panel conditioning, superposition using a \code{groups} variable, and other Trellis features, along with the ability to easily plot transformed ECDFs using the \code{fun} argument. For example, if \code{fun=qnorm}, the inverse normal transformation will be used for the y-axis. If the transformed curves are linear, this indicates normality. Like the \code{xYplot} function, \code{Ecdf} will create a function \code{Key} if the \code{groups} variable is used. This function can be invoked by the user to define the keys for the groups.
}
\usage{
Ecdf(x, \dots)

\method{Ecdf}{default}(x, what=c('F','1-F','f','1-f'),
     weights=rep(1, length(x)), normwt=FALSE,
     xlab, ylab, q, pl=TRUE, add=FALSE, lty=1,
     col=1, group=rep(1,length(x)), label.curves=TRUE, xlim,
     subtitles=TRUE, datadensity=c('none','rug','hist','density'),
     side=1,
     frac=switch(datadensity,none=NA,rug=.03,hist=.1,density=.1),
     dens.opts=NULL, lwd=1, log='', \dots)

\method{Ecdf}{data.frame}(x, group=rep(1,nrows),
     weights=rep(1, nrows), normwt=FALSE,
     label.curves=TRUE, n.unique=10, na.big=FALSE, subtitles=TRUE,
     vnames=c('labels','names'),\dots)

\method{Ecdf}{formula}(x, data=sys.frame(sys.parent()), groups=NULL,
     prepanel=prepanel.Ecdf, panel=panel.Ecdf, \dots, xlab,
     ylab, fun=function(x)x, what=c('F','1-F','f','1-f'), subset=TRUE)
}
\arguments{
\item{x}{a numeric vector, data frame, or Trellis/Lattice formula}
\item{what}{ The default is \code{"F"} which results in plotting the fraction of values <= x. Set to \code{"1-F"} to plot the fraction > x or \code{"f"} to plot the cumulative frequency of values <= x.
Use \code{"1-f"} to plot the cumulative frequency of values >= x. }
\item{weights}{ numeric vector of weights. Omit or specify a zero-length vector or \code{NULL} to get unweighted estimates. }
\item{normwt}{ set to \code{TRUE} to normalize \code{weights} so that they sum to the length of \code{x} after removing NAs; otherwise the total sample size is taken to be the sum of the weights (see Description). }
\item{xlab}{ x-axis label. Default is \code{label(x)} or name of calling argument. For \code{Ecdf.formula}, \code{xlab} defaults to the \code{label} attribute of the x-axis variable. }
\item{ylab}{ y-axis label. Default is \code{"Proportion <= x"}, \code{"Proportion > x"}, or \code{"Frequency <= x"} depending on value of \code{what}. }
\item{q}{ a vector of quantiles for which to draw reference lines on the plot. Default is not to draw any. }
\item{pl}{set to \code{FALSE} to omit the plot, to just return estimates}
\item{add}{ set to \code{TRUE} to add the cdf to an existing plot. Does not apply if using lattice graphics (i.e., if a formula is given as the first argument). }
\item{lty}{ integer line type for plot. If \code{group} is specified, this can be a vector. }
\item{lwd}{ line width for plot. Can be a vector corresponding to \code{group}s. }
\item{log}{ see \code{\link{plot}}. Set \code{log='x'} to use log scale for the \code{x}-axis. }
\item{col}{ color for step function. Can be a vector. }
\item{group}{ a numeric, character, or \code{factor} categorical variable used for stratifying estimates. If \code{group} is present, as many ECDFs are drawn as there are non-missing group levels. }
\item{label.curves}{ applies if more than one \code{group} exists. Default is \code{TRUE} to use \code{labcurve} to label curves where they are farthest apart. Set \code{label.curves} to a \code{list} to specify options to \code{labcurve}, e.g., \code{label.curves=list(method="arrow", cex=.8)}. These option names may be abbreviated in the usual way arguments are abbreviated. Use for example \code{label.curves=list(keys=1:5)} to draw symbols periodically (as in \code{pch=1:5} - see \code{points}) on the curves and automatically position a legend in the most empty part of the plot.
Set \code{label.curves=FALSE} to suppress drawing curve labels. The \code{col}, \code{lty}, and \code{type} parameters are automatically passed to \code{labcurve}, although you can override them here. You can set \code{label.curves=list(keys="lines")} to have different line types defined in an automatically positioned key. }
\item{xlim}{ x-axis limits. Default is entire range of \code{x}. }
\item{subtitles}{ set to \code{FALSE} to suppress putting a subtitle at the bottom left of each plot. The subtitle indicates the numbers of non-missing and missing observations, which are labeled \code{n}, \code{m}. }
\item{datadensity}{ If \code{datadensity} is not \code{"none"}, either \code{scat1d} or \code{histSpike} is called to add a rug plot (\code{datadensity="rug"}), spike histogram (\code{datadensity="hist"}), or smooth density estimate (\code{datadensity="density"}) to the bottom or top of the ECDF. }
\item{side}{ If \code{datadensity} is not \code{"none"}, the default is to place the additional information along the x-axis at the bottom of the graph (\code{side=1}). Use \code{side=3} to place it at the top of the graph. }
\item{frac}{ passed to \code{histSpike} }
\item{dens.opts}{ a list of optional arguments for \code{histSpike} }
\item{\dots}{ other parameters passed to \code{plot} if \code{add=FALSE}. For data frames, other parameters to pass to \code{Ecdf.default}. For \code{Ecdf.formula}, if \code{groups} is not used, you can also add data density information to each panel's ECDF by specifying the \code{datadensity} and optional \code{frac}, \code{side}, \code{dens.opts} arguments. }
\item{n.unique}{ minimum number of unique values before an ECDF is drawn for a variable in a data frame. Default is 10. }
\item{na.big}{ set to \code{TRUE} to draw the number of NAs in larger letters in the middle of the plot for \code{Ecdf.data.frame} }
\item{vnames}{ By default, variable labels are used to label x-axes. Set \code{vnames="names"} to instead use variable names.
}
\item{method}{ method for computing the empirical cumulative distribution. See \code{wtd.Ecdf}. The default is to use the standard \code{"i/n"} method as is used by the non-Trellis versions of \code{Ecdf}. }
\item{fun}{ a function to transform the cumulative proportions, for the Trellis-type usage of \code{Ecdf} }
\item{data, groups, subset, prepanel, panel}{the usual Trellis/Lattice parameters, with \code{groups} causing \code{Ecdf.formula} to overlay multiple ECDFs on one panel.}
}
\value{
for \code{Ecdf.default} an invisible list with elements \code{x} and \code{y} giving the coordinates of the cdf. If there is more than one \code{group}, a list of such lists is returned. An attribute, \code{N}, is in the returned object. It contains the elements \code{n} and \code{m}, the numbers of non-missing and missing observations, respectively.
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\section{Side Effects}{
plots
}
\seealso{
\code{\link{wtd.Ecdf}}, \code{\link{label}}, \code{\link{table}}, \code{\link{cumsum}}, \code{\link{labcurve}}, \code{\link{xYplot}}, \code{\link{histSpike}}
}
\examples{
set.seed(1)
ch <- rnorm(1000, 200, 40)
Ecdf(ch, xlab="Serum Cholesterol")
scat1d(ch)                          # add rug plot
histSpike(ch, add=TRUE, frac=.15)   # add spike histogram
# Better: add a data density display automatically:
Ecdf(ch, datadensity='density')

label(ch) <- "Serum Cholesterol"
Ecdf(ch)
other.ch <- rnorm(500, 220, 20)
Ecdf(other.ch,add=TRUE,lty=2)

sex <- factor(sample(c('female','male'), 1000, TRUE))
Ecdf(ch, q=c(.25,.5,.75))  # show quartiles
Ecdf(ch, group=sex, label.curves=list(method='arrow'))

# Example showing how to draw multiple ECDFs from paired data
pre.test <- rnorm(100,50,10)
post.test <- rnorm(100,55,10)
x <- c(pre.test, post.test)
g <- c(rep('Pre',length(pre.test)),rep('Post',length(post.test)))
Ecdf(x, group=g, xlab='Test Results', label.curves=list(keys=1:2))
# keys=1:2 causes symbols to be drawn periodically on top of curves

# Draw a matrix of ECDFs for a data frame
m <- data.frame(pre.test, post.test,
                sex=sample(c('male','female'),100,TRUE))
Ecdf(m, group=m$sex, datadensity='rug')

freqs <- sample(1:10, 1000, TRUE)
Ecdf(ch, weights=freqs)  # weighted estimates

# Trellis/Lattice examples:
region <- factor(sample(c('Europe','USA','Australia'),100,TRUE))
year <- factor(sample(2001:2002,1000,TRUE))
Ecdf(~ch | region*year, groups=sex)
Key()           # draw a key for sex at the default location
# Key(locator(1)) # user-specified positioning of key
age <- rnorm(1000, 50, 10)
Ecdf(~ch | equal.count(age), groups=sex)  # use overlapping shingles
Ecdf(~ch | sex, datadensity='hist', side=3)  # add spike histogram at top
}
\keyword{nonparametric}
\keyword{hplot}
\keyword{methods}
\keyword{distribution}
\concept{trellis}
\concept{lattice}