1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
|
\name{clusplot.default}
\alias{clusplot.default}
\title{Bivariate Cluster Plot (clusplot) Default Method}
\description{
Creates a bivariate plot visualizing a partition (clustering) of the data. All
observations are represented by points in the plot, using principal
components or multidimensional scaling. Around each cluster an ellipse
is drawn.
}
\usage{
\method{clusplot}{default}(x, clus, diss = FALSE,
s.x.2d = mkCheckX(x, diss), stand = FALSE,
lines = 2, shade = FALSE, color = FALSE,
labels= 0, plotchar = TRUE,
col.p = "dark green", col.txt = col.p,
col.clus = if(color) c(2, 4, 6, 3) else 5, cex = 1, cex.txt = cex,
span = TRUE,
add = FALSE,
xlim = NULL, ylim = NULL,
main = paste("CLUSPLOT(", deparse1(substitute(x)),")"),
sub = paste("These two components explain",
round(100 * var.dec, digits = 2), "\% of the point variability."),
xlab = "Component 1", ylab = "Component 2",
verbose = getOption("verbose"),
\dots)
}
\arguments{
\item{x}{matrix or data frame, or dissimilarity matrix, depending on
the value of the \code{diss} argument.
In case of a matrix (or matrix-like object), each row corresponds to an observation,
and each column corresponds to a variable. All variables must be
numeric. Missing values (\code{\link{NA}}s) are allowed. They are
replaced by the median of the corresponding variable. When some
variables or some observations contain only missing values, the
function stops with a warning message.
In case of a dissimilarity matrix, \code{x} is the output of
\code{\link{daisy}} or \code{\link{dist}} or a symmetric matrix. Also,
a vector of length \eqn{n*(n-1)/2} is allowed (where \eqn{n} is the
number of observations), and will be interpreted in the same way as
the output of the above-mentioned functions. Missing values (NAs)
are not allowed.
}
\item{clus}{
a vector of length n representing a clustering of \code{x}. For
each observation the vector lists the number or name of the cluster
to which it has been assigned. \code{clus} is often the clustering
component of the output of \code{\link{pam}}, \code{\link{fanny}} or
\code{\link{clara}}.}
\item{diss}{
logical indicating if \code{x} will be considered as a dissimilarity
matrix or a matrix of observations by variables (see \code{x}
argument above).}
\item{s.x.2d}{a \code{\link{list}} with components named \code{x} (a \eqn{n
\times 2}{n x 2} matrix; typically something like principal components of
original data), \code{labs} and \code{var.dec}.}% FIXME: 'labs' and 'var.dec' are not always needed
\item{stand}{
logical flag: if true, then the representations of the n observations in the
2-dimensional plot are standardized.
}
\item{lines}{
integer out of \code{0, 1, 2}, used to obtain an idea of the
distances between ellipses. The distance between two ellipses E1
and E2 is measured along the line connecting the centers \eqn{m1}
and \eqn{m2} of the two ellipses.
In case E1 and E2 overlap on the line through \eqn{m1} and \eqn{m2},
no line is drawn. Otherwise, the result depends on the value of
\code{lines}: If
\describe{
\item{lines = 0,}{no distance lines will appear on the plot;}
\item{lines = 1,}{the line segment between \eqn{m1} and \eqn{m2} is drawn;}
\item{lines = 2,}{a line segment between the boundaries of E1 and
E2 is drawn (along the line connecting \eqn{m1} and \eqn{m2}).}
}
}
\item{shade}{
logical flag: if TRUE, then the ellipses are shaded in relation to their
density. The density is the number of points in the cluster divided by the
area of the ellipse.
}
\item{color}{
logical flag: if TRUE, then the ellipses are colored with respect to their
density. With increasing density, the colors are light blue, light
green, red and purple. To see these colors on the graphics device, an
appropriate color scheme should be selected (we recommend a white
background).}
\item{labels}{
integer code, currently one of 0,1,2,3,4 and 5. If
\describe{
\item{labels= 0,}{no labels are placed in the plot;}
\item{labels= 1,}{points and ellipses can be identified in the plot (see
\code{\link{identify}});}
\item{labels= 2,}{all points and ellipses are labelled in the plot;}
\item{labels= 3,}{only the points are labelled in the plot;}
\item{labels= 4,}{only the ellipses are labelled in the plot;}
\item{labels= 5,}{the ellipses are labelled in the plot, and
points can be identified.}
}
The levels of the vector \code{clus} are taken as labels for the
clusters. The labels
of the points are the rownames of \code{x} if \code{x} is matrix like.
Otherwise (\code{diss = TRUE}), when \code{x} is a vector, point labels
can be attached to \code{x} as a "Labels" attribute
(\code{attr(x,"Labels")}), as is done for the output of
\code{\link{daisy}}.
A possible \code{\link{names}} attribute of \code{clus} will not
be taken into account.
}
\item{plotchar}{
logical flag: if TRUE, then the plotting symbols differ for points belonging
to different clusters.
}
\item{span}{
logical flag: if TRUE, then each cluster is represented by the ellipse with
smallest area containing all its points. (This is a special case of the
minimum volume ellipsoid.)\cr
If FALSE, the ellipse is based on the mean and covariance matrix of the
same points. While this is faster to compute, it often yields a much
larger ellipse.
There are also some special cases: When a cluster consists of only
one point, a tiny circle is drawn around it. When the points of a
cluster fall on a straight line, \code{span=FALSE} draws a narrow
ellipse around it and \code{span=TRUE} gives the exact line segment.
}
\item{add}{logical indicating if ellipses (and labels if \code{labels}
is true) should be \emph{added} to an already existing plot. If
false, neither a \code{\link{title}} nor a sub title, see \code{sub},
is written.}
\item{col.p}{color code(s) used for the observation points.}
\item{col.txt}{color code(s) used for the labels (if \code{labels >= 2}).}
\item{col.clus}{color code for the ellipses (and their labels);
only one if \code{color} is false (as per default).}
\item{cex, cex.txt}{character \bold{ex}pansion (size), for the point
symbols and point labels, respectively.}
\item{xlim, ylim}{numeric vectors of length 2, giving the x- and y-
ranges as in \code{\link{plot.default}}.}
\item{main}{main title for the plot; by default, one is constructed.}
\item{sub}{sub title for the plot; by default, one is constructed.}
\item{xlab, ylab}{x- and y- axis labels for the plot, with defaults.}
\item{verbose}{a logical indicating if there should be extra
diagnostic output; mainly for \sQuote{debugging}.}
\item{\dots}{Further graphical parameters may also be supplied, see
\code{\link{par}}.}
}% End Arguments
\value{
An invisible list with components:
\item{Distances}{
When \code{lines} is 1 or 2 we obtain a k by k matrix (k is the number of
clusters). The element in \code{[i,j]} is the distance between ellipse
i and ellipse j.\cr
If \code{lines = 0}, then the value of this component is \code{NA}.
}
\item{Shading}{
A vector of length k (where k is the number of clusters), containing the
amount of shading per cluster. Let y be a vector where element i is the
ratio between the number of points in cluster i and the area of ellipse i.
When the cluster i is a line segment, y[i] and the density of the cluster are
set to \code{NA}. Let z be the sum of all the elements of y without the NAs.
Then we put \code{shading = y/z * 37 + 3}.
}
}
\section{Side Effects}{
a visual display of the clustering is plotted on the current graphics device.
}
\details{
\code{clusplot} uses function calls
\code{\link{princomp}(*, cor = (ncol(x) > 2))} or
\code{\link{cmdscale}(*, add=TRUE)}, respectively, depending on
\code{diss} being false or true. These functions are data reduction
techniques to represent the data in a bivariate plot.
Ellipses are then drawn to indicate the clusters. The further layout of the
plot is determined by the optional arguments.
}
\note{
When we have 4 or fewer clusters, then \code{color=TRUE} gives
every cluster a different color. When there are more than 4 clusters,
clusplot uses the function \code{\link{pam}} to cluster the
densities into 4 groups such that ellipses with nearly the same
density get the same color. \code{col.clus} specifies the colors used.
The \code{col.p} and \code{col.txt} arguments, added for \R,
are recycled to have length the number of observations.
If \code{col.p} has more than one value, using \code{color = TRUE} can
be confusing because of a mix of point and ellipse colors.
}
\references{
Pison, G., Struyf, A. and Rousseeuw, P.J. (1999)
Displaying a Clustering with CLUSPLOT,
\emph{Computational Statistics and Data Analysis}, \bold{30}, 381--392.\cr
%% Jan.2015 : no longer there:
%% A version of this is available as technical report from
%% \url{http://www.agoras.ua.ac.be/abstract/Disclu99.htm}
Kaufman, L. and Rousseeuw, P.J. (1990).
\emph{Finding Groups in Data: An Introduction to Cluster Analysis.}
Wiley, New York.
Struyf, A., Hubert, M. and Rousseeuw, P.J. (1997).
Integrating Robust Clustering Techniques in S-PLUS,
\emph{Computational Statistics and Data Analysis}, \bold{26}, 17--37.
}
\seealso{
\code{\link{princomp}}, \code{\link{cmdscale}}, \code{\link{pam}},
\code{\link{clara}}, \code{\link{daisy}}, \code{\link{par}},
\code{\link{identify}}, \code{\link[MASS]{cov.mve}},
\code{\link{clusplot.partition}}.
}
\examples{
## plotting votes.diss(dissimilarity) in a bivariate plot and
## partitioning into 2 clusters
data(votes.repub)
votes.diss <- daisy(votes.repub)
pamv <- pam(votes.diss, 2, diss = TRUE)
clusplot(pamv, shade = TRUE)
## is the same as
votes.clus <- pamv$clustering
clusplot(votes.diss, votes.clus, diss = TRUE, shade = TRUE)
## Now look at components 3 and 2 instead of 1 and 2:
str(cMDS <- cmdscale(votes.diss, k=3, add=TRUE))
clusplot(pamv, s.x.2d = list(x=cMDS$points[, c(3,2)],
labs=rownames(votes.repub), var.dec=NA),
shade = TRUE, col.p = votes.clus,
sub="", xlab = "Component 3", ylab = "Component 2")
clusplot(pamv, col.p = votes.clus, labels = 4)# color points and label ellipses
# "simple" cheap ellipses: larger than minimum volume:
# here they are *added* to the previous plot:
clusplot(pamv, span = FALSE, add = TRUE, col.clus = "midnightblue")
## Setting a small *label* size:
clusplot(votes.diss, votes.clus, diss = TRUE, labels = 3, cex.txt = 0.6)
if(dev.interactive()) { # uses identify() *interactively* :
clusplot(votes.diss, votes.clus, diss = TRUE, shade = TRUE, labels = 1)
clusplot(votes.diss, votes.clus, diss = TRUE, labels = 5)# ident. only points
}
## plotting iris (data frame) in a 2-dimensional plot and partitioning
## into 3 clusters.
data(iris)
iris.x <- iris[, 1:4]
cl3 <- pam(iris.x, 3)$clustering
op <- par(mfrow= c(2,2))
clusplot(iris.x, cl3, color = TRUE)
U <- par("usr")
## zoom in :
rect(0,-1, 2,1, border = "orange", lwd=2)
clusplot(iris.x, cl3, color = TRUE, xlim = c(0,2), ylim = c(-1,1))
box(col="orange",lwd=2); mtext("sub region", font = 4, cex = 2)
## or zoom out :
clusplot(iris.x, cl3, color = TRUE, xlim = c(-4,4), ylim = c(-4,4))
mtext("'super' region", font = 4, cex = 2)
rect(U[1],U[3], U[2],U[4], lwd=2, lty = 3)
# reset graphics
par(op)
}
\keyword{cluster}
\keyword{hplot}
|