1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
|
\name{pvclust}
\alias{pvclust}
\alias{parPvclust}
\title{Calculating P-values for Hierchical Clustering}
\description{
calculates \eqn{p}-values for hierarchical clustering via
multiscale bootstrap resampling. Hierarchical clustering is done for
given data and \eqn{p}-values are computed for each of the clusters.
}
\usage{
pvclust(data, method.hclust="average",
method.dist="correlation", use.cor="pairwise.complete.obs",
nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)
parPvclust(cl=NULL, data, method.hclust="average",
method.dist="correlation", use.cor="pairwise.complete.obs",
nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE,
init.rand=TRUE, seed=NULL, iseed=NULL)
}
\arguments{
\item{data}{numeric data matrix or data frame.}
\item{method.hclust}{
the agglomerative method used in hierarchical clustering. This
should be (an abbreviation of) one of \code{"average"}, \code{"ward"},
\code{"single"}, \code{"complete"}, \code{"mcquitty"},
\code{"median"} or \code{"centroid"}. The default is
\code{"average"}. See \code{method} argument in
\code{\link[stats]{hclust}}.
}
\item{method.dist}{the distance measure to be used. This should be (an
abbreviation of) one of \code{"correlation"}, \code{"uncentered"},
\code{"abscor"} or those which are allowed for \code{method}
argument in \code{dist} function. The default is
\code{"correlation"}. See \emph{details} section in this help and
\code{method} argument in \code{\link[stats]{dist}}.
}
\item{use.cor}{character string which specifies the method for
computing correlation with data including missing values. This
should be (an abbreviation of) one of \code{"all.obs"},
\code{"complete.obs"} or \code{"pairwise.complete.obs"}. See
the \code{use} argument in \code{\link[stats]{cor}} function.
}
\item{nboot}{the number of bootstrap replications. The default is
\code{1000}.}
\item{r}{numeric vector which specifies the relative sample sizes of
bootstrap replications. For original sample size \eqn{n} and
bootstrap sample size \eqn{n'}, this is defined as \eqn{r=n'/n}.}
\item{store}{locical. If \code{store=TRUE}, all bootstrap replications
are stored in the output object. The default is \code{FALSE}.}
\item{cl}{a cluster object created by package \pkg{parallel} or \pkg{snow}.
If NULL, use the registered default cluster.}
\item{weight}{logical. If \code{weight=TRUE}, resampling is made by
weight vector instead of index vector. Useful for large \code{r}
value (\code{r>10}). Currently, available only for distance
\code{"correlation"} and \code{"abscor"}.}
% \item{init.rand}{logical. If \code{init.rand=TRUE}, random number
% generators are initialized at child processes. Random seeds can be
% set by \code{seed} argument.}
% \item{seed}{integer vector of random seeds. It should have the same
% length as \code{cl}. If \code{NULL} is specified,
% \code{1:length(cl)} is used as seed vector. The default is \code{NULL}.}
\item{init.rand}{logical. If \code{init.rand=TRUE}, random number generators are initialized.
Use \code{iseed} argument to achieve reproducible results.}
\item{seed}{integer vector of random seeds. It should have the same
length as \code{cl}. \strong{This argument is duplicated and will be unavailable in the future. Consider using \code{iseed} instead.} }
\item{iseed}{an integer to be passed to \code{clusterSetRNGStream} when \code{init.rand=TRUE}.}
}
\details{
Function \code{pvclust} conducts multiscale bootstrap resampling to calculate
\eqn{p}-values for each cluster in the result of hierarchical
clustering. \code{parPvclust} is the parallel version of this
procedure which depends on package \pkg{parallel} for parallel computation.
For data expressed as \eqn{(n \times p)}{(n, p)} matrix or data frame, we
assume that the data is \eqn{n} observations of \eqn{p} objects, which
are to be clustered. The \eqn{i}'th row vector corresponds to the
\eqn{i}'th observation of these objects and the \eqn{j}'th column
vector corresponds to a sample of \eqn{j}'th object with size \eqn{n}.
There are several methods to measure the dissimilarities between
objects. For data matrix \eqn{X=\{x_{ij}\}}{X},
\code{"correlation"}
method takes
\deqn{
1 - \frac{
\sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
}
{
\sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
\sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
}
}{%
1 - cor(X)[j,k]
}
for dissimilarity between \eqn{j}'th and \eqn{k}'th object, where
\eqn{\bar{x}_j = \frac{1}{n} \sum_{i=1}^n x_{ij} \mbox{and}
\bar{x}_k = \frac{1}{n} \sum_{i=1}^n x_{ik}}{cor is function \code{cor}}.
\code{"uncentered"} takes uncentered sample correlation
\deqn{
1 - \frac{
\sum_{i=1}^n x_{ij} x_{ik}
}
{
\sqrt{\sum_{i=1}^n x_{ij}^2}
\sqrt{\sum_{i=1}^n x_{ik}^2}
}
}{%
1 - sum(x[,j] * x[,k]) / (sqrt(sum(x[,j]^2)) * sqrt(sum(x[,k]^2)))
}
and \code{"abscor"} takes the absolute value of sample correlation
\deqn{
1 - \ \Biggl| \frac{
\sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
}
{
\sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
\sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
} \Biggl|.
}{%
1 - abs(cor(X)[j,k]).
}
}
\value{
\item{hclust}{hierarchical clustering for original data generated by
function \code{hclust}. See \code{\link[stats]{hclust}} for details.}
\item{edges}{data frame object which contains \eqn{p}-values and
supporting informations such as standard errors.}
\item{count}{data frame object which contains primitive information
about the result of multiscale bootstrap resampling.}
\item{msfit}{list whose elements are results of curve fitting for
multiscale bootstrap resampling, of class \code{msfit}. See
\code{\link{msfit}} for details.}
\item{nboot}{numeric vector of number of bootstrap replications.}
\item{r}{numeric vector of the relative sample size for bootstrap
replications.}
\item{store}{list contains bootstrap replications if \code{store=TRUE}
was given for function \code{pvclust} or \code{parPvclust}.}
}
\seealso{\code{\link{lines.pvclust}}, \code{\link{print.pvclust}},
\code{\link{msfit}}, \code{\link{plot.pvclust}},
\code{\link{text.pvclust}}, \code{\link{pvrect}} and
\code{\link{pvpick}}.}
\references{
Shimodaira, H. (2004)
"Approximately unbiased tests of regions using multistep-multiscale
bootstrap resampling",
\emph{Annals of Statistics}, 32, 2616-2641.
Shimodaira, H. (2002)
"An approximately unbiased test of phylogenetic tree selection",
\emph{Systematic Biology}, 51, 492-508.
Suzuki, R. and Shimodaira, H. (2004)
"An application of multiscale bootstrap resampling to hierarchical
clustering of microarray data: How accurate are these clusters?",
\emph{The Fifteenth International Conference on Genome Informatics 2004},
P034.
\url{http://www.is.titech.ac.jp/~shimo/prog/pvclust/}
}
\examples{
## using Boston data in package MASS
library(MASS)
data(Boston)
## multiscale bootstrap resampling
boston.pv <- pvclust(Boston, nboot=100)
## CAUTION: nboot=100 may be too small for actual use.
## We suggest nboot=1000 or larger.
## plot/print functions will be useful for diagnostics.
## plot dendrogram with p-values
plot(boston.pv)
ask.bak <- par()$ask
par(ask=TRUE)
## highlight clusters with high au p-values
pvrect(boston.pv)
## print the result of multiscale bootstrap resampling
print(boston.pv, digits=3)
## plot diagnostic for curve fitting
msplot(boston.pv, edges=c(2,4,6,7))
par(ask=ask.bak)
## Print clusters with high p-values
boston.pp <- pvpick(boston.pv)
boston.pp
\dontrun{
## parallel computation
library(parallel)
cl <- makeCluster(2, type = "PSOCK")
## parallel version of pvclust
boston.pv <- parPvclust(cl, Boston, nboot=1000)
stopCluster(cl)
}
}
\author{Ryota Suzuki \email{suzuki@ef-prime.com}}
\keyword{cluster}
|