File: pvclust.Rd

package info (click to toggle)
pvclust 1.2-2-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 188 kB
  • sloc: makefile: 3
file content (203 lines) | stat: -rwxr-xr-x 7,659 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
\name{pvclust}
\alias{pvclust}
\alias{parPvclust}
\title{Calculating P-values for Hierchical Clustering}
\description{
  calculates \eqn{p}-values for hierarchical clustering via
  multiscale bootstrap resampling. Hierarchical clustering is done for
  given data and \eqn{p}-values are computed for each of the clusters.
}
\usage{
pvclust(data, method.hclust="average",
        method.dist="correlation", use.cor="pairwise.complete.obs",
        nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)

parPvclust(cl, data, method.hclust="average",
           method.dist="correlation", use.cor="pairwise.complete.obs",
           nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE,
           init.rand=TRUE, seed=NULL)

}
\arguments{
  \item{data}{numeric data matrix or data frame.}
  \item{method.hclust}{
    the agglomerative method used in hierarchical clustering. This
    should be (an abbreviation of) one of \code{"average"}, \code{"ward"},
    \code{"single"}, \code{"complete"}, \code{"mcquitty"},
    \code{"median"} or \code{"centroid"}. The default is
    \code{"average"}. See \code{method} argument in
    \code{\link[stats]{hclust}}.
  }
  \item{method.dist}{the distance measure to be used. This should be (an
    abbreviation of) one of \code{"correlation"}, \code{"uncentered"},
    \code{"abscor"} or those which are allowed for \code{method}
    argument in \code{dist} function. The default is
    \code{"correlation"}. See \emph{details} section in this help and
    \code{method} argument in \code{\link[stats]{dist}}.
    }
  \item{use.cor}{character string which specifies the method for
    computing correlation with data including missing values. This
    should be (an abbreviation of) one of \code{"all.obs"},
    \code{"complete.obs"} or \code{"pairwise.complete.obs"}. See
    the \code{use} argument in \code{\link[stats]{cor}} function.
  }
  \item{nboot}{the number of bootstrap replications. The default is
    \code{1000}.}
  \item{r}{numeric vector which specifies the relative sample sizes of
    bootstrap replications. For original sample size \eqn{n} and
    bootstrap sample size \eqn{n'}, this is defined as \eqn{r=n'/n}.}
  \item{store}{locical. If \code{store=TRUE}, all bootstrap replications
  are stored in the output object. The default is \code{FALSE}.}
  \item{cl}{\pkg{snow} cluster object which may be generated by
    function \code{makeCluster}. See \code{\link[snow]{snow-startstop}}
    in \pkg{snow} package.}
  \item{weight}{logical. If \code{weight=TRUE}, resampling is made by
    weight vector instead of index vector. Useful for large \code{r}
    value (\code{r>10}).  Currently, available only for distance
    \code{"correlation"} and \code{"abscor"}.}
  \item{init.rand}{logical. If \code{init.rand=TRUE}, random number
    generators are initialized at child processes. Random seeds can be
    set by \code{seed} argument.}
  \item{seed}{integer vector of random seeds. It should have the same
    length as \code{cl}. If \code{NULL} is specified,
    \code{1:length(cl)} is used as seed vector. The default is \code{NULL}.}
}
\details{
  Function \code{pvclust} conducts multiscale bootstrap resampling to calculate
  \eqn{p}-values for each cluster in the result of hierarchical
  clustering. \code{parPvclust} is the parallel version of this
  procedure which depends on \pkg{snow} package for parallel
  computation.

  For data expressed as \eqn{(n \times p)}{(n, p)} matrix or data frame, we
  assume that the data is \eqn{n} observations of \eqn{p} objects, which
  are to be clustered. The \eqn{i}'th row vector corresponds to the
  \eqn{i}'th observation of these objects and the \eqn{j}'th column
  vector corresponds to a sample of \eqn{j}'th object with size \eqn{n}.

  There are several methods to measure the dissimilarities between
  objects. For data matrix \eqn{X=\{x_{ij}\}}{X},
  \code{"correlation"}
  method takes
  \deqn{
    1 - \frac{
      \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
    }
    {
      \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
      \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
    }
    }{%
      1 - cor(X)[j,k]
    }
  for dissimilarity between \eqn{j}'th and \eqn{k}'th object, where
  \eqn{\bar{x}_j = \frac{1}{n} \sum_{i=1}^n x_{ij} \mbox{and}
  \bar{x}_k = \frac{1}{n} \sum_{i=1}^n x_{ik}}{cor is function \code{cor}}.

  \code{"uncentered"} takes uncentered sample correlation
  \deqn{
    1 - \frac{
      \sum_{i=1}^n x_{ij} x_{ik}
    }
    {
      \sqrt{\sum_{i=1}^n x_{ij}^2}
      \sqrt{\sum_{i=1}^n x_{ik}^2}
    }
    }{%
      1 - sum(x[,j] * x[,k]) / (sqrt(sum(x[,j]^2)) * sqrt(sum(x[,k]^2)))
    }
  and \code{"abscor"} takes the absolute value of sample correlation
  \deqn{
    1 - \ \Biggl| \frac{
      \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
    }
    {
      \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
      \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
    } \Biggl|.
  }{%
    1 - abs(cor(X)[j,k]).
  }
  }
\value{
  \item{hclust}{hierarchical clustering for original data generated by
    function \code{hclust}. See \code{\link[stats]{hclust}} for details.}
  \item{edges}{data frame object which contains \eqn{p}-values and
    supporting informations such as standard errors.}
  \item{count}{data frame object which contains primitive information
    about the result of multiscale bootstrap resampling.}
  \item{msfit}{list whose elements are results of curve fitting for
    multiscale bootstrap resampling, of class \code{msfit}. See
    \code{\link{msfit}} for details.}
  \item{nboot}{numeric vector of number of bootstrap replications.}
  \item{r}{numeric vector of the relative sample size for bootstrap
    replications.}
  \item{store}{list contains bootstrap replications if \code{store=TRUE}
  was given for function \code{pvclust} or \code{parPvclust}.}
}
\seealso{\code{\link{lines.pvclust}}, \code{\link{print.pvclust}},
  \code{\link{msfit}}, \code{\link{plot.pvclust}},
  \code{\link{text.pvclust}}, \code{\link{pvrect}} and
  \code{\link{pvpick}}.}
\references{
  Shimodaira, H. (2004)
  "Approximately unbiased tests of regions using multistep-multiscale
  bootstrap resampling",
  \emph{Annals of Statistics}, 32, 2616-2641.

  Shimodaira, H. (2002)
  "An approximately unbiased test of phylogenetic tree selection",
  \emph{Systematic Biology}, 51, 492-508.

  Suzuki, R. and Shimodaira, H. (2004)
  "An application of multiscale bootstrap resampling to hierarchical
  clustering of microarray data: How accurate are these clusters?",
  \emph{The Fifteenth International Conference on Genome Informatics 2004},
  P034.

  \url{http://www.is.titech.ac.jp/~shimo/prog/pvclust/}
}
\examples{
## using Boston data in package MASS
library(MASS)
data(Boston)

## multiscale bootstrap resampling
boston.pv <- pvclust(Boston, nboot=100)

## CAUTION: nboot=100 may be too small for actual use.
##          We suggest nboot=1000 or larger.
##          plot/print functions will be useful for diagnostics.

## plot dendrogram with p-values
plot(boston.pv)

ask.bak <- par()$ask
par(ask=TRUE)

## highlight clusters with high au p-values
pvrect(boston.pv)

## print the result of multiscale bootstrap resampling
print(boston.pv, digits=3)

## plot diagnostic for curve fitting
msplot(boston.pv, edges=c(2,4,6,7))

par(ask=ask.bak)

## Print clusters with high p-values
boston.pp <- pvpick(boston.pv)
boston.pp

\dontrun{
## parallel computation via snow package
library(snow)
cl <- makeCluster(10, type="MPI")

## parallel version of pvclust
boston.pv <- parPvclust(cl, Boston, nboot=1000)
}
}
\author{Ryota Suzuki \email{ryota.suzuki@is.titech.ac.jp}}
\keyword{cluster}