1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
|
# Copyright (C) Tal Galili
#
# This file is part of dendextend.
#
# dendextend is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# dendextend is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
#' @title Draw Rectangles Around a Dendrogram's Clusters
#' @export
#' @description
#' Draws rectangles around the branches of a dendrogram
#' highlighting the corresponding clusters.
#' First the dendrogram is cut at a certain level,
#' then a rectangle is drawn around selected branches.
#' @param tree a \link{dendrogram} object.
#' @param k Scalar. Cut the dendrogram such that exactly k clusters (if possible) are produced.
#' @param h Scalar. Cut the dendrogram by cutting at height h. (k overrides h)
#' @param which A vector selecting the clusters around
#' which a rectangle should be drawn. which selects clusters by number
#' (from left to right in the tree), Default is which = 1:k.
#' @param x A vector selecting the clusters around
#' which a rectangle should be drawn. x selects clusters
#' containing the respective horizontal coordinates.
#' @param border Vector with border colors for the rectangles.
#' @param cluster Optional vector with cluster memberships as returned by
#' cutree(dend_obj, k = k), can be specified for efficiency if already computed.
#' @param horiz logical (FALSE), indicating if the rectangles
#' should be drawn horizontally or not (for when using
#' plot(dend, horiz = TRUE) ) .
#' @param density Passed to \link{rect}: the density of shading lines,
#' in lines per inch. The default value of NULL means that
#' no shading lines are drawn. A zero value of density means
#' no shading lines whereas negative values (and NA)
#' suppress shading (and so allow color filling).
#' If border is a vector of colors, the color of density will default to 1.
#' @param angle Passed to \link{rect}: angle (in degrees) of the shading lines.
#' (default is 45)
#' @param text a character vector of labels to plot underneath the clusters.
#' When NULL (default), no text is displayed.
#' @param text_cex a numeric (scalar) value of the text's cex value.
#' @param text_col a (scalar) value of the text's col(or) value.
#' @param xpd A logical value (or NA.), passed to \link{par}.
#' Default is TRUE, in order to allow the rect to be below the labels.
#' If FALSE, all plotting is clipped to the plot region, if TRUE,
#' all plotting is clipped to the figure region, and if NA, all plotting
#' is clipped to the device region. See also \link{clip}.
#' @param lower_rect a (scalar) value of how low should the lower part of the rect be.
#' If missing, it will take the value of par("usr")[3L] (or par("usr")[2L], depending
#' if horiz = TRUE or not), with also the width of the labels. (notice that we
#' would like to keep xpd = TRUE if we want the rect to be after the labels!)
#' You can use a value such as 0, to get the rect above the labels.
#'
#' Notice that for a plot with small margins, it would be better to set this
#' parameter manually.
#' @param upper_rect a (scalar) value to add (default is 0) to how high should the upper part of the rect be.
#' @param prop_k_height a (scalar) value (should be between 0 to 1), indicating what proportion
#' of the height our rect will be between the height needed for k and k+1 clustering.
#' @param stop_if_out logical (default is TRUE). This makes the function
#' stop if k of the locator is outside the range (this default reproduces the behavior
#' of the rect.hclust function).
#' @param ... parameters passed to rect (such as lwd, lty, etc.)
#' @seealso
#' \link{rect.hclust}, \link{order.dendrogram}, \link{cutree.dendrogram}
#' @return
#' (Invisibly) returns a list where each element contains a vector
#' of data points contained in the respective cluster.
#' @source
#' This function is based on \link{rect.hclust}, with slight modifications
#' to have it work with a dendrogram, as well as a few added features
#' (e.g: ... to rect, and horiz)
#'
#' The idea of adding text and shading lines under the clusters comes from
#' skullkey from here:
#' \url{http://stackoverflow.com/questions/4720307/change-dendrogram-leaves}
#'
#' @examples
#'
#' set.seed(23235)
#' ss <- sample(1:150, 10 )
#' hc <- iris[ss,-5] %>% dist %>% hclust
#' dend <- hc %>% as.dendrogram
#'
#' plot(dend)
#' rect.dendrogram(dend,2, border = 2)
#' rect.dendrogram(dend,3, border = 4)
#' Vectorize(rect.dendrogram, "k")(dend,4:5, border = 6)
#'
#' plot(dend)
#' rect.dendrogram(dend, 3 , border = 1:3,
#' density = 2, text = c("1", "b", "miao"), text_cex = 3)
#'
#' plot(dend)
#' rect.dendrogram(dend,4, which = c(1,3), border = c(2,3))
#' rect.dendrogram(dend,4, x = 5, border = c(4))
#' rect.dendrogram(dend,3, border = 3, lwd = 2, lty = 2)
#' # now THIS, you can not do with the old rect.hclust
#' plot(dend, horiz = TRUE)
#' rect.dendrogram(dend,2, border = 2, horiz = TRUE)
#' rect.dendrogram(dend,4, border = 4, lty = 2, lwd = 3, horiz = TRUE)
#'
#' # This had previously failed since it worked with a wrong k.
#'
#' dend15 <- c(1:5) %>% dist %>% hclust(method = "average") %>% as.dendrogram
#' # dend15 <- c(1:25) %>% dist %>% hclust(method = "average") %>% as.dendrogram
#' dend15 %>% set("branches_k_color") %>% plot
#' dend15 %>% rect.dendrogram(k=3,
#' border = 8, lty = 5, lwd = 2)
rect.dendrogram <- function (tree, k = NULL, which = NULL, x = NULL, h = NULL, border = 2,
cluster = NULL, horiz = FALSE, density = NULL, angle = 45,
text = NULL, text_cex = 1, text_col = 1 , xpd = TRUE,
lower_rect, upper_rect = 0, prop_k_height = 0.5,
stop_if_out = FALSE,
...)
{
if(!is.dendrogram(tree)) stop("x is not a dendrogram object.")
if (length(h) > 1L | length(k) > 1L)
stop("'k' and 'h' must be a scalar(i.e.: of length 1)")
# In tree_heights I am removing the first element
# in order to be consistant with rect.hclust
tree_heights <- heights_per_k.dendrogram(tree)[-1] # this is NOT really tree heights, but the height for which you need to cut in order to cut the tree
tree_order <- order.dendrogram(tree)
# rm0 <- function(x) x[x != 0]
# height_to_add <- min(rm0(abs(diff(tree_heights))))/2 # the height to add so to be sure we get a "clear" cut
if (!is.null(h)) {
if (!is.null(k))
stop("specify exactly one of 'k' and 'h'")
ss_ks <- tree_heights < h
k <- min(as.numeric(names(ss_ks))[ss_ks])
k <- max(k, 2) # I don't like this default...
}
else if (is.null(k))
stop("specify exactly one of 'k' and 'h'")
if (k < 2 | k > length(tree_heights)) {
if(stop_if_out) {
stop(gettextf("k must be between 2 and %d", length(tree_heights)),
domain = NA)
} else {
warning(gettextf("k must be between 2 and %d", length(tree_heights)),
domain = NA)
}
}
if (is.null(cluster))
cluster <- cutree(tree, k = k)
clustab <- table(cluster)[unique(cluster[tree_order])]
m <- c(0, cumsum(clustab))
if (!is.null(x)) {
if (!is.null(which))
stop("specify exactly one of 'which' and 'x'")
which <- x
for (n in seq_along(x)) which[n] <- max(which(m < x[n]))
} else if (is.null(which))
which <- 1L:k
if (any(which > k))
stop(gettextf("all elements of 'which' must be between 1 and %d",
k), domain = NA)
border <- rep_len(border, length(which))
retval <- list()
old_xpd <- par()["xpd"]
par(xpd=xpd)
for (n in seq_along(which)) {
# this is to deal with the case when k is not defined for all values
# and that we can not use the next k+1 value to decide on the height to use.
next_k_height <- tree_heights[names(tree_heights) == k+1]
if(length(next_k_height) == 0) {
next_k_height <- 0
prop_k_height <- 1 # use only the "k" now.
# if(upper_rect == 0) upper_rect <- min(abs(diff(tree_heights))) / 2
}
if(!horiz) { # the default
xleft = m[which[n]] + 0.66
# if(missing(lower_rect)) lower_rect <- par("usr")[3L] - strheight("W")*(max(nchar(labels(tree))) + 1)
if(missing(lower_rect)) {
lower_rect <- -max(strheight2(labels(tree)))
dLeaf <- -0.75 * strheight("x")
extra_space <- -strheight2("_")
lower_rect <- lower_rect + dLeaf + extra_space
}
ybottom = lower_rect
xright = m[which[n] + 1] + 0.33
# ytop = mean(tree_heights[(k - 1):k])
# ytop = tree_heights[k] + abs(ybottom)
# ytop = mean(tree_heights[(k - 1):k]) + abs(ybottom)
ytop <- tree_heights[names(tree_heights) == k] * prop_k_height +
next_k_height * (1-prop_k_height) +
upper_rect # tree_heights[k] + height_to_add # + abs(xright)
} else {
ybottom = m[which[n]] + 0.66
# if(missing(lower_rect)) lower_rect <- par("usr")[2L] + strwidth("X")*(max(nchar(labels(tree))) + 1)
# if(missing(lower_rect)) lower_rect <- -max(strwidth(labels(dend)))
if(missing(lower_rect)) {
lower_rect <- min(strwidth(labels(tree))) # notice the char length is negative!
dLeaf <- 0.75 * strwidth("w")
extra_space <- strwidth("_")
lower_rect <- lower_rect + dLeaf + extra_space
}
xright = lower_rect
ytop = m[which[n] + 1] + 0.33
# xleft = mean(tree_heights[(k - 1):k])
xleft <-
tree_heights[names(tree_heights) == k] * prop_k_height +
next_k_height * (1-prop_k_height) +
upper_rect # tree_heights[k] + height_to_add # + abs(xright)
}
rect(xleft,
ybottom,
xright ,
ytop,
border = border[n], density = density, angle = angle, ...)
# allow for a vectorized version of "text"
if(!is.null(text)) text((m[which[n]] + m[which[n] + 1]+1)/2,
grconvertY(grconvertY(par("usr")[3L],"user","ndc")+0.02,"ndc","user"),
text[n],
cex = text_cex, col = text_col)
retval[[n]] <- which(cluster == as.integer(names(clustab)[which[n]]))
}
par(xpd=old_xpd)
invisible(retval)
}
#' @title Identify Clusters in a Dendrogram (not hclust)
#' @export
#' @description
#' Just like \link{identify.hclust}:
#' reads the position of the graphics pointer when the (first)
#' mouse button is pressed. It then cuts the tree at the vertical
#' position of the pointer and highlights the cluster containing
#' the horizontal position of the pointer. Optionally a function is applied
#' to the index of data points contained in the cluster.
#'
#' @param x a \link{dendrogram} object.
#' @param FUN (optional) function to be applied to the index numbers of the
#' data points in a cluster (see 'Details' below).
#' @param N the maximum number of clusters to be identified.
#' @param MAXCLUSTER the maximum number of clusters that can be produced by
#' a cut (limits the effective vertical range of the pointer).
#' @param DEV.FUN (optional) integer scalar. If specified, the corresponding
#' graphics device is made active before FUN is applied.
#' @param horiz logical (FALSE), indicating if the rectangles
#' should be drawn horizontally or not (for when using
#' plot(dend, horiz = TRUE) ) .
#' @param stop_if_out logical (default is FALSE). This default makes the function
#' NOT stop if k of the locator is outside the range (this default is different than the behavior
#' of the identify.hclust function - but it is nicer for the user.).
#' @param ... further arguments to FUN.
#' @details
#' By default clusters can be identified using the mouse and an invisible
#' list of indices of the respective data points is returned.
#' If FUN is not NULL, then the index vector of data points is passed to this
#' function as first argument, see the examples below. The active
#' graphics device for FUN can be specified using DEV.FUN.
#' The identification process is terminated by pressing any mouse button other
#' than the first, see also identify.
#' @seealso
#' \link{identify.hclust},
#' \link{rect.hclust}, \link{order.dendrogram}, \link{cutree.dendrogram}
#' @return
#' (Invisibly) returns a list where each element contains a vector
#' of data points contained in the respective cluster.
#' @source
#' This function is based on \link{identify.hclust}, with slight modifications
#' to have it work with a dendrogram, as well as adding "horiz"
#' @examples
#'
#' \dontrun{
#' set.seed(23235)
#' ss <- sample(1:150, 10 )
#' hc <- iris[ss,-5] %>% dist %>% hclust
#' dend <- hc %>% as.dendrogram
#'
#' plot(dend)
#' identify(dend)
#'
#' plot(dend, horiz = TRUE)
#' identify(dend, horiz = TRUE)
#'
#' }
identify.dendrogram <- function (x, FUN = NULL, N = 20, MAXCLUSTER, DEV.FUN = NULL,
horiz = FALSE,
stop_if_out = FALSE,
...)
{
# In tree_heights I am removing the first element
# in order to be consistant with rect.hclust
x_heights <- heights_per_k.dendrogram(x)[-1]
# this is like rev(x$height)
# tree_order <- order.dendrogram(tree)
if(missing(MAXCLUSTER)) MAXCLUSTER <- nleaves(x)
cluster <- cutree(x, k = 2:MAXCLUSTER)
retval <- list()
oldk <- NULL
oldx <- NULL
DEV.x <- grDevices::dev.cur()
for (n in 1L:N) {
grDevices::dev.set(DEV.x)
X <- locator(1)
if(horiz) {
tmp <- X$x
X$x <- X$y
X$y <- tmp
}
if (is.null(X))
break
k <- min(which(x_heights < X$y), MAXCLUSTER)
k <- max(k, 2)
if (!is.null(oldx)) {
rect.dendrogram(x, k = oldk, x = oldx,
cluster = cluster[, oldk - 1],
border = "grey", horiz = horiz, stop_if_out = stop_if_out)
}
retval[[n]] <- unlist(rect.dendrogram(x, k = k, x = X$x,
cluster = cluster[, k - 1],
border = "red", horiz = horiz, stop_if_out = stop_if_out))
if (!is.null(FUN)) {
if (!is.null(DEV.FUN)) {
grDevices::dev.set(DEV.FUN)
}
retval[[n]] <- FUN(retval[[n]], ...)
}
oldx <- X$x
oldk <- k
}
grDevices::dev.set(DEV.x)
invisible(retval)
}
|