1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
|
# Copyright (C) Tal Galili
#
# This file is part of dendextend.
#
# dendextend is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# dendextend is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
#' @title Duplicate a leaf X times
#' @export
#' @description
#' Duplicates a leaf in a tree. Useful for non-parametric bootstraping trees
#' since it emulates what would have happened if the tree was constructed based
#' on a row-sample with replacments from the original data matrix.
#' @param dend a dendrogram object
#' @param leaf_label the label of the laef to replicate.
#' @param times the number of times we will have this leaf after replication
#' @param fix_members logical (TRUE). Fix the number of members in attr
#' using \link{fix_members_attr.dendrogram}
#' @param fix_order logical (TRUE). Fix the leaves order
#' @param fix_midpoint logical (TRUE). Fix the midpoint value.
#' If TRUE, it overrides "fix_members" and turns it into TRUE (since it must
#' have a correct number of members in order to work).
#' values using \link{rank_order.dendrogram}
#' @param ... not used
#' @return
#' A dendrogram, after duplicating one of its leaves.
#' @examples
#'
#' \dontrun{
#' # define dendrogram object to play with:
#' dend <- USArrests[1:3, ] %>%
#' dist() %>%
#' hclust(method = "ave") %>%
#' as.dendrogram()
#' plot(dend)
#' duplicate_leaf(dend, "Alaska", 3)
#' duplicate_leaf(dend, "Arizona", 2, fix_members = FALSE, fix_order = FALSE)
#' plot(duplicate_leaf(dend, "Alaska", 2))
#' plot(duplicate_leaf(dend, "Alaska", 4))
#' plot(duplicate_leaf(dend, "Arizona", 2))
#' plot(duplicate_leaf(dend, "Arizona", 4))
#' }
#'
duplicate_leaf <- function(dend, leaf_label, times, fix_members = TRUE, fix_order = TRUE, fix_midpoint = TRUE, ...) {
# extreme case:
if (length(leaf_label) > 1) {
leaf_label <- leaf_label[1]
warning("leaf_label had a length >1, only the first element was used.")
}
# dend_node <- dend
duplicate_leaf_in_node <- function(dend_node) {
# if this is a leaf - we can not replicate the leaf
# we have to do it in the parent node.
if (is.leaf(dend_node)) {
return(unclass(dend_node))
}
# if leaf_label is not in this node, return the dend_node
# as there is no point to keep going done that node
if (!(leaf_label %in% labels(dend_node))) {
return(unclass(dend_node))
}
# since the leaf is in this node, is it one of its children?
# let's first get the childrens labels
return_label_or_NA <- function(x) {
x_label <- attr(x, "label")
if (is.null(x_label)) x_label <- NA
x_label
}
# typeof(sapply(dend, return_label_or_NA))
childrens_labels <- sapply(dend_node, return_label_or_NA)
# if we can't find the leaf in one of these node's children,
# then Recall the function on all children nodes and then return the node:
if (!(leaf_label %in% childrens_labels)) {
for (i in seq(dend_node)) {
dend_node[[i]] <- Recall(dend_node[[i]])
}
return(unclass(dend_node))
}
# but if it is here(!), let us duplicate it:
n_childrens <- length(childrens_labels)
leaf_label_location <- which(childrens_labels == leaf_label)
the_leaf <- unclass(dend_node[[leaf_label_location]])
leaf_order_value <- as.vector(the_leaf)
for (i in 2:times) {
temp_new_leaf <- the_leaf
attr(temp_new_leaf, "label") <- paste(leaf_label, i, sep = "_")
temp_new_leaf[1] <- leaf_order_value + i / (n_childrens + 1)
dend_node[[n_childrens + i - 1]] <- temp_new_leaf
}
return(unclass(dend_node))
}
if (!(leaf_label %in% labels(dend))) {
warning(paste("The label", leaf_label, "Is not available in this tree. Original tree returned."))
return(dend)
}
if (times == 1) {
# no point in replicating this if it needs to be only once, now is there?
return(dend)
}
if (!is.leaf(dend)) {
dend <- duplicate_leaf_in_node(dend)
} else {
# extreme case:
# in this case, we need to do something special
# if we got this far - we already know this tree has the only leaf we want.
attr(dend, "height") <- 0 # just to be sure the leaf has height 0.
new_dend <- list()
# dend <- rep(dend,times) # FAILS!
for (i in seq_len(times)) new_dend[[i]] <- dend
dend <- new_dend
attr(dend, "members") <- times
attr(dend, "height") <- 1
labels(dend) <- paste(leaf_label, seq_len(times), sep = "_")
}
class(dend) <- "dendrogram"
if (fix_midpoint) fix_members <- TRUE # for midpoint to work, we MUST have a correct number of members.
if (fix_members) dend <- fix_members_attr.dendrogram(dend)
if (fix_order) dend <- rank_order.dendrogram(dend)
if (fix_midpoint) dend <- suppressWarnings(stats_midcache.dendrogram(dend)) # fixing the middle point thing
return(dend)
}
#' @title Sample a tree
#' @export
#' @description
#' Samples a tree, either by permuting the labels (which is usefull for
#' a permutation test), or by repeated sampling of the same labels (essential
#' for bootstraping when we don't have access to the original data which
#' produced the tree).
#'
#' Duplicates a leaf in a tree. Useful for non-parametric bootstraping trees
#' since it emulates what would have happened if the tree was constructed based
#' on a row-sample with replacments from the original data matrix.
#' @param dend a dendrogram object
#' @param replace logical (FALSE). Should we shuffle the labels (if FALSE),
#' or should we replicate the same leaf over and over, while omitting other
#' leaves? (this is when set to TRUE).
#' @param dend_labels a character vector of the tree's labels.
#' This can save the time it takes for getting the tree labels (in case we run
#' a simulating, computing this once might save some running time).
#' If missing, it uses \link{labels} in order to get the labels.
#' @param sampled_labels a character vector of the tree's sampled labels.
#' This can help us if we wish to compare two trees. In such a case we'd like
#' to be able to have the same sample of labels used on both trees.
#' If missing, it uses \link{sample} in order to get the sampled labels.
#'
#' Only works when replace=TRUE!
#' @param fix_members logical (TRUE). Fix the number of members in attr
#' using \link{fix_members_attr.dendrogram}
#' @param fix_order logical (TRUE). Fix the leaves order
#' @param fix_midpoint logical (TRUE). Fix the midpoint value.
#' If TRUE, it overrides "fix_members" and turns it into TRUE (since it must
#' have a correct number of members in order to work).
#' values using \link{rank_order.dendrogram}
#' @param ... not used
#' @return
#' A dendrogram, after "sampling" its leaves.
#' @seealso
#' \link{sample}, \link{duplicate_leaf}
#' @examples
#'
#' \dontrun{
#' # define dendrogram object to play with:
#' dend <- USArrests[1:5, ] %>%
#' dist() %>%
#' hclust(method = "ave") %>%
#' as.dendrogram()
#' plot(dend)
#'
#' # # same tree, with different order of labels
#' plot(sample.dendrogram(dend, replace = FALSE))
#'
#' # # A different tree (!), with some labels duplicated,
#' # while others are pruned
#' plot(sample.dendrogram(dend, replace = TRUE))
#' }
#'
sample.dendrogram <- function(dend, replace = FALSE,
dend_labels, sampled_labels,
fix_members = TRUE, fix_order = TRUE, fix_midpoint = TRUE, ...) {
if (missing(dend_labels)) dend_labels <- labels(dend)
if (replace) {
if (missing(sampled_labels)) sampled_labels <- sample(dend_labels, replace = TRUE)
# 1) prune redundent leaves
ss_kept_labels <- dend_labels %in% sampled_labels
ss_removed_labels <- !ss_kept_labels
removed_labels <- dend_labels[ss_removed_labels]
dend <- prune(dend, leaves = removed_labels)
# 2) add new leaves
#
# table of the new sampled labels
t_sampled_labels <- table(sampled_labels)
names_t_sampled_labels <- names(t_sampled_labels)
for (i in seq_along(t_sampled_labels)) {
dend <- duplicate_leaf(dend,
leaf_label = names_t_sampled_labels[i],
times = unname(t_sampled_labels[i]),
fix_members = FALSE,
fix_order = FALSE,
fix_midpoint = FALSE
)
}
# it is better to do it here than over at duplicate_leaf
# since we are fine with doing this fix only once after we finished
# adding all of the new labels.
# # 3) Fix members
# # 4) Fix leaves order values
if (fix_midpoint) fix_members <- TRUE # for midpoint to work, we MUST have a correct number of members.
if (fix_members) dend <- fix_members_attr.dendrogram(dend)
if (fix_order) dend <- rank_order.dendrogram(dend)
if (fix_midpoint) dend <- suppressWarnings(stats_midcache.dendrogram(dend)) # fixing the middle point thing
} else {
# don't replace
# this is MUCH simpler, and useful for permutation tests.
n_dend <- nleaves(dend)
new_order <- sample(n_dend)
labels(dend) <- dend_labels[new_order]
if (fix_order) order.dendrogram(dend) <- order.dendrogram(dend)[new_order]
}
class(dend) <- "dendrogram"
return(dend)
}
|