File: common_subtrees.R

package info (click to toggle)
r-cran-dendextend 1.19.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 3,076 kB
  • sloc: sh: 13; makefile: 2
file content (210 lines) | stat: -rw-r--r-- 5,654 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Copyright (C) Tal Galili
#
# This file is part of dendextend.
#
# dendextend is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# dendextend is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/
#






# Uses distinct_edges to find which nodes are distinct
# And then returns a logical vector indicating for each node
# if it is one in which it, and all of its children, are nodes
# that contain items that are shared (as a group) with those in dend2
# put differently - these are the nodes that form a sub-tree which will be
# identical (topologically) with some subtree in dend2
nodes_with_shared_labels <- function(dend1, dend2, ...) {
  distinct_nodes <- distinct_edges(dend1, dend2)
  good_nodes <- logical(nnodes(dend1))
  i_nodes <- 0

  check_if_node_is_good <- function(dend_node) {
    i_nodes <<- i_nodes + 1
    current_i_nodes <- i_nodes

    if (is.leaf(dend_node)) {
      good_nodes[current_i_nodes] <<- TRUE
      return(TRUE)
    }

    is_distinct_node <- i_nodes %in% distinct_nodes

    n_children <- length(dend_node)
    are_children_good <- logical(n_children)

    for (i in 1:n_children) {
      are_children_good[i] <- check_if_node_is_good(dend_node[[i]])
    }

    is_good_node <- all(c(are_children_good, !is_distinct_node))
    if (is_good_node) good_nodes[current_i_nodes] <<- TRUE
    return(is_good_node)
  }


  check_if_node_is_good(dend1)

  good_nodes
}




#' Rank a vector based on clusters
#' @export
#'
#' @param x numeric vector
#' @param ignore0 logical (FALSE). If TRUE, will ignore the 0's in the vector
#' @param ... not used
#'
#' @return
#' an integer vector with the number of unique values
#' as the number of uniques in the original vector.
#' And the values are ranked from 1 (in the beginning of the vector)
#' to the number of unique clusters.
#'
#' @examples
#'
#' rank_values_with_clusters(c(1, 2, 3))
#' rank_values_with_clusters(c(1, 1, 3))
#' rank_values_with_clusters(c(0.1, 0.1, 3000))
#' rank_values_with_clusters(c(3, 1, 2))
#' rank_values_with_clusters(c(1, 3, 3, 3, 3, 3, 3, 4, 2, 2))
#'
#' rank_values_with_clusters(c(3, 1, 2), ignore0 = TRUE)
#' rank_values_with_clusters(c(3, 1, 2), ignore0 = FALSE)
#' rank_values_with_clusters(c(3, 1, 0, 2), ignore0 = TRUE)
#' rank_values_with_clusters(c(3, 1, 0, 2), ignore0 = FALSE)
rank_values_with_clusters <- function(x, ignore0 = FALSE, ...) {
  if (ignore0) {
    old_x <- x
    x <- old_x[old_x != 0]
  }

  rle_lengths <- rle(x)$lengths
  x <- rep(seq_along(rle_lengths), times = rle_lengths)

  if (ignore0) {
    old_x[old_x != 0] <- x
    x <- old_x
  }

  x
}




replace_unique_items_with_0_and_rank <- function(x, ...) {
  # insert zeros
  tbl_x <- table(x)
  unique_char_values <- names(tbl_x)[tbl_x == 1]
  ss_unique <- as.character(x) %in% unique_char_values
  x[ss_unique] <- 0
  #    x # this is cleaned

  ### # fails:
  ### x[!ss_unique] <- rank(x[!ss_unique], ties = "min")

  # now we rank
  x[!ss_unique] <- rank_values_with_clusters(x[!ss_unique])
  #    rep(1:3, times = c(1,3,3))
  #    rle(c(1,2,2,3,3,4))
  x
}
# replace_unique_items_with_0_and_rank(c(1,2,2,3,3,4))




#' Find clusters of common subtrees
#' @export
#' @description
#' Gets a dend and the output from "nodes_with_shared_labels"
#' and returns a vector (length of labels), indicating the clusters
#' forming shared subtrees
#' @param dend1 a \link{dendrogram}.
#' @param dend2 a \link{dendrogram}.
#' @param leaves_get_0_cluster logical (TRUE). Should the leaves which are not part of
#' a larger common subtree get a unique cluster number, or the value 0.
#' @param ... not used.
#'
#' @return
#' An integer vector, with values indicating which leaves in dend1 form
#' a common subtree cluster, with ones available in dend2
#' @seealso  \link{color_branches}, \link{tanglegram}
#'
#' @examples
#'
#'
#' library(dendextend)
#' dend1 <- 1:6 %>%
#'   dist() %>%
#'   hclust() %>%
#'   as.dendrogram()
#' dend2 <- dend1 %>% set("labels", c(1:4, 6:5))
#' tanglegram(dend1, dend2)
#'
#' clusters1 <- common_subtrees_clusters(dend1, dend2)
#' dend1_2 <- color_branches(dend1, clusters = clusters1)
#' plot(dend1_2)
#' plot(dend1_2, horiz = TRUE)
#' tanglegram(dend1_2, dend2, highlight_distinct_edges = FALSE)
#' tanglegram(dend1_2, dend2)
common_subtrees_clusters <- function(dend1, dend2, leaves_get_0_cluster = TRUE, ...) {
  good_nodes <- nodes_with_shared_labels(dend1, dend2)
  dend <- dend1


  clusters <- numeric(nleaves(dend))
  cluster_group <- 0
  leaf_id <- 0
  i_nodes <- 0


  fill_clusters <- function(dend_node) {
    i_nodes <<- i_nodes + 1
    current_node <- i_nodes

    if (is.leaf(dend_node)) {
      if (!good_nodes[current_node]) cluster_group <<- cluster_group + 1
      leaf_id <<- leaf_id + 1
      clusters[leaf_id] <<- cluster_group
      return(NULL)
    }

    # If not leaf:
    n_children <- length(dend_node)

    for (i in 1:n_children) {
      if (!good_nodes[current_node]) cluster_group <<- cluster_group + 1
      fill_clusters(dend_node[[i]])
    }

    return(NULL)
  }

  fill_clusters(dend)

  clusters <- rank_values_with_clusters(clusters)

  if (leaves_get_0_cluster) {
    clusters <- replace_unique_items_with_0_and_rank(clusters)
  }

  clusters
}