1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
#' Tip-tip MRCA depths
#'
#' This function creates a matrix where columns 1 and 2 correspond to tip labels and column 3 gives the depth of the MRCA of that pair of tips.
#' It is strongly based on \code{treeVec} and is used by \code{relatedTreeDist} and \code{treeConcordance} where tip labels belong to "categories".
#'
#' @param tree An object of class phylo
#'
#' @import ape
#' @importFrom combinat combn
#' @importFrom Rcpp evalCpp
#' @importFrom methods is
#'
#' @examples
#' tree <- rtree(10)
#' plot(tree)
#' tipsMRCAdepths(tree)
#'
#' @export
tipsMRCAdepths <- function(tree) {
if(!is(tree,"phylo")) stop("Tree should be of class phylo")
if(!is.rooted(tree)) stop("Function is for rooted trees only")
root <- min(tree$edge[,1])
if (length(split(tree$edge[,2], factor(tree$edge[,1], root))[[1]]) < 2) stop("Tree root must have at least two descendants")
tree$edge.length <- rep(1,length(tree$edge))
num_leaves <- length(tree$tip.label)
num_edges <- nrow(tree$edge)
# We work with ordered labels, using this vector to transform indices.
tip_order <- match(1:num_leaves, order(tree$tip.label))
# Ordering the edges by first column places the root at the bottom.
# Descendants will be placed always before parents.
edge_order <- ape::reorder.phylo(tree, "postorder", index.only=T)
edges <- tree$edge[edge_order,]
edge_lengths <- tree$edge.length[edge_order]
# We annotated the nodes of the tree in this list. In two passes we are going to
# compute the partition each node induces in the tips (bottom-up pass) and the distance
# (in branch length and number of branches) from the root to each node (top-down pass).
annotated_nodes <- list()
# Bottom up (compute partitions, we store the branch lengths to compute distances
# to the root on the way down).
for(i in 1:num_edges) {
parent <- edges[i,1]
child <- edges[i,2]
# Initialization (leaves).
if(child <= num_leaves) {
# We translate the index for the sorted labels.
child <- tip_order[child]
# Leaves have as children themselves.
annotated_nodes[[child]] <- list(root_distance=NULL, edges_to_root=1, partitions=list(child))
}
# Aggregate the children partitions (only if we are not visiting a leaf).
aggregated_partitions <- annotated_nodes[[child]]$partitions[[1]]
if((child > num_leaves)) {
for(p in 2:length(annotated_nodes[[child]]$partitions))
aggregated_partitions <- c(aggregated_partitions, annotated_nodes[[child]]$partitions[[p]])
}
# Update the branch length on the child.
annotated_nodes[[child]]$root_distance <- edge_lengths[i]
# We have not visited this internal node before.
if(parent > length(annotated_nodes) || is.null(annotated_nodes[[parent]])) {
# Assume the first time we get the left child partition.
annotated_nodes[[parent]] <- list(root_distance=NULL, edges_to_root=1, partitions=list(aggregated_partitions))
}
# This is not the first time we have visited the node.
else {
# We store the next partition of leaves.
annotated_nodes[[parent]]$partitions[[length(annotated_nodes[[parent]]$partitions)+1]] <- aggregated_partitions
}
}
# Update the distance to the root at the root (i.e. 0)
# And the number of edges to the root (i.e. 0).
annotated_nodes[[num_leaves+1]]$root_distance <- 0
annotated_nodes[[num_leaves+1]]$edges_to_root <- 0
# Top down, compute distances to the root for each node.
for(i in num_edges:1) {
parent <- edges[i,1]
child <- edges[i,2]
# If the child is a leaf we translate the index for the sorted labels.
if(child <= num_leaves)
child <- tip_order[child]
annotated_nodes[[child]]$root_distance <- annotated_nodes[[child]]$root_distance + annotated_nodes[[parent]]$root_distance
annotated_nodes[[child]]$edges_to_root <- annotated_nodes[[child]]$edges_to_root + annotated_nodes[[parent]]$edges_to_root
}
# Distance vectors
vector_length <- (num_leaves*(num_leaves-1)/2) + num_leaves
length_root_distances <- double(vector_length)
topological_root_distances <- integer(vector_length)
# Fill-in the leaves (notice the involved index translation for leaves).
# Consensus version: final k entries are the heights of leaves, not pendant lengths
for (i in (vector_length-num_leaves+1):vector_length) {
topological_root_distances[[i]] <- annotated_nodes[[i-vector_length+num_leaves]]$edges_to_root
}
length_root_distances[(vector_length-num_leaves+1):vector_length] <- edge_lengths[match(1:num_leaves, edges[,2])][order(tree$tip.label)]
# Instead of computing the lexicographic order for the combination pairs assume we
# are filling in a symmetric distance matrix (using only the triangular upper part).
# We just need to "roll" the matrix indices into the vector indices.
# Examples for (k=5)
# The combination c(1,4) would be located at position 3 on the vector.
# The combination c(2,1) would be located at position 1 on the vector because d(2,1) = d(1,2).
# The combination c(2,3) would be located at position 5 on the vector.
index_offsets <- c(0, cumsum((num_leaves-1):1))
# This is the slow part, we compute both vectors as gain would be marginal.
sapply(annotated_nodes, function(node) {
# We skip leaves and the root (if the MRCA for M groups of leaves is at the root
# all combinations of leaves -among different groups- have 0 as distance to the root).
# For large trees this can spare us of computing a lot of combinations.
# Example: In a perfectly balanced binary tree (N/2 leaves at each side of the root),
# at the root we'd save (N/2) * (N/2) combinations to update. Worst case scenario is completely
# unbalanced tree (N-1,1), we'd save in that case only N-1 combinations.
# Make sure we are not visiting a leaf or the root.
if(length(node$partitions) > 1 && node$root_distance > 0) {
# Update all combinations for pairs of leaves from different groups.
num_groups <- length(node$partitions)
for(group_a in 1:(num_groups-1)) {
for(group_b in (group_a+1):num_groups) {
updateDistancesWithCombinations(length_root_distances, topological_root_distances, node$partitions[[group_a]],
node$partitions[[group_b]], index_offsets, node$root_distance, node$edges_to_root)
}
}
}
})
# make tip label vector
tips_in_order <- tree$tip.label[order(tree$tip.label)]
label_pairs <- t(combn(tips_in_order,2, fun=c(), simplify=TRUE))
#self_pairs <- rbind(tips_in_order,tips_in_order)
tipsMRCAvec <- cbind(label_pairs,topological_root_distances[1:(num_leaves*(num_leaves-1)/2)])
colnames(tipsMRCAvec) <- c("tip1","tip2","rootdist")
return(tipsMRCAvec)
}
|