1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
|
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------
from collections import defaultdict
import numpy as np
from skbio.tree import TreeNode
from skbio.util._decorator import experimental
def _walk_clades(trees, weights):
"""Walk all the clades of all the trees
Parameters
----------
trees : list of TreeNode
The trees to walk
weights : np.array
Tree weights
Returns
-------
list of tuple
The clades and support values sorted by support value such that the
most supported clade is index 0. The tuples are of the form:
(frozenset, float).
defaultdict(float)
The edge lengths, keyed by frozenset of the clade, and valued by the
weighted average length of the clade by the trees the clade was
observed in.
"""
clade_counts = defaultdict(float)
edge_lengths = defaultdict(float)
total = weights.sum()
# get clade counts
def tipnames_f(n):
return [n.name] if n.is_tip() else []
for tree, weight in zip(trees, weights):
tree.cache_attr(tipnames_f, 'tip_names', frozenset)
for node in tree.postorder():
tip_names = node.tip_names
# if node.length is not None, fetch it and weight it
length = node.length * weight if node.length is not None else None
clade_counts[tip_names] += weight
if length is None:
edge_lengths[tip_names] = None
else:
edge_lengths[tip_names] += length / total
# sort clades by number times observed
clade_counts = sorted(clade_counts.items(), key=lambda x: len(x[0]),
reverse=True)
return clade_counts, edge_lengths
def _filter_clades(clade_counts, cutoff_threshold):
"""Filter clades that not well supported or are contradicted
Parameters
----------
clade_counts : list of tuple
Where the first element in each tuple is the frozenset of the clade,
and the second element is the support value. It is expected that this
list is sorted by descending order by support.
cutoff_threshold : float
The minimum weighted observation count that a clade must have to be
considered supported.
Returns
-------
dict
A dict of the accepted clades, keyed by the frozenset of the clade and
valued by the support value.
"""
accepted_clades = {}
for clade, count in clade_counts:
conflict = False
if count <= cutoff_threshold:
continue
if len(clade) > 1:
# check the current clade against all the accepted clades to see if
# it conflicts. A conflict is defined as:
# 1. the clades are not disjoint
# 2. neither clade is a subset of the other
for accepted_clade in accepted_clades:
intersect = clade.intersection(accepted_clade)
subset = clade.issubset(accepted_clade)
superset = clade.issuperset(accepted_clade)
if intersect and not (subset or superset):
conflict = True
if conflict is False:
accepted_clades[clade] = count
return accepted_clades
def _build_trees(clade_counts, edge_lengths, support_attr, tree_node_class):
"""Construct the trees with support
Parameters
----------
clade_counts : dict
Keyed by the frozenset of the clade and valued by the support
edge_lengths : dict
Keyed by the frozenset of the clade and valued by the weighted length
support_attr : str
The name of the attribute to hold the support value
tree_node_class : type
Specifies type of consensus trees that are returned. Either
``TreeNode`` or a type that implements the same interface (most
usefully, a subclass of ``TreeNode``).
Returns
-------
list of tree_node_class instances
A list of the constructed trees
"""
nodes = {}
queue = [(len(clade), clade) for clade in clade_counts]
while queue:
# The values within the queue are updated on each iteration, so it
# doesn't look like an insertion sort will make sense unfortunately
queue.sort()
(clade_size, clade) = queue.pop(0)
new_queue = []
# search for ancestors of clade
for (_, ancestor) in queue:
if clade.issubset(ancestor):
# update ancestor such that, in the following example:
# ancestor == {1, 2, 3, 4}
# clade == {2, 3}
# new_ancestor == {1, {2, 3}, 4}
new_ancestor = (ancestor - clade) | frozenset([clade])
# update references for counts and lengths
clade_counts[new_ancestor] = clade_counts.pop(ancestor)
edge_lengths[new_ancestor] = edge_lengths.pop(ancestor)
ancestor = new_ancestor
new_queue.append((len(ancestor), ancestor))
# if the clade is a tip, then we have a name
if clade_size == 1:
name = list(clade)[0]
else:
name = None
# the clade will not be in nodes if it is a tip
children = [nodes.pop(c) for c in clade if c in nodes]
length = edge_lengths[clade]
node = tree_node_class(children=children, length=length, name=name)
setattr(node, support_attr, clade_counts[clade])
nodes[clade] = node
queue = new_queue
return list(nodes.values())
@experimental(as_of="0.4.0")
def majority_rule(trees, weights=None, cutoff=0.5, support_attr='support',
tree_node_class=TreeNode):
r"""Determines consensus trees from a list of rooted trees
Parameters
----------
trees : list of TreeNode
The trees to operate on
weights : list or np.array of {int, float}, optional
If provided, the list must be in index order with `trees`. Each tree
will receive the corresponding weight. If omitted, all trees will be
equally weighted.
cutoff : float, 0.0 <= cutoff <= 1.0, optional
Any clade that has <= cutoff support will be dropped. If cutoff is
< 0.5, then it is possible that ties will result. If so, ties are
broken arbitrarily depending on list sort order.
support_attr : str, optional
The attribute to be decorated onto the resulting trees that contain the
consensus support.
tree_node_class : type, optional
Specifies type of consensus trees that are returned. Either
``TreeNode`` (the default) or a type that implements the same interface
(most usefully, a subclass of ``TreeNode``).
Returns
-------
list of tree_node_class instances
Each tree will be of type `tree_node_class`. Multiple trees can be
returned in the case of two or more disjoint sets of tips represented
on input.
Notes
-----
This code was adapted from PyCogent's majority consensus code originally
written by Matthew Wakefield. The method is based off the original
description of consensus trees in [1]_. An additional description can be
found in the Phylip manual [2]_. This method does not support majority rule
extended.
Support is computed as a weighted average of the tree weights in which the
clade was observed in. For instance, if {A, B, C} was observed in 5 trees
all with a weight of 1, its support would then be 5.
References
----------
.. [1] Margush T, McMorris FR. (1981) "Consensus n-trees." Bulletin for
Mathematical Biology 43(2) 239-44.
.. [2] http://evolution.genetics.washington.edu/phylip/doc/consense.html
Examples
--------
Computing the majority consensus, using the example from the Phylip manual
with the exception that we are computing majority rule and not majority
rule extended.
>>> from skbio.tree import TreeNode
>>> from io import StringIO
>>> trees = [
... TreeNode.read(StringIO("(A,(B,(H,(D,(J,(((G,E),(F,I)),C))))));")),
... TreeNode.read(StringIO("(A,(B,(D,((J,H),(((G,E),(F,I)),C)))));")),
... TreeNode.read(StringIO("(A,(B,(D,(H,(J,(((G,E),(F,I)),C))))));")),
... TreeNode.read(StringIO("(A,(B,(E,(G,((F,I),((J,(H,D)),C))))));")),
... TreeNode.read(StringIO("(A,(B,(E,(G,((F,I),(((J,H),D),C))))));")),
... TreeNode.read(StringIO("(A,(B,(E,((F,I),(G,((J,(H,D)),C))))));")),
... TreeNode.read(StringIO("(A,(B,(E,((F,I),(G,(((J,H),D),C))))));")),
... TreeNode.read(StringIO("(A,(B,(E,((G,(F,I)),((J,(H,D)),C)))));")),
... TreeNode.read(StringIO("(A,(B,(E,((G,(F,I)),(((J,H),D),C)))));"))]
>>> consensus = majority_rule(trees, cutoff=0.5)[0]
>>> for node in sorted(consensus.non_tips(),
... key=lambda k: k.count(tips=True)):
... support_value = node.support
... names = ' '.join(sorted(n.name for n in node.tips()))
... print("Tips: %s, support: %s" % (names, support_value))
Tips: F I, support: 9.0
Tips: D H J, support: 6.0
Tips: C D H J, support: 6.0
Tips: C D F G H I J, support: 6.0
Tips: C D E F G H I J, support: 9.0
Tips: B C D E F G H I J, support: 9.0
In the next example, multiple trees will be returned which can happen if
clades are not well supported across the trees. In addition, this can arise
if not all tips are present across all trees.
>>> trees = [
... TreeNode.read(StringIO("((a,b),(c,d),(e,f));")),
... TreeNode.read(StringIO("(a,(c,d),b,(e,f));")),
... TreeNode.read(StringIO("((c,d),(e,f),b);")),
... TreeNode.read(StringIO("(a,(c,d),(e,f));"))]
>>> consensus_trees = majority_rule(trees)
>>> len(consensus_trees)
4
"""
if weights is None:
weights = np.ones(len(trees), dtype=float)
else:
weights = np.asarray(weights)
if len(weights) != len(trees):
raise ValueError("Number of weights and trees differ.")
cutoff_threshold = cutoff * weights.sum()
clade_counts, edge_lengths = _walk_clades(trees, weights)
clade_counts = _filter_clades(clade_counts, cutoff_threshold)
trees = _build_trees(clade_counts, edge_lengths, support_attr,
tree_node_class)
return trees
|