1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
|
#! /usr/bin/env python
##############################################################################
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.rst" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
"""
Statistics, metrics, measurements, and values calculated on (single) trees.
"""
import math
from dendropy.calculate import phylogeneticdistance
EULERS_CONSTANT = 0.5772156649015328606065120900824024310421
## legacy: will soon be deprecated
class PatrisiticDistanceMatrix(phylogeneticdistance.PhylogeneticDistanceMatrix):
def __init__(self, tree):
phylogeneticdistance.PhylogeneticDistanceMatrix.__init__(self)
self.compile_from_tree(tree=tree)
def patristic_distance(tree, taxon1, taxon2, is_bipartitions_updated=False):
"""
Given a tree with bipartitions encoded, and two taxa on that tree, returns the
patristic distance between the two. Much more inefficient than constructing
a PhylogeneticDistanceMatrix object.
"""
mrca = tree.mrca(taxa=[taxon1, taxon2], is_bipartitions_updated=is_bipartitions_updated)
dist = 0
n = tree.find_node(lambda x: x.taxon == taxon1)
while n != mrca:
if n.edge.length is not None:
dist += n.edge.length
n = n.parent_node
n = tree.find_node(lambda x: x.taxon == taxon2)
while n != mrca:
if n.edge.length is not None:
dist += n.edge.length
n = n.parent_node
return dist
###########################################################################
### Metrics -- Unary
def B1(tree):
"""
Returns the B1 statistic: the reciprocal of the sum of the maximum
number of nodes between each interior node and tip over all internal
nodes excluding root.
"""
b1 = 0.0
nd_mi = {}
for nd in tree.postorder_node_iter():
if nd._parent_node is None:
continue
child_nodes = nd._child_nodes
if len(child_nodes) == 0:
nd_mi[nd] = 0.0
continue
mi = max(nd_mi[ch] for ch in child_nodes)
mi += 1
nd_mi[nd] = mi
b1 += 1.0/mi
return b1
def colless_tree_imbalance(tree, normalize="max"):
"""
Returns Colless' tree imbalance or I statistic: the sum of differences
of numbers of children in left and right subtrees over all internal
nodes. ``normalize`` specifies the normalization:
* "max" or True [DEFAULT]
normalized to maximum value for tree of
this size
* "yule"
normalized to the Yule model
* "pda"
normalized to the PDA (Proportional to Distinguishable
Arrangements) model
* None or False
no normalization
"""
colless = 0.0
num_leaves = 0
subtree_leaves = {}
for nd in tree.postorder_node_iter():
if nd.is_leaf():
subtree_leaves[nd] = 1
num_leaves += 1
else:
total_leaves = 0
if len(nd._child_nodes) > 2:
raise TypeError("Colless' tree imbalance statistic requires strictly bifurcating trees")
left = subtree_leaves[nd._child_nodes[0]]
right = subtree_leaves[nd._child_nodes[1]]
colless += abs(right-left)
subtree_leaves[nd] = right + left
if normalize == "yule":
colless = float(colless - (num_leaves * math.log(num_leaves)) - (num_leaves * (EULERS_CONSTANT - 1.0 - math.log(2))))/num_leaves
elif normalize == "pda":
colless = colless / pow(num_leaves, 3.0/2)
elif normalize is True or normalize == "max":
## note that Mooers 1995 (Evolution 49(2):379-384)
## remarks that the correct normalization factor is
## 2/((num_leaves - 1) * (num_leaves -2))
colless = colless * (2.0/(num_leaves * (num_leaves-3) + 2))
elif normalize is not None and normalize is not False:
raise TypeError("``normalization`` accepts only None, True, False, 'yule' or 'pda' as argument values")
return colless
def pybus_harvey_gamma(tree, prec=0.00001):
"""Returns the gamma statistic of Pybus and Harvey (2000). This statistic
is used to test for constancy of birth and death rates over the course of
a phylogeny. Under the pure-birth process, the statistic should follow
a standard Normal distibution: a Normal(mean=0, variance=1).
If the lengths of different paths to the node differ by more than ``prec``,
then a ValueError exception will be raised indicating deviation from
ultrametricty.
Raises a Value Error if the tree is not ultrametric, is non-binary, or has
only 2 leaves.
As a side effect a ``age`` attribute is added to the nodes of the tree.
Pybus and Harvey. 2000. "Testing macro-evolutionary models using incomplete
molecular phylogenies." Proc. Royal Society Series B: Biological Sciences.
(267). 2267-2272
"""
# the equation is given by:
# T = \sum_{j=2}^n (jg_j)
# C = T \sqrt{\frac{1}{12(n-2)}}
# C gamma = \frac{1}{n-2}\sum_{i=2}^{n-1} (\sum_{k=2}^i kg_k) - \frac{T}{2}
# where n is the number of taxa, and g_2 ... g_n is the vector of waiting
# times between consecutive (in time, not along a branch) speciation times.
node = None
speciation_ages = []
n = 0
if tree.seed_node.age is None:
tree.calc_node_ages(ultrametricity_precision=prec)
for node in tree.postorder_node_iter():
if len(node.child_nodes()) == 2:
speciation_ages.append(node.age)
else:
n += 1
if node is None:
raise ValueError("Empty tree encountered")
speciation_ages.sort(reverse=True)
g = []
older = speciation_ages[0]
for age in speciation_ages[1:]:
g.append(older - age)
older = age
g.append(older)
if not g:
raise ValueError("No internal nodes found (other than the root)")
assert(len(g) == (n - 1))
T = 0.0
accum = 0.0
for i in range(2, n):
list_index = i - 2
T += i * float(g[list_index])
accum += T
list_index = n - 2
T += (n) * g[list_index]
nmt = n - 2.0
numerator = accum/nmt - T/2.0
C = T*pow(1/(12*nmt), 0.5)
return numerator/C
def N_bar(tree):
"""
Returns the $\bar{N}$ statistic: the average number of nodes above a
terminal node.
"""
leaf_count = 0
nbar = 0
for leaf_node in tree.leaf_node_iter():
leaf_count += 1
for parent in leaf_node.ancestor_iter(inclusive=False):
nbar += 1
return float(nbar) / leaf_count
def sackin_index(tree, normalize=True):
"""
Returns the Sackin's index: the sum of the number of ancestors for each
tip of the tree. The larger the Sackin's index, the less balanced the
tree. ``normalize`` specifies the normalization:
* True [DEFAULT]
normalized to number of leaves; this results in a value
equivalent to that given by Tree.N_bar()
* "yule"
normalized to the Yule model
* "pda"
normalized to the PDA (Proportional to Distinguishable
Arrangements) model
* None or False
no normalization
"""
leaf_count = 0
num_anc = 0
for leaf_node in tree.leaf_node_iter():
leaf_count += 1
for parent in leaf_node.ancestor_iter(inclusive=False):
num_anc += 1
if normalize == "yule":
x = sum(1.0/j for j in range(2, leaf_count+1))
s = float(num_anc - (2 * leaf_count * x))/leaf_count
elif normalize == "pda":
s = float(num_anc)/(pow(leaf_count, 3.0/2))
elif normalize is True:
s = float(num_anc)/leaf_count
elif normalize is None or normalize is False:
s = float(num_anc)
elif normalize is not None and normalize is not False:
raise TypeError("``normalization`` accepts only None, True, False, 'yule' or 'pda' as argument values")
return s
def treeness(tree):
"""
Returns the proportion of total tree length that is taken up by
internal branches.
"""
internal = 0.0
external = 0.0
for nd in tree.postorder_node_iter():
if not nd._parent_node:
continue
if nd.is_leaf():
external += nd.edge.length
else:
internal += nd.edge.length
return internal/(external + internal)
|