File: tree_space.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (252 lines) | stat: -rw-r--r-- 9,708 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#!/usr/bin/env python
from __future__ import division
import numpy
import itertools
from cogent.core.tree import TreeBuilder
from cogent.phylo.tree_collection import ScoredTreeCollection
from cogent.util import parallel, checkpointing, progress_display as UI

__author__ = "Peter Maxwell"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Peter Maxwell"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Peter Maxwell"
__email__ = "pm67nz@gmail.com"
__status__ = "Production"


def ismallest(data, size):
    """There are many ways to get the k smallest items from an N sequence, and
    which one performs best depends on k, N and k/N.  This algorithm appears to
    beat anything heapq can do, and stays with a factor of 2 of sort() and
    min().  Is uses memory O(2*k) and so is particularly suitable for lazy 
    application to large N.  It returns the smallest k sorted too."""
    limit = 2 * size
    data = iter(data)
    best = list(itertools.islice(data, limit))
    while True:
        best.sort()
        if len(best) <= size:
            break
        del best[size:]
        worst_of_best = best[-1]
        for item in data:
            if item < worst_of_best:
                best.append(item)
                if len(best) > limit:
                    break
    return best

# Trees are represented as "ancestry" matricies in which A[i,j] iff j is an
# ancestor of i.  For LS calculations the ancestry matrix is converted
# to a "paths" matrix or "split metric" in which S[p,j] iff the path between
# the pth pair of tips passes through edge j.  For ML calculations the
# ancestry matrix is converted back into an ordinary cogent tree object.

def tree2ancestry(tree, order=None):
    nodes = tree.unrooted().getEdgeVector()[:-1]
    if order is not None:
        lookup = dict([(k,i) for (i,k) in enumerate(order)])
        def _ordered_tips_first(n):
            if n.Children:
                return len(order)
            else:
                return lookup[n.Name]
        nodes.sort(key=_ordered_tips_first)

    n = len(nodes)
    A = numpy.zeros([n, n], int)
    seen = {}
    for (i, node) in enumerate(nodes):
        A[i, i] = 1
        seen[id(node)] = i
        for c in node.Children:
            A[:,i] |= A[:,seen[id(c)]]
    names = [n.Name for n in nodes if not n.Children]
    lengths = [n.Length for n in nodes]
    return (A, names, lengths)

def ancestry2tree(A, lengths, tip_names):
    """Convert edge x edge ancestry matrix to a cogent Tree object"""
    tips = {}
    tip = 0
    for i in range(len(A)):
        if numpy.sum(A[:,i]) == 1:
            tips[i] = tip_names[tip]
            tip += 1
    assert tip == len(tip_names)
    
    constructor = TreeBuilder().createEdge
    free = {}
    for i in numpy.argsort(numpy.sum(A, axis=0)):
        children = [j for j in range(len(A)) if A[j, i] and j != i]
        child_nodes = [free.pop(j) for j in children if j in free]
        if child_nodes:
            name = None
        else:
            name = tips[i]
        if lengths is None:
            params = {}
        else:
            params = {'length':lengths[i]}
        node = constructor(child_nodes, name, params)
        free[i] = node
    return constructor(free.values(), 'root', {})

def grown(B, split_edge):
    """Ancestry matrix 'B' with one extra leaf added at 'split_edge'.
    Row/column order within the matrix is independent of the topology it 
    represents. The added leaf will be the last one in the matrix, which keeps 
    the leaf node order the same as the order in which they are added, which is 
    what is assumed by ancestry2tree and ancestry2paths"""
    n = len(B)
    A = numpy.zeros([n+2, n+2], int)
    A[:n, :n] = B
    (sibling, parent) = (n, n + 1)
    A[sibling] = A[parent] = A[split_edge]
    A[:,parent] = A[:,split_edge]
    A[sibling,split_edge] = 0
    A[parent, split_edge] = 0
    A[sibling,sibling] = 1
    A[parent,parent] = 1
    A[sibling,parent] = 1
    A[split_edge,parent] = 1
    return A

class TreeEvaluator(object):
    """Subclass must provide makeTreeScorer and result2output"""
    
    def results2output(self, results):
        return ScoredTreeCollection(results)
        
    def evaluateTopology(self, tree):
        """Optimal (score, tree) for the one topology 'tree'"""
        (ancestry, names, lengths) = tree2ancestry(tree)
        evaluate = self.makeTreeScorer(names)
        (err, lengths) = evaluate(ancestry)
        return self.result2output(err, ancestry, lengths, names)
    
    def evaluateTree(self, tree):
        """score for 'tree' with lengths as-is"""
        (ancestry, names, lengths) = tree2ancestry(tree)
        evaluate = self.makeTreeScorer(names)
        (err, result) = evaluate(ancestry, lengths=lengths)
        return err
    
    def _consistentNameOrder(self, fixed_names, ordered_names=None):
        """fixed_names followed by ordered_names without duplicates"""
        all_names = set(self.names)

        fixed_names_set = set(fixed_names)
        assert fixed_names_set.issubset(all_names)

        if ordered_names:
            assert set(ordered_names).issubset(all_names)
        else:
            ordered_names = self.names
        names = list(fixed_names) + [n for n in ordered_names 
                if n not in fixed_names_set]
        return names
        
    @UI.display_wrap
    def trex(self, a=8, k=1000, start=None, order=None, return_all=False, 
            filename=None, interval=None, ui=None):
        """TrexML policy for tree sampling - all trees up to size 'a' and
        then keep no more than 'k' best trees at each tree size.
        'order' is an optional list of tip names.  
        'start' is an optional list of initial trees.  Each of the trees must
        contain the same tips.
        'filename' and 'interval' control checkpointing.
        
        Advanced step-wise addition algorithm
        M. J. Wolf, S. Easteal, M. Kahn, B. D. McKay, and L. S. Jermiin.
        Trexml: a maximum-likelihood approach for extensive tree-space
        exploration.
        Bioinformatics, 16(4):383 94, 2000."""
        
        checkpointer = checkpointing.Checkpointer(filename, interval)
        if checkpointer.available():
            (init_tree_size, fixed_names, trees) = checkpointer.load()
            names = self._consistentNameOrder(fixed_names, order)
        elif start is not None:
            if not isinstance(start, list):
                start = [start]
            fixed_names = start[0].getTipNames()
            names = self._consistentNameOrder(fixed_names, order)
            trees = []
            for tree in start:
                # check the start tree represents a subset of tips
                assert set(tree.getTipNames()) < set(self.names), \
                    "Starting tree names not a subset of the sequence names"
                
                (ancestry, fixed_names2, lengths) = tree2ancestry(
                        tree, order=fixed_names)
                assert fixed_names2 == fixed_names
                trees.append((None, None, ancestry))
            init_tree_size = len(fixed_names)
        else:
            trees = [(None, None, numpy.identity(3, int))]
            names = self._consistentNameOrder([], order)
            init_tree_size = 3
            
        tree_size = len(names)
        assert tree_size > 3
        if a > tree_size:
            a = tree_size
        if a < 4:
            a = 4

        # All trees of size a-1, no need to compare them
        for n in range(init_tree_size+1, a):
            trees2 = []
            for (err2, lengths2, ancestry) in trees:
                for split_edge in range(len(ancestry)):
                    ancestry2 = grown(ancestry, split_edge)
                    trees2.append((None, None, ancestry2))
            trees = trees2
            init_tree_size = n
        
        # Pre calculate how much work is to be done, for progress display
        tree_count = len(trees)
        total_work = 0
        work_done = [0] * (init_tree_size+1)
        for n in range(init_tree_size+1, tree_size+1):
            evals = tree_count * (n*2-5)
            total_work += evals * n
            tree_count = min(k, evals)
            work_done.append(total_work)
        
        # For each tree size, grow at each edge of each tree. Keep best k.
        for n in range(init_tree_size+1, tree_size+1):
            evaluate = self.makeTreeScorer(names[:n])            

            def grown_tree(spec):
                (tree_ordinal, tree, split_edge) = spec
                (old_err, old_lengths, old_ancestry) = tree
                ancestry = grown(old_ancestry, split_edge)
                (err, lengths) = evaluate(ancestry)
                return (err, tree_ordinal, split_edge, lengths, ancestry)
            
            specs = [(i, tree, edge) 
                        for (i,tree) in enumerate(trees) 
                        for edge in range(n*2-5)]

            candidates = ui.imap(grown_tree, specs, noun=('%s leaf tree' % n),
                start=work_done[n-1]/total_work, end=work_done[n]/total_work)
            
            best = ismallest(candidates, k)
            
            trees = [(err, lengths, ancestry) for (err, parent_ordinal, 
                    split_edge, lengths, ancestry) in best]
            
            checkpointer.record((n, names[:n], trees))
        
        results = (self.result2output(err, ancestry, lengths, names)
                    for (err, lengths, ancestry) in trees)
        if return_all:
            result = self.results2output(results)
        else:
            result = results.next()
        return result