File: parsimony.py

package info (click to toggle)
python-dendropy 4.2.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 68,392 kB
  • ctags: 3,947
  • sloc: python: 41,840; xml: 1,400; makefile: 15
file content (407 lines) | stat: -rw-r--r-- 15,022 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Models, modeling and model-fitting of parsimony.
"""

from functools import reduce
import operator
import dendropy
from dendropy.utility.error import TaxonNamespaceIdentityError

class _NodeStateSetMap(dict):
    def __init__(self, taxon_state_sets_map=None):
        self.taxon_state_sets_map = taxon_state_sets_map
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            v = self.taxon_state_sets_map[key.taxon]
            self[key] = v
            return v

def _store_sets_as_attr(n, state_sets_attr_name, v):
    setattr(n, state_sets_attr_name, v)

def _retrieve_state_sets_from_attr(n, state_sets_attr_name, taxon_state_sets_map):
    try:
        return getattr(n, state_sets_attr_name)
    except AttributeError:
        v = taxon_state_sets_map[n.taxon]
        setattr(n, state_sets_attr_name, v)
        return v

def fitch_down_pass(
        postorder_nodes,
        state_sets_attr_name="state_sets",
        taxon_state_sets_map=None,
        weights=None,
        score_by_character_list=None,
        ):
    """
    Returns the parsimony score given a list of nodes in postorder and
    associated states, using Fitch's (1971) unordered parsimony algorithm.

    Parameters
    ----------
    postorder_nodes : iterable of/over |Node| objects
        An iterable of |Node| objects in in order of post-order
        traversal of the tree.
    state_sets_attr_name : str
        Name of attribute on |Node| objects in which state set lists
        will stored/accessed. If |None|, then state sets will not be stored on
        the tree.
    taxon_state_sets_map : dict[taxon] = state sets
        A dictionary that takes a taxon object as a key and returns a state set
        list as a value. This will be used to populate the state set of a node
        that has not yet had its state sets scored and recorded (typically,
        leaves of a tree that has not yet been processed).
    weights : iterable
        A list of weights for each pattern.
    score_by_character_list : None or list
        If not |None|, should be a reference to a list object.
        This list will be populated by the scores on a character-by-character
        basis.

    Returns
    -------
    s : int
        Parismony score of tree.

    Notes
    -----
    Currently this requires a bifurcating tree (even at the root).

    Examples
    --------

    Assume that we have a tree, ``tree``, and an associated data set, ``data``::

        import dendropy
        from dendropy.model.parsimony import fitch_down_pass

        taxa = dendropy.TaxonNamespace()
        data = dendropy.StandardCharacterMatrix.get_from_path(
                "apternodus.chars.nexus",
                "nexus",
                taxon_namespace=taxa)
        tree = dendropy.Tree.get_from_path(
                "apternodus.tre",
                "nexus",
                taxon_namespace=taxa)
        taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)

    The following will return the parsimony score of the ``tree`` with
    respect to the data in ``data``::

        score = fitch_down_pass(
                nodes=tree.postorder_node_iter(),
                taxon_state_sets_map=taxon_set_map)
        print(score)

    In the above, every |Node| object of ``tree`` will have an attribute
    added, "state_sets", that stores the list of state sets from the analysis::

        for nd in tree:
            print(nd.state_sets)

    If you want to store the list of state sets in a different attribute, e.g.,
    "analysis1_states"::

        score = fitch_down_pass(
                nodes=tree.postorder_node_iter(),
                state_sets_attr_name="analysis1_states",
                taxon_state_sets_map=taxon_set_map)
        print(score)
        for nd in tree:
            print(nd.analysis1_states)

    Or not to store these at all::

        score = fitch_down_pass(
                nodes=tree.postorder_node_iter(),
                state_sets_attr_name=None,
                taxon_state_sets_map=taxon_set_map)
        print(score)

    Scoring custom data can be done by something like the following::

        taxa = dendropy.TaxonNamespace()
        taxon_state_sets_map = {}
        t1 = taxa.require_taxon("A")
        t2 = taxa.require_taxon("B")
        t3 = taxa.require_taxon("C")
        t4 = taxa.require_taxon("D")
        t5 = taxa.require_taxon("E")
        taxon_state_sets_map[t1] = [ set([0,1]),  set([0,1]),  set([0]),     set([0]) ]
        taxon_state_sets_map[t2] = [ set([1]),    set([1]),    set([1]),     set([0]) ]
        taxon_state_sets_map[t3] = [ set([0]),    set([1]),    set([1]),     set([0]) ]
        taxon_state_sets_map[t4] = [ set([0]),    set([1]),    set([0,1]),   set([1]) ]
        taxon_state_sets_map[t5] = [ set([1]),    set([0]),    set([1]),     set([1]) ]
        tree = dendropy.Tree.get_from_string(
                "(A,(B,(C,(D,E))));", "newick",
                taxon_namespace=taxa)
        score = fitch_down_pass(tree.postorder_node_iter(),
                taxon_state_sets_map=taxon_state_sets_map)
        print(score)

    """
    if score_by_character_list is not None:
        assert len(score_by_character_list) == 0
        for idx in range(len(list(taxon_state_sets_map.values())[0])): # this is unacceptable!
            score_by_character_list.append(0)
    score = 0
    if state_sets_attr_name is None:
        node_state_set_map = _NodeStateSetMap(taxon_state_sets_map)
        get_node_state_sets = lambda node : node_state_set_map[node]
        set_node_state_sets = lambda node, v : node_state_set_map.__setitem__(node, v)
    else:
        get_node_state_sets = lambda node : _retrieve_state_sets_from_attr(node, state_sets_attr_name, taxon_state_sets_map)
        set_node_state_sets = lambda node, v : _store_sets_as_attr(node, state_sets_attr_name, v)
    for nd in postorder_nodes:
        c = nd.child_nodes()
        if not c:
            ss = get_node_state_sets(nd)
            continue
        left_c, right_c = c[:2]
        remaining = c[2:]
        left_ssl = get_node_state_sets(left_c)
        while True:
            right_ssl = get_node_state_sets(right_c)
            result = []
            for n, ssp in enumerate(zip(left_ssl, right_ssl)):
                left_ss, right_ss = ssp
                inter = left_ss.intersection(right_ss)
                if inter:
                    result.append(inter)
                else:
                    if weights is None:
                        wt = 1
                    else:
                        wt = weights[n]
                    score += wt
                    result.append(left_ss.union(left_ss, right_ss))
                    if score_by_character_list is not None:
                        # try:
                        #     score_by_character_list[n] += wt
                        # except IndexError:
                        #     score_by_character_list.append(wt)
                        score_by_character_list[n] += wt
            if remaining:
                right_c = remaining.pop(0)
                left_ssl = result
            else:
                break
        # setattr(nd, state_sets_attr_name, result)
        set_node_state_sets(nd, result)
    return score

def fitch_up_pass(
        preorder_node_list,
        state_sets_attr_name="state_sets",
        taxon_state_sets_map=None):
    """
    Finalizes the state set lists associated with each node using the "final
    phase" of Fitch's (1971) unordered parsimony algorithm.

    Parameters
    ----------
    postorder_nodes : iterable of/over |Node| objects
        An iterable of |Node| objects in in order of post-order
        traversal of the tree.
    state_sets_attr_name : str
        Name of attribute on |Node| objects in which state set lists
        will stored/accessed. If |None|, then state sets will not be stored on
        the tree.
    taxon_state_sets_map : dict[taxon] = state sets
        A dictionary that takes a taxon object as a key and returns a state set
        list as a value. This will be used to populate the state set of a node
        that has not yet had its state sets scored and recorded (typically,
        leaves of a tree that has not yet been processed).

    Notes
    -----
    Currently this requires a bifurcating tree (even at the root).

    Examples
    --------

    ::

        taxa = dendropy.TaxonNamespace()
        data = dendropy.StandardCharacterMatrix.get_from_path(
                "apternodus.chars.nexus",
                "nexus",
                taxon_namespace=taxa)
        tree = dendropy.Tree.get_from_path(
                "apternodus.tre",
                "nexus",
                taxon_namespace=taxa)
        taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)
        score = fitch_down_pass(tree.postorder_node_iter(),
                taxon_state_sets_map=taxon_state_sets_map)
        print(score)
        fitch_up_pass(tree.preorder_node_iter())
        for nd in tree:
            print(nd.state_sets)

    """
    node_state_sets_map = {}
    for nd in preorder_node_list:
        c = nd.child_nodes()
        p = nd.parent_node
        if (not c) or (not p):
            continue
        assert(len(c) == 2)
        left_c, right_c = c
        try:
            left_ssl = getattr(left_c, state_sets_attr_name)
        except AttributeError:
            if not taxon_state_sets_map:
                raise
            left_ssl = taxon_state_sets_map[left_c.taxon]
        try:
            right_ssl = getattr(right_c, state_sets_attr_name)
        except AttributeError:
            if not taxon_state_sets_map:
                raise
            right_ssl = taxon_state_sets_map[right_c.taxon]
        par_ssl = getattr(p, state_sets_attr_name)
        curr_ssl = getattr(nd, state_sets_attr_name)
        result = []
        for n, ssp in enumerate(zip(par_ssl, curr_ssl, left_ssl, right_ssl)):
            par_ss, curr_ss, left_ss, right_ss = ssp

            down_parup_inter = par_ss.intersection(curr_ss)
            if down_parup_inter == par_ss:
                final_ss = down_parup_inter
            else:
                rl_inter = left_ss.intersection(right_ss)
                if not rl_inter:
                    final_ss = par_ss.union(curr_ss)
                else:
                    in_par_and_left = par_ss.intersection(left_ss)
                    in_par_and_right = par_ss.intersection(right_ss)
                    final_ss = in_par_and_left.union(in_par_and_right, curr_ss)
            #_LOG.debug("downpass = %s, par = %s, left = %s, right = %s, final_ss= %s" %
            #                    (str(curr_ss), str(par_ss), str(left_ss), str(right_ss), str(final_ss)))
            result.append(final_ss)
        setattr(nd, state_sets_attr_name, result)


def parsimony_score(
        tree,
        chars,
        gaps_as_missing=True,
        weights=None,
        score_by_character_list=None,
        ):
    """
    Calculates the score of a tree, ``tree``, given some character data,
    ``chars``, under the parsimony model using the Fitch algorithm.

    Parameters
    ----------
    tree : a |Tree| instance
        A |Tree| to be scored. Must reference the same |TaxonNamespace| as
        ``chars``.
    chars : a |CharacterMatrix| instance
        A |CharacterMatrix|-derived object with data to be scored. Must have
        the same |TaxonNamespace| as ``tree``.
    gap_as_missing : bool
        If |True| [default], then gaps will be treated as missing data.
        If |False|, then gaps will be treated as a new/additional state.
    weights : iterable
        A list of weights for each pattern/column in the matrix.
    score_by_character_list : None or list
        If not |None|, should be a reference to a list object.
        This list will be populated by the scores on a character-by-character
        basis.

    Returns
    -------
    pscore : int
        The parsimony score of the tree given the data.

    Examples
    --------

    ::

        import dendropy
        from dendropy.calculate import treescore

        # establish common taxon namespace
        taxon_namespace = dendropy.TaxonNamespace()

        # Read data; if data is, e.g., "standard", use StandardCharacterMatrix.
        # If unsure of data type, can do:
        #       dataset = dendropy.DataSet.get(
        #               path="path/to/file.nex",
        #               schema="nexus",
        #               taxon_namespace=tns,)
        #       chars = dataset.char_matrices[0]
        chars = dendropy.DnaCharacterMatrix.get(
                path="pythonidae.chars.nexus",
                schema="nexus",
                taxon_namespace=taxon_namespace)
        tree = dendropy.Tree.get(
                path="pythonidae.mle.newick",
                schema="newick",
                taxon_namespace=taxon_namespace)

        # We store the site-specific scores here
        # This is optional; if we do not want to
        # use the per-site scores, just pass in |None|
        # for the ``score_by_character_list`` argument
        # or do not specify this argument at all.
        score_by_character_list = []

        score = treescore.parsimony_score(
                tree,
                chars,
                gaps_as_missing=False,
                score_by_character_list=score_by_character_list)

        # Print the results: the score
        print("Score: {}".format(score))

        # Print the results: the per-site scores
        for idx, x in enumerate(score_by_character_list):
            print("{}: {}".format(idx+1, x))

    Notes
    -----

    If the same data is going to be used to score multiple trees or multiple times,
    it is probably better to generate the 'taxon_state_sets_map' once and call
    "fitch_down_pass" directly yourself, as this function generates a new map
    each time.

    """
    if tree.taxon_namespace is not chars.taxon_namespace:
        raise TaxonNamespaceIdentityError(tree, data)
    taxon_state_sets_map = chars.taxon_state_sets_map(gaps_as_missing=gaps_as_missing)
    nodes = tree.postorder_node_iter()
    pscore = fitch_down_pass(nodes,
            taxon_state_sets_map=taxon_state_sets_map,
            weights=weights,
            score_by_character_list=score_by_character_list)
    return pscore