File: newick.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (127 lines) | stat: -rw-r--r-- 3,679 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Support for parsing phylogenetic tree's in newick format.

TODO: Tree/Edge should be a generic data structure, not newick specific.
"""

from functools import total_ordering

from pyparsing import (
    alphas,
    CaselessLiteral,
    Combine,
    delimitedList,
    Forward,
    nums,
    Optional,
    QuotedString,
    Suppress,
    Word,
)

__all__ = ["Tree", "Edge", "NewickParser", "newick_parser"]


def indent(s):
    return "\n".join("    " + line for line in s.split("\n"))


def print_(p, s):
    print(p, type(s), s)
    return s


@total_ordering
class Tree:
    def __init__(self, label, edges=None):
        self.label = label
        self.edges = edges

    def pretty(self):
        if self.edges:
            return "Tree( '{}',\n{}\n)".format(self.label, indent("\n".join(repr(edge) for edge in self.edges)))
        else:
            return f"Tree( '{self.label}' )"

    def __lt__(self, other):
        return self.__dict__ < other.__dict__

    def __eq__(self, other):
        return self.__dict__ == other.__dict__

    def __repr__(self):
        return f"Tree( {repr(self.label)}, {repr(self.edges)} )"


@total_ordering
class Edge:
    def __init__(self, length, tip):
        self.length = length
        self.tip = tip

    def pretty(self):
        return f"Edge( {repr(self.length)}, \n{indent(repr(self.tip))}\n)"

    def __lt__(self, other):
        return self.__dict__ < other.__dict__

    def __eq__(self, other):
        return self.__dict__ == other.__dict__

    def __repr__(self):
        return f"Edge( {repr(self.length)}, {repr(self.tip)} )"


def create_parser():
    """
    Create a 'pyparsing' parser for newick format trees roughly based on the
    grammar here:
        http://evolution.genetics.washington.edu/phylip/newick_doc.html

    Problems:
        - Is a single leaf a valid tree?
        - Branch length on root? Doesn't make sense to me, and forces the root
          to be an edge.
    """
    # Basic tokens
    real = Combine(
        Word("+-" + nums, nums)
        + Optional("." + Optional(Word(nums)))
        + Optional(CaselessLiteral("E") + Word("+-" + nums, nums))
    )
    lpar = Suppress("(")
    rpar = Suppress(")")
    colon = Suppress(":")
    semi = Suppress(";")
    # Labels are either unquoted or single quoted, if unquoted underscores will be replaced with spaces
    quoted_label = QuotedString("'", None, "''").setParseAction(lambda s, l, t: t[0])
    simple_label = Word(alphas + nums + "_.").setParseAction(lambda s, l, t: t[0].replace("_", " "))
    label = quoted_label | simple_label
    # Branch length is a real number (note though that exponents are not in the spec!)
    branch_length = real.setParseAction(lambda s, l, t: float(t[0]))
    # Need to forward declare this due to circularity
    node_list = Forward()
    # A node might have an list of edges (for a subtree), a label, and/or a branch length
    node = (Optional(node_list, None) + Optional(label, "") + Optional(colon + branch_length, None)).setParseAction(
        lambda s, l, t: Edge(t[2], Tree(t[1] or None, t[0]))
    )
    node_list << (lpar + delimitedList(node) + rpar).setParseAction(lambda s, l, t: [t.asList()])
    # The root cannot have a branch length
    tree = (node_list + Optional(label, "") + semi).setParseAction(lambda s, l, t: Tree(t[1] or None, t[0]))
    # Return the outermost element
    return tree


class NewickParser:
    """
    Class wrapping a parser for building Trees from newick format strings
    """

    def __init__(self):
        self.parser = create_parser()

    def parse_string(self, s):
        return self.parser.parseString(s)[0]


newick_parser = NewickParser()