File: tokenizer.py

#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

import sys
from dendropy.utility import error

##############################################################################
## Tokenizer

class Tokenizer(object):
    """
    Stream tokenizer.
    """

    class TokenizerError(error.DataParseError):

        def __init__(self,
                message=None,
                line_num=None,
                col_num=None,
                stream=None):
            error.DataParseError.__init__(self,
                    message=message,
                    line_num=line_num,
                    col_num=col_num,
                    stream=stream)

    class UnterminatedQuoteError(TokenizerError):

        def __init__(self,
                quote_char=None,
                line_num=None,
                col_num=None,
                stream=None):
            Tokenizer.TokenizerError.__init__(self,
                    message="Unterminated quote: {}".format(quote_char),
                    line_num=line_num,
                    col_num=col_num,
                    stream=stream)

    class UnexpectedEndOfStreamError(TokenizerError):

        def __init__(self,
                message=None,
                line_num=None,
                col_num=None,
                stream=None):
            Tokenizer.TokenizerError.__init__(self,
                    message=message,
                    line_num=line_num,
                    col_num=col_num,
                    stream=stream)

    def __init__(self,
            src,                        # source stream
            uncaptured_delimiters,      # delimiters between tokens (not returned)
            captured_delimiters,        # delimiters between tokens (returned as tokens)
            quote_chars,                # characters enclosing literals
            escape_quote_by_doubling,   # should two consecutive quote characters indicate a literal character (rather than a quote)?
            escape_chars,               # characters indicating beginning of escaped character
            comment_begin,              # string indicating beginning of comment
            comment_end,                # string indicating end of comment
            capture_comments,           # are comments to be stored?
            preserve_unquoted_underscores,       # are unquoted underscores to be preserved?
            ):
        # Tokenizer behavior customization
        self.uncaptured_delimiters = uncaptured_delimiters
        self.captured_delimiters = captured_delimiters
        self.quote_chars = quote_chars
        self.escape_quote_by_doubling = escape_quote_by_doubling
        self.escape_chars = escape_chars
        self.comment_begin = comment_begin
        self.comment_end = comment_end
        self.capture_comments = capture_comments
        self.preserve_unquoted_underscores = preserve_unquoted_underscores

        # State (internals)
        self.src = src
        self._cur_char = None
        self.current_token = None
        self.is_token_quoted = False

        # Meta-information
        self.captured_comments = []
        self.current_line_num = 1
        self.current_column_num = 0
        self.token_line_num = 0
        self.token_column_num = 0

    def reset(self):
        self.set_stream(src=None)

    def set_stream(self, src=None):
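        """
        Binds the tokenizer to a new source stream and resets all
        tokenization state and meta-information.
        """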
        self.src = src
        self._cur_char = None
        self.current_token = None
        self.is_token_quoted = False
        self.captured_comments = []
        self.current_line_num = 1
        self.current_column_num = 0
        self.token_line_num = 0
        self.token_column_num = 0

    def is_eof(self):
        return self._cur_char == ""

    def has_captured_comments(self):
        return len(self.captured_comments) > 0

    def next_token(self):
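        """
        Returns the next token, or None if the stream is exhausted.
        """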
        try:
            t = self.__next__()
            return t
        except StopIteration:
            self.current_token = None
            return None

    def require_next_token(self):
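        """
        Returns the next token, raising Tokenizer.UnexpectedEndOfStreamError
        (rather than StopIteration) if the stream is exhausted.
        """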
        try:
            t = self.__next__()
            return t
        except StopIteration:
            # In Python 3, raising a new exception while another is being
            # handled implicitly chains the original exception to the new
            # one, so the StopIteration would be reported as the context
            # of the error raised here.
            # In Python 3.3 and later, that chaining can be suppressed
            # with:
            #
            #   raise Tokenizer.UnexpectedEndOfStreamError(
            #                   message="Unexpected end of stream",
            #                   line_num=self.current_line_num,
            #                   col_num=self.current_column_num,
            #                   stream=self.src) from None
            #
            # To accommodate earlier Python 3 versions as well, the
            # following is used instead:
            exc = Tokenizer.UnexpectedEndOfStreamError(
                            message="Unexpected end of stream",
                            line_num=self.current_line_num,
                            col_num=self.current_column_num,
                            stream=self.src)
            exc.__context__ = None # Python 3.0, 3.1, 3.2
            exc.__cause__ = None # Python 3.3, 3.4
            raise exc

    def clear_captured_comments(self):
        del self.captured_comments[:]

    def pull_captured_comments(self):
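        """
        Returns the comments captured so far and clears the internal store,
        or returns None if no comments have been captured.
        """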
        if not self.captured_comments:
            return None
        c = self.captured_comments[:]
        del self.captured_comments[:]
        return c

    def __iter__(self):
        return self

    def __next__(self):
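        """
        Returns the next token in the stream, skipping uncaptured delimiters
        and comments. Captured delimiters are returned as single-character
        tokens; quoted tokens are returned with their enclosing quote
        characters stripped. Raises StopIteration at end of stream.
        """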
        self.is_token_quoted = False
        if self._cur_char is None:
            self._get_next_char()
        self._skip_to_significant_char()
        if self._cur_char == "":
            raise StopIteration
        if self._cur_char in self.captured_delimiters:
            self.current_token = self._cur_char
            self.token_line_num = self.current_line_num
            self.token_column_num = self.current_column_num
            self._get_next_char()
            return self.current_token
        elif self._cur_char in self.quote_chars:
            self.token_line_num = self.current_line_num
            self.token_column_num = self.current_column_num
            dest = []
            self.is_token_quoted = True
            cur_quote_char = self._cur_char
            self._get_next_char()
            while True:
                if self._cur_char == "":
                    raise Tokenizer.UnterminatedQuoteError(
                            quote_char=cur_quote_char,
                            line_num=self.current_line_num,
                            col_num=self.current_column_num,
                            stream=self.src)
                if self._cur_char == cur_quote_char:
                    self._get_next_char()
                    if self.escape_quote_by_doubling:
                        if self._cur_char == cur_quote_char:
                            # dest.write(cur_quote_char)
                            dest.append(cur_quote_char)
                            self._get_next_char()
                        else:
                            break
                    else:
                        # The closing quote has already been consumed above.
                        break
                else:
                    # dest.write(self._cur_char)
                    dest.append(self._cur_char)
                    self._get_next_char()
            # self.current_token = dest.getvalue()
            self.current_token = "".join(dest)
            return self.current_token
        else:
            # unquoted
            self.token_line_num = self.current_line_num
            self.token_column_num = self.current_column_num
            dest = []
            self.is_token_quoted = False
            while self._cur_char != "":
                if self._cur_char in self.uncaptured_delimiters:
                    self._get_next_char()
                    break
                elif self._cur_char in self.captured_delimiters:
                    break
                elif self._cur_char in self.comment_begin:
                    self._handle_comment()
                    if self._cur_char == "":
                        break
                else:
                    if self._cur_char == "_" and not self.preserve_unquoted_underscores:
                        self._cur_char = " "
                    dest.append(self._cur_char)
                    self._get_next_char()
            # self.current_token = dest.getvalue()
            self.current_token = "".join(dest)
            if self.current_token == "":
                if self._cur_char != "":
                    self.__next__()
                else:
                    raise StopIteration
            return self.current_token
    next = __next__ # Python 2 legacy support

    def _skip_to_significant_char(self):
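        """
        Advances the stream past any uncaptured delimiters (e.g., whitespace),
        stopping at the first significant character or at the end of the
        stream.
        """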
        if self._cur_char == "":
            return
        if self._cur_char is None:
            self._get_next_char()
        if self._cur_char not in self.uncaptured_delimiters:
            return
        while self._cur_char != "" and self._cur_char in self.uncaptured_delimiters:
            self._get_next_char()
        return

    def _get_next_char(self):
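        """
        Reads a single character from the stream into ``self._cur_char``,
        updating the current line and column numbers. At the end of the
        stream, ``self._cur_char`` is set to the empty string.
        """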
        self._cur_char = self.src.read(1)
        if self._cur_char != "":
            if self._cur_char == "\n":
                self.current_line_num += 1
                self.current_column_num = 1
            else:
                # print("@@@ {}: {}".format(self.current_column_num, self._cur_char))
                self.current_column_num += 1
        return self._cur_char

    def _handle_comment(self):
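        """
        Consumes a (possibly nested) comment, from the current comment-begin
        character through its matching comment-end character. If
        ``capture_comments`` is True, the comment text (excluding the
        delimiter characters) is appended to ``self.captured_comments``.
        """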
        dest = []
        nesting = 0
        comment_complete = False
        while self._cur_char != "":
            if self._cur_char in self.comment_end:
                nesting -= 1
                if nesting <= 0:
                    comment_complete = True
                    self._get_next_char()
                    break
            elif self._cur_char in self.comment_begin:
                nesting += 1
            elif self.capture_comments:
                # dest.write(self._cur_char)
                dest.append(self._cur_char)
            self._get_next_char()
        if self.capture_comments:
            # self.captured_comments.append(dest.getvalue())
            self.captured_comments.append("".join(dest))
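

if __name__ == "__main__":
    # Minimal self-contained usage sketch, added for illustration (not part
    # of the upstream module). The settings below are example values for a
    # Newick-like input, not necessarily what DendroPy's parsers use.
    import io
    demo_src = io.StringIO("(A_1,'B 2',(C,D)E)[a comment];")
    demo_tokenizer = Tokenizer(
            src=demo_src,
            uncaptured_delimiters=" \t\n",
            captured_delimiters="(),;",
            quote_chars="'\"",
            escape_quote_by_doubling=True,
            escape_chars="",
            comment_begin="[",
            comment_end="]",
            capture_comments=True,
            preserve_unquoted_underscores=False)
    for token in demo_tokenizer:
        print(repr(token))
    # Text enclosed in the comment delimiters is not returned as a token;
    # it accumulates separately and can be retrieved (and cleared) with:
    print(demo_tokenizer.pull_captured_comments())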