File: tokenizer.py

package info (click to toggle)
python-efilter 1.5-2.1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 596 kB
  • sloc: python: 4,342; makefile: 51
file content (365 lines) | stat: -rw-r--r-- 13,773 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# EFILTER Forensic Query Language
#
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module implements a reusable expression tokenizer.
"""

__author__ = "Adam Sindelar <adamsh@google.com>"


import collections
import re
import six

from efilter import errors

from efilter.parsers.common import grammar


class Pattern(collections.namedtuple("Pattern",
                                     "name states regex action next_state")):
    """A single tokenizer rule: a regex, the states it applies in, a handler.

    Fields:
        name: Emitted tokens will be labeled with this name.
        states: The pattern is only tried while the tokenizer is in one of
            these states.
        regex: Source regular expression; compiled eagerly at construction.
            If it defines a named group 'token', that group's match becomes
            Token.value.
        action: Name of the tokenizer method to invoke on a match.
        next_state: State pushed onto the state stack after a match, or None
            to stay in the current state.
    """

    def __new__(cls, name, states, regex, action, next_state):
        # Compile once up front so matching in the tokenizer's hot loop
        # never recompiles. DOTALL/M/S/U mirror the grammar's expectations.
        compiled = re.compile(regex, re.DOTALL | re.M | re.S | re.U)
        return super(Pattern, cls).__new__(cls, name, states, compiled,
                                           action, next_state)

class LazyTokenizer(object):
    """Configurable tokenizer usable with most expression grammars.

    This class is directly usable, and will, by default, produce tokens for
    string and number literals, parens, brackets, commas and words (symbols).

    Notes on performance:
        The runtime complexity grows with the number of patterns (m) and the
        number of tokens (n) in source. It is O(n*m) in the worst case.

        The tokenizer is lazy, and uses a deque to cache parsed tokens which
        haven't been skipped yet. When using 'peek' without 'skip' all tokens
        have to be cached, and this leads to O(n) memory complexity!

    Extending the tokenizer:
        This class is capable of tokenizing most any sane expression language,
        but can be further extended to (1) yield more specific token names for
        certain grammar (e.g. distinguishing between symbols and operators),
        as well as (2) supporting further tokens, such as curly braces.

        In the majority of cases, adding more patterns will be sufficient. For
        example, to support curly braces, one would add the following to
        DEFAULT_PATTERNS:

            Pattern("lbrace", # Give the token a new name.
                    ("INITIAL",), # Match this only if you're not in a string.
                    r"(?P<token>\\{)", # The regex should match an lbrace, and
                                      # capture it in the group named 'token'.
                    "emit", # This will yield a Token(name='lbrace', value='{').
                    None, # Matching an lbrace doesn't change the state.
            ),
            Pattern("rbrace", ("INITIAL",), r"(?P<token>\\})", "emit", None)

        For more complex use cases, it may be necessary to implement additional
        actions, which are just instance methods. Take a look at how string
        literals are implemented (string_start, string_end) for an example.

    Built-in actions:
        emit: Will emit a token with the supplied name and value set to whatever
            the named match group 'token' contains.
        emit_param: The tokenizer will emit a parse-time parameter for
            interpolation by a parser. The parameter token can be indexed,
            keyed on a string, or both. Indexing happens automatically, starting
            from 0.
        emit_int: The tokenizer will emit an integer obtained by interpreting
            the matched substring as an integer in base 10.
        emit_hex: Same as 'emit_int' but base 16.
        emit_oct: Same as 'emit_int' but base 8.
        emit_float: Same as 'emit_int' but emits a base 10 float.
        string_end: Emits a token with the last matched string.

    Public interface:
        next_token: Returns the next token and advances the tokenizer.
        skip: Skips N tokens ahead, without returning them.
        peek: Looks ahead over N tokens WITHOUT advancing the tokenizer. This
            fills up the token lookahead queue with N tokens - avoid supplying
            large values of N.
        __iter__: Returns an iterator that doesn't advance the tokenizer (
            same as calling peek() with increasing values of N). This can fill
            up the token queue quickly and should not be the primary interface.

    Arguments:
        source: Source string that will be lexed.
        patterns (optional): Overrides self.DEFAULT_PATTERNS
    """

    # Used if no patterns are supplied to the constructor. Subclasses can
    # override.
    #
    # NOTE: Each Pattern's 'states' must be a TUPLE of state names. An
    # earlier revision passed ("INITIAL,") - the parenthesized STRING
    # "INITIAL," - which only worked by accidental substring containment in
    # _next_pattern's membership test.
    DEFAULT_PATTERNS = (
        # Parens/brackets and separators.
        Pattern("lparen", ("INITIAL",), r"(?P<token>\()", "emit", None),
        Pattern("rparen", ("INITIAL",), r"(?P<token>\))", "emit", None),
        Pattern("lbracket", ("INITIAL",), r"(?P<token>\[)", "emit", None),
        Pattern("rbracket", ("INITIAL",), r"(?P<token>\])", "emit", None),
        Pattern("comma", ("INITIAL",), r"(?P<token>,)", "emit", None),

        # Build-time parameters: '{name}', '{}' or '?'.
        Pattern("param", ("INITIAL",), r"\{(?P<token>[a-z_0-9]*)\}",
                "emit_param", None),
        Pattern("param", ("INITIAL",), r"(?P<token>\?)", "emit_param", None),

        # Numeric literals. Order matters: float before octal before hex
        # before plain decimal, so the most specific form wins.
        Pattern("literal", ("INITIAL",),
                r"(?P<token>\d+\.\d+)", "emit_float", None),
        Pattern("literal", ("INITIAL",), r"(?P<token>0\d+)", "emit_oct", None),
        # Only actual hex digits are accepted; the previous character class
        # ([0-9a-zA-Z]) let e.g. '0xzz' through, which then crashed in
        # int(..., 16) instead of tokenizing.
        Pattern("literal", ("INITIAL",),
                r"(?P<token>0x[0-9a-fA-F]+)", "emit_hex", None),
        Pattern("literal", ("INITIAL",), r"(?P<token>\d+)", "emit_int", None),

        # String literals. A quote pushes the matching string state; inside
        # it, content is accumulated until the closing quote pops the state.
        Pattern(None, ("INITIAL",), r"\"", "string_start", "STRING"),
        Pattern(None, ("INITIAL",), r"'", "string_start", "SQ_STRING"),

        Pattern("literal", ("STRING",), "\"", "string_end", None),
        Pattern(None, ("STRING",), r"\\(.)", "string_escape", None),
        Pattern(None, ("STRING",), r"([^\\\"]+)", "string_append", None),

        Pattern("literal", ("SQ_STRING",), "'", "string_end", None),
        Pattern(None, ("SQ_STRING",), r"\\(.)", "string_escape", None),
        Pattern(None, ("SQ_STRING",), r"([^\\']+)", "string_append",
                None),

        # Prefer to match symbols only as far as they go, should they be
        # followed by a literal with no whitespace in between.
        Pattern("symbol", ("INITIAL",), r"([a-zA-Z_][\w_]*)", "emit", None),

        # Special characters are also valid as a symbol, but we won't match
        # them eagerly so as to not swallow valid literals that follow.
        Pattern("symbol", ("INITIAL",), r"([-+*\/=~\.><\[\]!:]+)", "emit",
                None),

        # Whitespace is ignored (no action, no token).
        Pattern(None, ("INITIAL",), r"(\s+)", None, None),
    )

    # Escape sequences recognized inside string literals, mapped to the
    # characters they stand for. This replaces the Python 2-only
    # str.decode("string_escape") codec and behaves identically for the
    # sequences the old code guarded for ('\'', '"', r, n, b, t).
    ESCAPES = {"'": "'", "\"": "\"", "r": "\r", "n": "\n", "b": "\b",
               "t": "\t"}

    # Ordered instances of Pattern.
    patterns = None

    # A deque with tokens that have been parsed, but haven't been skipped yet.
    lookahead = None

    # The input string being tokenized.
    source = None

    # Stack of states, as determined by rules in 'patterns'.
    state_stack = None

    # The length of 'source'.
    limit = None

    # The string literal currently being accumulated.
    string = None

    def __init__(self, source, patterns=None):
        self.source = source
        self.state_stack = ["INITIAL"]
        self.current_token = None
        self._position = 0
        self.limit = len(source)
        self.lookahead = collections.deque()
        self._param_idx = 0  # Auto-index for anonymous '?' / '{}' params.

        self.patterns = patterns or self.DEFAULT_PATTERNS

        # Make sure current_token starts containing something.
        self.next_token()

    # API for the parser:

    @property
    def position(self):
        """Returns the logical position (unaffected by lookahead)."""
        if self.lookahead:
            return self.lookahead[0].start

        return self._position

    def __iter__(self):
        """Yield tokens from the current position WITHOUT advancing it.

        Tokens parsed during iteration are retained on the lookahead queue,
        so iterating far ahead grows memory linearly.
        """
        if self.current_token is not None:
            yield self.current_token

        for token in self.lookahead:
            yield token

        while True:
            token = self._parse_next_token()
            if not token:
                return

            self.lookahead.append(token)
            yield token

    def peek(self, steps=1):
        """Return the token 'steps' ahead, or None; doesn't advance."""
        try:
            tokens = iter(self)
            for _ in six.moves.range(steps):
                next(tokens)

            return next(tokens)
        except StopIteration:
            return None

    def skip(self, steps=1):
        """Skip ahead by 'steps' tokens."""
        for _ in six.moves.range(steps):
            self.next_token()

    def next_token(self):
        """Returns the next logical token, advancing the tokenizer."""
        if self.lookahead:
            self.current_token = self.lookahead.popleft()
            return self.current_token

        self.current_token = self._parse_next_token()
        return self.current_token

    # Implementation:

    def _pop_state(self, **_):
        """Pop the current state off the stack (e.g. on leaving a string)."""
        try:
            self.state_stack.pop()
        except IndexError:
            self._error("Pop state called on an empty stack.", self.position)

    def _parse_next_token(self):
        """Will parse patterns until it gets to the next token or EOF.

        Patterns with no action (e.g. whitespace) consume input and return
        None, hence the loop.
        """
        while self._position < self.limit:
            token = self._next_pattern()
            if token:
                return token

        return None

    def _next_pattern(self):
        """Parses the next pattern by matching each in turn.

        Returns:
            The token the matched pattern's action produced, or None if the
            action produced no token (whitespace, string fragments, ...).

        Raises:
            errors.EfilterParseError: If no pattern matches at this position.
        """
        current_state = self.state_stack[-1]
        position = self._position
        for pattern in self.patterns:
            if current_state not in pattern.states:
                continue

            m = pattern.regex.match(self.source, position)
            if not m:
                continue

            position = m.end()
            token = None

            if pattern.next_state:
                self.state_stack.append(pattern.next_state)

            if pattern.action:
                callback = getattr(self, pattern.action, None)
                if callback is None:
                    raise RuntimeError(
                        "No method defined for pattern action %s!" %
                        pattern.action)

                # Look the named group up by NAME. The previous code tested
                # '"token" in m.groups()', which inspects matched VALUES:
                # the named group was never used, and tokenizing the literal
                # word 'token' raised IndexError for nameless patterns.
                if "token" in m.groupdict():
                    value = m.group("token")
                else:
                    value = m.group(0)
                token = callback(string=value, match=m,
                                 pattern=pattern)

            self._position = position

            return token

        self._error("Don't know how to match next. Did you forget quotes?",
                    start=self._position, end=self._position + 1)

    def _error(self, message, start, end=None):
        """Raise a nice error, with the token highlighted."""
        raise errors.EfilterParseError(
            source=self.source, start=start, end=end, message=message)

    # Actions:

    def emit(self, string, match, pattern, **_):
        """Emits a token using the current pattern match and pattern label."""
        return grammar.Token(name=pattern.name, value=string,
                             start=match.start(), end=match.end())

    def emit_param(self, match, pattern, **_):
        """Emit a parameter token, auto-indexing anonymous parameters."""
        param_name = match.group(1)

        # '?' and '{}' take the next automatic index; '{123}' becomes the
        # integer 123; '{name}' stays a string key.
        if not param_name or param_name == "?":
            param_name = self._param_idx
            self._param_idx += 1
        elif param_name and re.match(r"^\d+$", param_name):
            param_name = int(param_name)

        return grammar.Token(name=pattern.name, value=param_name,
                             start=match.start(), end=match.end())

    def emit_int(self, string, match, pattern, **_):
        """Emit a base-10 integer literal token."""
        return grammar.Token(name=pattern.name, value=int(string),
                             start=match.start(), end=match.end())

    def emit_oct(self, string, match, pattern, **_):
        """Emit an octal (base 8) integer literal token."""
        return grammar.Token(name=pattern.name, value=int(string, 8),
                             start=match.start(), end=match.end())

    def emit_hex(self, string, match, pattern, **_):
        """Emit a hexadecimal (base 16) integer literal token."""
        return grammar.Token(name=pattern.name, value=int(string, 16),
                             start=match.start(), end=match.end())

    def emit_float(self, string, match, pattern, **_):
        """Emit a base-10 float literal token."""
        return grammar.Token(name=pattern.name, value=float(string),
                             start=match.start(), end=match.end())

    # String parsing:

    def string_start(self, match, **_):
        """Begin accumulating a string literal at the opening quote."""
        self.string = ""
        self.string_position = match.start()

    def string_escape(self, string, match, **_):
        """Expand a backslash escape inside a string literal.

        'string' is the full match (backslash + character); group(1) is the
        escaped character. Known escapes are translated via ESCAPES; anything
        else is appended verbatim, backslash included.

        Fix: previously this used str.decode("string_escape"), a Python
        2-only codec that raises AttributeError on Python 3.
        """
        self.string += self.ESCAPES.get(match.group(1), string)

    def string_append(self, string="", **_):
        """Append a run of plain (non-escape, non-quote) characters."""
        self.string += string

    def string_end(self, pattern, match, **_):
        """Close the string literal and emit it as a single token."""
        self._pop_state()
        return grammar.Token(name=pattern.name, value=self.string,
                             start=self.string_position, end=match.end())