File: grammar.py

package info (click to toggle)
python-efilter 1.5-2.1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 596 kB
  • sloc: python: 4,342; makefile: 51
file content (369 lines) | stat: -rw-r--r-- 12,055 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
# EFILTER Forensic Query Language
#
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module provides grammar primitives common across most parsers.

### What is a grammar?

In the EFILTER world, we use the word 'grammar' to mean a collection of
stateless functions that take an iterable of tokens and return a TokenMatch
if the tokens match the grammatical construct the function represents.

These functions are called 'grammar functions' (gasp!).

For example, a grammar function that matches a parenthesis would be:

def lparen(tokens):
    first_token = next(iter(tokens))
    if first_token.name == "lparen":
        return TokenMatch(operator=None, value=None, tokens=[first_token])

To make writing grammar functions easier, this module provides a number of
primitives that largely insulate the programmer from having to write all of
the above. The real world lparen function actually looks like this:

def lparen(tokens):
    return token_name(tokens, "lparen")

### Operators

A common theme across most grammars are operators, and this module provides
a convenient container to group grammar functions related to operators.

The 'Operator' container groups basic information about an operator, starting
with its name and docstring, its suffix, infix and prefix parts, and also an
AST construct that the operator maps onto.

For example, the addition operator would look like this:

plus = Operator(
    handler=ast.Sum,
    assoc="left",
    infix="+"
    ...)
"""

__author__ = "Adam Sindelar <adamsh@google.com>"

import collections
import itertools
import six


class Token(collections.namedtuple("Token", "name value start end")):
    """Represents one token, which is what grammars operate on.

    Arguments:
        name: The type of the token (e.g. "symbol", "lparen").
        value: The literal value the token matched.
        start: (OPTIONAL) Offset of the first character of the token.
        end: (OPTIONAL) Offset just past the last character of the token.
    """

    def __new__(cls, name, value, start=None, end=None):
        # start/end default to None so position-less tokens (e.g. keys in
        # lookup tables) can be built without fake offsets.
        return super(Token, cls).__new__(cls, name, value, start, end)

    def __repr__(self):
        return "Token(name='%s', value='%s', start=%d, end=%d)" % (
            self.name, self.value, self.start or 0, self.end or 0)

    def __eq__(self, other):
        """Tokens compare on name and value, not on position."""
        # Defer to the other operand for non-Tokens instead of blowing up
        # with an AttributeError on other.name (e.g. for 'token == None').
        if not isinstance(other, Token):
            return NotImplemented

        return (self.name, self.value) == (other.name, other.value)

    def __ne__(self, other):
        """Explicit __ne__ mirroring __eq__.

        Without this, the inherited tuple.__ne__ would compare all four
        fields, making '!=' disagree with '==' for tokens that differ only
        in start/end position.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result

        return not result

    def __hash__(self):
        """Tokens hash on name and value, not on position."""
        return hash((self.name, self.value))


class Operator(collections.namedtuple(
        "Operator",
        "name precedence assoc handler docstring prefix infix suffix")):
    """Describes one operator in a grammar, along with the tokens that form it.

    An operator is made up of up to three parts - a prefix, an infix and a
    suffix - each given as a Token (for example Token("keyword", "+", None)).
    At least one part must be supplied. No combination of parts is forbidden
    here, but individual parsers may reject some of them; DottySQL, for
    instance, cannot parse circumfix (prefix + suffix) operators.

    Historically DottySQL matched suffixes with dedicated grammar functions.
    That approach scales poorly with many operators, and since those
    functions almost always reduced to a _keyword call, they were replaced
    by lookups in the operator table keyed on these tokens.

    Arguments:
        name: The literal name of the operator, such as "+" or "not".
        precedence: Integer precedence with operators of the same arity.
        handler: Callable that emits AST for this operator.
        docstring: Documentation for the operator.
        assoc: Associativity - can be left or right for infix operators.
        suffix: (OPTIONAL) The token (not grammar function) of the suffix.
        prefix: (OPTIONAL) The token (not grammar function) of the prefix.
        infix: (OPTIONAL) The token (not grammar function) of the infix.
    """


class TokenLookupTable(object):
    """Ordered associative container where tokens are keys.

    Keys are single Tokens or tuples of Tokens. 'match' finds the longest
    inserted key that prefixes a supplied token stream.

    Public properties:
        case_sensitive (default False): If set to False, all lookups will be
            converted to lower case. NOTE: Does not affect insertion:
            case-insensitive grammar should insert operators in lower case.
    """
    _max_len = 1  # Length (in tokens) of the longest key inserted so far.
    _table = None  # Ordered dict keyed on tuples of tokens.

    # This affects only lookups, not insertion.
    case_sensitive = False

    def __init__(self, *entries):
        self._table = collections.OrderedDict()

        for tokens, entry in entries:
            self.set(tokens, entry)

    def set(self, tokens, entry):
        """Associate 'entry' with the Token (or tuple of Tokens) 'tokens'.

        Raises:
            TypeError: If 'tokens' is neither a Token nor a tuple.
            ValueError: If 'tokens' was already inserted.
        """
        if isinstance(tokens, Token):
            tokens = (tokens,)
        elif isinstance(tokens, tuple):
            self._max_len = max(self._max_len, len(tokens))
        else:
            raise TypeError(
                "TokenLookupTable only supports instances of Token or "
                "tuples thereof for keys. Got %r." % tokens)

        if tokens in self._table:
            raise ValueError("Duplicate token key %r for %r." % (
                tokens, entry))

        self._table[tokens] = entry

    def _normalize_token(self, token):
        # Lower-case string values for case-insensitive lookup. Non-string
        # values (e.g. numbers) pass through unchanged.
        if (isinstance(token.value, six.string_types)
                and not self.case_sensitive):
            return token._replace(value=token.value.lower())

        return token

    def match(self, tokens):
        """Match the longest inserted key that prefixes 'tokens'.

        Arguments:
            tokens: Iterable of Tokens. NOTE: must support repeated
                iteration from the start (e.g. a list or tuple), because
                each candidate length re-slices it from the beginning.

        Returns:
            A tuple of (entry, matched tokens), or (None, None) on no match.
        """
        # Try to match longest known match first.
        for match_len in range(self._max_len, 0, -1):
            needle = tuple(self._normalize_token(t)
                           for t in itertools.islice(tokens, match_len))
            # Test key containment rather than entry truthiness: an entry
            # that happens to be falsy (0, "", None) must still match.
            if needle in self._table:
                return self._table[needle], needle

        return None, None


class OperatorTable(object):
    """A complete set of operators in a grammar, keyed on their *fix tokens."""
    prefix = None  # TokenLookupTable keyed on prefix tokens.
    infix = None  # TokenLookupTable keyed on infix tokens.
    suffix = None  # TokenLookupTable keyed on suffix tokens.
    by_name = None  # Dict of operator name -> Operator.
    by_handler = None  # Dict of AST handler -> first Operator that uses it.

    def __init__(self, *operators):
        """Index 'operators' (Operator instances) for the various lookups.

        Raises:
            ValueError: If two operators share the same name.
        """
        self.prefix = TokenLookupTable()
        self.infix = TokenLookupTable()
        self.suffix = TokenLookupTable()
        self.by_name = dict()
        self.by_handler = dict()

        for operator in operators:
            if operator.name in self.by_name:
                # Fixed typo in the message ("Duplicit" -> "Duplicate").
                raise ValueError(
                    "Duplicate operator name %r." % operator.name)
            self.by_name[operator.name] = operator

            if operator.handler not in self.by_handler:
                # Multiple operators can have the same handler, in which case
                # they are probably aliases that mean the same thing. In that
                # case the first operator "wins" and will likely be what
                # the formatter for this syntax ends up using as default when
                # it formats this AST.
                self.by_handler[operator.handler] = operator

            # An operator can have multiple components, but it is only indexed
            # by the first one to prevent ambiguity.
            if operator.prefix:
                self.prefix.set(operator.prefix, operator)
            elif operator.infix:
                self.infix.set(operator.infix, operator)
            elif operator.suffix:
                self.suffix.set(operator.suffix, operator)


# Grammar primitives and helpers. (No grammar functions until the end of file.)

class TokenMatch(collections.namedtuple(
        "TokenMatch", "operator value tokens")):
    """One or more matched tokens, with optional operator and literal value.

    Arguments:
        operator: The Operator instance that matched, if any.
        value: The literal value that matched, if any.
        tokens: The actual tokens the match consumed.
    """

    @property
    def start(self):
        """Source offset where the first consumed token begins."""
        first_token = self.tokens[0]
        return first_token.start

    @property
    def end(self):
        """Source offset where the last consumed token ends."""
        last_token = self.tokens[-1]
        return last_token.end

    @property
    def first(self):
        """The first token consumed by this match."""
        return self.tokens[0]


def keyword(tokens, expected):
    """Case-insensitive keyword match.

    Matches when the next token is a "symbol" whose lowercased value equals
    'expected'. Returns a TokenMatch on success, None otherwise.
    """
    try:
        first = next(iter(tokens))
    except StopIteration:
        return None

    if not first or first.name != "symbol":
        return None

    if first.value.lower() == expected:
        return TokenMatch(None, first.value, (first,))


def multi_keyword(tokens, keyword_parts):
    """Match a case-insensitive keyword made up of several "symbol" tokens.

    Each element of 'keyword_parts' must match the next token in turn.
    Returns a TokenMatch over all consumed tokens (its value is the last
    part's literal value), or None on any mismatch.
    """
    stream = iter(tokens)
    consumed = []

    for expected_part in keyword_parts:
        token = next(stream, None)

        if not token or token.name != "symbol":
            return None

        if token.value.lower() != expected_part:
            return None

        consumed.append(token)

    return TokenMatch(None, token.value, consumed)


def keywords(tokens, expected):
    """Match against any of a set/dict of keywords.

    Note that this doesn't support multi-part keywords. Any multi-part
    keywords must be special-cased in their grammar function.
    """
    try:
        first = next(iter(tokens))
    except StopIteration:
        return None

    if not first or first.name != "symbol":
        return None

    if first.value.lower() in expected:
        return TokenMatch(None, first.value, (first,))


def prefix(tokens, operator_table):
    """Match the prefix part of any operator in 'operator_table'."""
    matched_operator, consumed = operator_table.prefix.match(tokens)
    if not matched_operator:
        return None

    return TokenMatch(matched_operator, None, consumed)


def infix(tokens, operator_table):
    """Match the infix part of any operator in 'operator_table'."""
    matched_operator, consumed = operator_table.infix.match(tokens)
    if not matched_operator:
        return None

    return TokenMatch(matched_operator, None, consumed)


def suffix(tokens, operator_table):
    """Match the suffix part of any operator in 'operator_table'."""
    matched_operator, consumed = operator_table.suffix.match(tokens)
    if not matched_operator:
        return None

    return TokenMatch(matched_operator, None, consumed)


def token_name(tokens, expected):
    """Match a token name (type).

    Returns a TokenMatch carrying the token's value if the next token's
    name equals 'expected', None otherwise.
    """
    try:
        first = next(iter(tokens))
    except StopIteration:
        return None

    if not first:
        return None

    if first.name == expected:
        return TokenMatch(None, first.value, (first,))


def match_tokens(expected_tokens):
    """Generate a grammar function that will match 'expected_tokens' only.

    'expected_tokens' is either a single Token or a tuple of Tokens. The
    returned function matches exactly that token (or run of tokens) at the
    head of a token stream.
    """
    if isinstance(expected_tokens, Token):
        # Single-token matcher. Token equality ignores source positions.
        def _grammar_func(tokens):
            try:
                candidate = next(iter(tokens))
            except StopIteration:
                return None

            if candidate == expected_tokens:
                return TokenMatch(None, candidate.value, (candidate,))

        return _grammar_func

    if isinstance(expected_tokens, tuple):
        # Multi-token matcher: take exactly as many tokens as expected and
        # compare the whole run at once.
        expected_count = len(expected_tokens)

        def _grammar_func(tokens):
            window = tuple(itertools.islice(tokens, expected_count))
            if window == expected_tokens:
                return TokenMatch(None, None, window)

        return _grammar_func

    raise TypeError(
        "'expected_tokens' must be an instance of Token or a tuple "
        "thereof. Got %r." % expected_tokens)


# Some common grammar functions:


def literal(tokens):
    """Match a token of type "literal"."""
    return token_name(tokens, "literal")


def symbol(tokens):
    """Match a token of type "symbol"."""
    return token_name(tokens, "symbol")


def lparen(tokens):
    """Match a token of type "lparen" (left parenthesis)."""
    return token_name(tokens, "lparen")


def rparen(tokens):
    """Match a token of type "rparen" (right parenthesis)."""
    return token_name(tokens, "rparen")


def lbracket(tokens):
    """Match a token of type "lbracket" (left bracket)."""
    return token_name(tokens, "lbracket")


def rbracket(tokens):
    """Match a token of type "rbracket" (right bracket)."""
    return token_name(tokens, "rbracket")


def comma(tokens):
    """Match a token of type "comma"."""
    return token_name(tokens, "comma")