1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
|
# EFILTER Forensic Query Language
#
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides grammar primitives common across most parsers.
### What is a grammar?
In the EFILTER world, we use the word 'grammar' to mean a collection of
stateless functions that take an iterable of tokens and return a TokenMatch
if the tokens match the grammatical construct the function represents.
These functions are called 'grammar functions' (gasp!).
For example, a grammar function that matches a parenthesis would be:
def lparen(tokens):
first_token = next(iter(tokens))
if first_token.name == "lparen":
return TokenMatch(operator=None, value=None, tokens=[first_token])
To make writing grammar functions easier, this module provides a number of
primitives that largely insulate the programmer from having to write all of
the above. The real world lparen function actually looks like this:
def lparen(tokens):
return token_name(tokens, "lparen")
### Operators
A common theme across most grammars are operators, and this module provides
a convenient container to group grammar functions related to operators.
The 'Operator' container groups basic information about an operator, starting
with its name and docstring, its suffix, infix and prefix parts, and also an
AST construct that the operator maps onto.
For example, the addition operator would look like this:
plus = Operator(
handler=ast.Sum,
assoc="left",
infix="+"
...)
"""
__author__ = "Adam Sindelar <adamsh@google.com>"
import collections
import itertools
import six
class Token(collections.namedtuple("Token", "name value start end")):
"""Represents one token, which is what grammars operate on."""
def __new__(cls, name, value, start=None, end=None):
return super(Token, cls).__new__(cls, name, value, start, end)
def __repr__(self):
return "Token(name='%s', value='%s', start=%d, end=%d)" % (
self.name, self.value, self.start or 0, self.end or 0)
def __eq__(self, other):
"""Tokens compare on name and value, not on position."""
return (self.name, self.value) == (other.name, other.value)
def __hash__(self):
"""Tokens hash on name and value, not on position."""
return hash((self.name, self.value))
class Operator(collections.namedtuple(
"Operator",
"name precedence assoc handler docstring prefix infix suffix")):
"""Declares an operator in a grammar with functions to match it.
Operators can have prefix, infix and suffix parts, each of which is
represented by the token (like Token("keyword", "+", None)). Each operator
must have at least one of the *fixes. This class has no restriction on which
*fixes can be used together, but the individual parsers may not support
every combination. For example, DottySQL doesn't parse circumfix
(prefix + suffix) operators.
Previously, DottySQL used grammar functions for suffixes, which works well
when there is only a small number of them, but is very slow if there are
many operators. In practice, the grammar functions matching *fixes almost
always just call _keyword, which means they can be replaced with a lookup
in the operator table.
Arguments:
name: The literal name of the operator, such as "+" or "not".
precedence: Integer precedence with operators of the same arity.
handler: Callable that emits AST for this operator.
docstring: Documentation for the operator.
assoc: Associativity - can be left or right for infix operators.
suffix: (OPTIONAL) The token (not grammar function) of the suffix.
prefix: (OPTIONAL) The token (not grammar function) of the prefix.
infix: (OPTIONAL) The token (not grammar function) of the infix.
"""
class TokenLookupTable(object):
"""Ordered associative container where tokens are keys.
Public properties:
case_sensitive (default False): If set to False, all lookups will be
converted to lower case. NOTE: Does not affect insertion:
case-insensitive grammar should insert operators in lower case.
"""
_max_len = 1 # Longest match so far.
_table = None # Ordered dict keyed on tokens.
# This affects only lookups, not insertion.
case_sensitive = False
def __init__(self, *entries):
self._table = collections.OrderedDict()
for tokens, entry in entries:
self.set(tokens, entry)
def set(self, tokens, entry):
if isinstance(tokens, Token):
tokens = (tokens,)
elif isinstance(tokens, tuple):
self._max_len = max(self._max_len, len(tokens))
else:
raise TypeError(
"TokenLookupTable only supports instances of Token or "
"tuples thereof for keys. Got %r." % tokens)
if tokens in self._table:
raise ValueError("Duplicate token key %r for %r." % (
tokens, entry))
self._table[tokens] = entry
def _normalize_token(self, token):
if (isinstance(token.value, six.string_types)
and not self.case_sensitive):
return token._replace(value=token.value.lower())
return token
def match(self, tokens):
# Try to match longest known match first.
for match_len in range(self._max_len, 0, -1):
needle = tuple((self._normalize_token(t)
for t in itertools.islice(tokens, match_len)))
result = self._table.get(needle)
if result:
return result, needle
return None, None
class OperatorTable(object):
"""A complete set of operators in a grammar, keyed on their *fix tokens."""
prefix = None
infix = None
suffix = None
by_name = None
by_handler = None
def __init__(self, *operators):
self.prefix = TokenLookupTable()
self.infix = TokenLookupTable()
self.suffix = TokenLookupTable()
self.by_name = dict()
self.by_handler = dict()
for operator in operators:
if operator.name in self.by_name:
raise ValueError("Duplicit operator name %r." % operator.name)
self.by_name[operator.name] = operator
if operator.handler not in self.by_handler:
# Multiple operators can have the same handler, in which case
# they are probably aliases that mean the same thing. In that
# case the first operator "wins" and will likely be what
# the formatter for this syntax ends up using as default when
# it formats this AST.
self.by_handler[operator.handler] = operator
# An operator can have multiple components, but it is only indexed
# by the first one to prevent ambiguity.
if operator.prefix:
self.prefix.set(operator.prefix, operator)
elif operator.infix:
self.infix.set(operator.infix, operator)
elif operator.suffix:
self.suffix.set(operator.suffix, operator)
# Grammar primitives and helpers. (No grammar functions until the end of file.)
class TokenMatch(collections.namedtuple(
"TokenMatch", "operator value tokens")):
"""Represents a one or more matching tokens and, optionally, their contents.
Arguments:
operator: The Operator instance that matched, if any.
value: The literal value that matched, if any.
tokens: The actual tokens the match consumed.
"""
@property
def start(self):
return self.tokens[0].start
@property
def end(self):
return self.tokens[-1].end
@property
def first(self):
return self.tokens[0]
def keyword(tokens, expected):
"""Case-insensitive keyword match."""
try:
token = next(iter(tokens))
except StopIteration:
return
if token and token.name == "symbol" and token.value.lower() == expected:
return TokenMatch(None, token.value, (token,))
def multi_keyword(tokens, keyword_parts):
"""Match a case-insensitive keyword consisting of multiple tokens."""
tokens = iter(tokens)
matched_tokens = []
limit = len(keyword_parts)
for idx in six.moves.range(limit):
try:
token = next(tokens)
except StopIteration:
return
if (not token or token.name != "symbol" or
token.value.lower() != keyword_parts[idx]):
return
matched_tokens.append(token)
return TokenMatch(None, token.value, matched_tokens)
def keywords(tokens, expected):
"""Match against any of a set/dict of keywords.
Not that this doesn't support multi-part keywords. Any multi-part keywords
must be special-cased in their grammar function.
"""
try:
token = next(iter(tokens))
except StopIteration:
return
if token and token.name == "symbol" and token.value.lower() in expected:
return TokenMatch(None, token.value, (token,))
def prefix(tokens, operator_table):
"""Match a prefix of an operator."""
operator, matched_tokens = operator_table.prefix.match(tokens)
if operator:
return TokenMatch(operator, None, matched_tokens)
def infix(tokens, operator_table):
"""Match an infix of an operator."""
operator, matched_tokens = operator_table.infix.match(tokens)
if operator:
return TokenMatch(operator, None, matched_tokens)
def suffix(tokens, operator_table):
"""Match a suffix of an operator."""
operator, matched_tokens = operator_table.suffix.match(tokens)
if operator:
return TokenMatch(operator, None, matched_tokens)
def token_name(tokens, expected):
"""Match a token name (type)."""
try:
token = next(iter(tokens))
except StopIteration:
return
if token and token.name == expected:
return TokenMatch(None, token.value, (token,))
def match_tokens(expected_tokens):
"""Generate a grammar function that will match 'expected_tokens' only."""
if isinstance(expected_tokens, Token):
# Match a single token.
def _grammar_func(tokens):
try:
next_token = next(iter(tokens))
except StopIteration:
return
if next_token == expected_tokens:
return TokenMatch(None, next_token.value, (next_token,))
elif isinstance(expected_tokens, tuple):
# Match multiple tokens.
match_len = len(expected_tokens)
def _grammar_func(tokens):
upcoming = tuple(itertools.islice(tokens, match_len))
if upcoming == expected_tokens:
return TokenMatch(None, None, upcoming)
else:
raise TypeError(
"'expected_tokens' must be an instance of Token or a tuple "
"thereof. Got %r." % expected_tokens)
return _grammar_func
# Some common grammar functions:
def literal(tokens):
return token_name(tokens, "literal")
def symbol(tokens):
return token_name(tokens, "symbol")
def lparen(tokens):
return token_name(tokens, "lparen")
def rparen(tokens):
return token_name(tokens, "rparen")
def lbracket(tokens):
return token_name(tokens, "lbracket")
def rbracket(tokens):
return token_name(tokens, "rbracket")
def comma(tokens):
return token_name(tokens, "comma")
|