# EFILTER Forensic Query Language
#
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module implements a parser that manages tokenizer output based on rules.
"""
__author__ = "Adam Sindelar <adamsh@google.com>"
from efilter import errors
from efilter.parsers.common import grammar
class TokenStream(object):
"""Manages and enforces grammar over tokenizer output.
Most recursive descent parsers need a mechanism to accept, reject, expect
or peek at the next token based on matching loging supplied by grammar
functions. This class manages the tokenizer for the parser, and enforces
the expectations set by grammar.
Arguments:
tokenizer: Must support the tokenizer interface (skip and peek).
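
    Example:
        A minimal sketch of the accept/expect pattern; 'my_tokenizer' and
        the 'keyword' and 'expression' grammar functions are hypothetical
        stand-ins (see also the runnable sketch at the bottom of this
        module):

            stream = TokenStream(my_tokenizer)
            if stream.accept(keyword, "select"):
                ...  # SELECT was consumed; parse what follows it.
            stream.expect(expression)  # Parse error unless this matches.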
"""
tokenizer = None
matched = None
def __init__(self, tokenizer=None):
self.tokenizer = tokenizer

    def match(self, f, *args):
        """Match grammar function 'f' against the next token(s).

        Arguments:
            f: A grammar function - see efilter.parsers.common.grammar. Must
                return TokenMatch or None.
            args: Passed to 'f', if any.

        Returns:
            Instance of efilter.parsers.common.grammar.TokenMatch or None.
            If a match is returned, it is also stored in 'self.matched'.
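
        Example:
            A minimal grammar function, sketched under the assumption that
            TokenMatch takes (operator, value, tokens); real grammar
            functions live in efilter.parsers.common.grammar:

                def comma(tokenizer):
                    token = tokenizer.peek(0)
                    if token and token.value == ",":
                        return grammar.TokenMatch(None, token.value, (token,))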
"""
        try:
            match = f(self.tokenizer, *args)
        except StopIteration:
            # The grammar function might have tried to access more tokens
            # than are available. That's not an error; it just means the
            # function didn't match.
            return None

        if match is None:
            return None

        if not isinstance(match, grammar.TokenMatch):
            raise TypeError("Invalid grammar function %r returned %r."
                            % (f, match))

        self.matched = match
        return match

    def accept(self, f, *args):
        """Like 'match', but consume the matched tokens (tokenizer advances)."""
        match = self.match(f, *args)
        if match is None:
            return None

        self.tokenizer.skip(len(match.tokens))
        return match

    def reject(self, f, *args):
        """Like 'match', but raise a parse error if 'f' matches.

        This is useful when a parser wants to be strict about specific things
        being prohibited. For example, DottySQL bans the use of SQL keywords
        as variable names.
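
        Example:
            An illustrative sketch; 'sql_keyword' is a hypothetical grammar
            function that matches reserved words:

                stream.reject(sql_keyword)  # Raises if a keyword is next.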
"""
match = self.match(f, *args)
if match:
token = self.peek(0)
raise errors.EfilterParseError(
query=self.tokenizer.source, token=token,
message="Was not expecting a %s here." % token.name)

    def expect(self, f, *args):
        """Like 'accept', but raise a parse error if 'f' doesn't match."""
        match = self.accept(f, *args)
        if match:
            return match

        # Some grammar callables (e.g. functools.partial objects) don't
        # have a name.
        func_name = getattr(f, "__name__", "<unnamed grammar function>")

        start, end = self.current_position()
        raise errors.EfilterParseError(
            query=self.tokenizer.source, start=start, end=end,
            message="Was expecting %s here." % func_name)

    def current_position(self):
        """Return a (start, end) tuple for the current token.

        Falls back on the tokenizer's own position if no token is available.
        """
        token = self.tokenizer.peek(0)
        if token:
            return token.start, token.end

        return self.tokenizer.position, self.tokenizer.position + 1

    def peek(self, n):
        """Same as self.tokenizer.peek."""
        return self.tokenizer.peek(n)

    def skip(self, n):
        """Same as self.tokenizer.skip."""
        return self.tokenizer.skip(n)

    def __iter__(self):
        """Same as iter(self.tokenizer)."""
        return iter(self.tokenizer)
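

# What follows is a minimal, runnable usage sketch, kept out of the import
# path behind __main__. 'Token' and 'ListTokenizer' are hypothetical
# stand-ins for the real classes in efilter.parsers.common.tokenizer, and
# the 'symbol' grammar function assumes grammar.TokenMatch can be built
# from (operator, value, tokens).
if __name__ == "__main__":
    import collections

    Token = collections.namedtuple("Token", ["name", "value", "start", "end"])

    class ListTokenizer(object):
        """Serves tokens from a list; implements peek, skip and position."""

        def __init__(self, source, tokens):
            self.source = source
            self.tokens = list(tokens)
            self.position = 0

        def peek(self, n):
            idx = self.position + n
            return self.tokens[idx] if idx < len(self.tokens) else None

        def skip(self, n):
            self.position += n

    def symbol(tokenizer):
        """Grammar function: match a single token named 'symbol'."""
        token = tokenizer.peek(0)
        if token is not None and token.name == "symbol":
            return grammar.TokenMatch(None, token.value, (token,))

    stream = TokenStream(ListTokenizer(
        "foo bar", [Token("symbol", "foo", 0, 3),
                    Token("symbol", "bar", 4, 7)]))

    print(stream.accept(symbol).value)  # foo (consumed; tokenizer advances).
    print(stream.expect(symbol).value)  # bar (would raise on no match).
    stream.reject(symbol)  # No-op here: the stream is exhausted.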