1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
import sys
import logging
import ply.lex
from jsonpath_ng.exceptions import JsonPathLexerError
logger = logging.getLogger(__name__)
class JsonPathLexer:
'''
A Lexical analyzer for JsonPath.
'''
def __init__(self, debug=False):
self.debug = debug
if self.__doc__ is None:
raise JsonPathLexerError('Docstrings have been removed! By design of PLY, jsonpath-rw requires docstrings. You must not use PYTHONOPTIMIZE=2 or python -OO.')
def tokenize(self, string):
'''
Maps a string to an iterator over tokens. In other words: [char] -> [token]
'''
new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
new_lexer.latest_newline = 0
new_lexer.string_value = None
new_lexer.input(string)
while True:
t = new_lexer.token()
if t is None:
break
t.col = t.lexpos - new_lexer.latest_newline
yield t
if new_lexer.string_value is not None:
raise JsonPathLexerError('Unexpected EOF in string literal or identifier')
# ============== PLY Lexer specification ==================
#
# This probably should be private but:
# - the parser requires access to `tokens` (perhaps they should be defined in a third, shared dependency)
# - things like `literals` might be a legitimate part of the public interface.
#
# Anyhow, it is pythonic to give some rope to hang oneself with :-)
literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&', '~']
reserved_words = {
'where': 'WHERE',
'wherenot': 'WHERENOT',
}
tokens = ['DOUBLEDOT', 'NUMBER', 'ID', 'NAMED_OPERATOR'] + list(reserved_words.values())
states = [ ('singlequote', 'exclusive'),
('doublequote', 'exclusive'),
('backquote', 'exclusive') ]
# Normal lexing, rather easy
t_DOUBLEDOT = r'\.\.'
t_ignore = ' \t'
def t_ID(self, t):
r'[a-zA-Z_@][a-zA-Z0-9_@\-]*'
t.type = self.reserved_words.get(t.value, 'ID')
return t
def t_NUMBER(self, t):
r'-?\d+'
t.value = int(t.value)
return t
# Single-quoted strings
t_singlequote_ignore = ''
def t_singlequote(self, t):
r"'"
t.lexer.string_start = t.lexer.lexpos
t.lexer.string_value = ''
t.lexer.push_state('singlequote')
def t_singlequote_content(self, t):
r"[^'\\]+"
t.lexer.string_value += t.value
def t_singlequote_escape(self, t):
r'\\.'
t.lexer.string_value += t.value[1]
def t_singlequote_end(self, t):
r"'"
t.value = t.lexer.string_value
t.type = 'ID'
t.lexer.string_value = None
t.lexer.pop_state()
return t
def t_singlequote_error(self, t):
raise JsonPathLexerError('Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
# Double-quoted strings
t_doublequote_ignore = ''
def t_doublequote(self, t):
r'"'
t.lexer.string_start = t.lexer.lexpos
t.lexer.string_value = ''
t.lexer.push_state('doublequote')
def t_doublequote_content(self, t):
r'[^"\\]+'
t.lexer.string_value += t.value
def t_doublequote_escape(self, t):
r'\\.'
t.lexer.string_value += t.value[1]
def t_doublequote_end(self, t):
r'"'
t.value = t.lexer.string_value
t.type = 'ID'
t.lexer.string_value = None
t.lexer.pop_state()
return t
def t_doublequote_error(self, t):
raise JsonPathLexerError('Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
# Back-quoted "magic" operators
t_backquote_ignore = ''
def t_backquote(self, t):
r'`'
t.lexer.string_start = t.lexer.lexpos
t.lexer.string_value = ''
t.lexer.push_state('backquote')
def t_backquote_escape(self, t):
r'\\.'
t.lexer.string_value += t.value[1]
def t_backquote_content(self, t):
r"[^`\\]+"
t.lexer.string_value += t.value
def t_backquote_end(self, t):
r'`'
t.value = t.lexer.string_value
t.type = 'NAMED_OPERATOR'
t.lexer.string_value = None
t.lexer.pop_state()
return t
def t_backquote_error(self, t):
raise JsonPathLexerError('Error on line %s, col %s while lexing backquoted operator: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
# Counting lines, handling errors
def t_newline(self, t):
r'\n'
t.lexer.lineno += 1
t.lexer.latest_newline = t.lexpos
def t_error(self, t):
raise JsonPathLexerError('Error on line %s, col %s: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
if __name__ == '__main__':
logging.basicConfig()
lexer = JsonPathLexer(debug=True)
for token in lexer.tokenize(sys.stdin.read()):
print('%-20s%s' % (token.value, token.type))
|