File: lexer.py

import logging
import os

import ply.lex as lex

_MYPY = False
if _MYPY:
    import typing  # noqa: F401 # pylint: disable=import-error,unused-import,useless-suppression


class MultiToken:
    """Object used to monkeypatch ply.lex so that we can return multiple
    tokens from one lex operation."""
    def __init__(self, tokens):
        self.type = tokens[0].type
        self.tokens = tokens

# Represents a null value. We want to differentiate between the Python "None"
# and null in several places.
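# For example, t_ANY_NULL below sets token.value to NullToken when it lexes
# the literal null, while a None return from Lexer.token() means the token
# stream is exhausted.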
NullToken = object()


class Lexer:
    """
    Lexer. Tokenizes stone files.
    """

    states = (
        ('WSIGNORE', 'inclusive'),
    )
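    # WSIGNORE is pushed by t_LPAR and popped by t_RPAR below: inside
    # parentheses, newlines and comments emit no NEWLINE/INDENT/DEDENT
    # tokens; continuation lines are only validated by _check_for_indent.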

    def __init__(self):
        self.lex = None
        self.tokens_queue = None
        # The current indentation "level" rather than a count of spaces.
        self.cur_indent = None
        self._logger = logging.getLogger('stone.stone.lexer')
        self.last_token = None
        # [(error message, line number), ...]
        self.errors = []

    def input(self, file_data, **kwargs):
        """
        Required by ply.yacc for this to quack (duck typing) like a ply lexer.

        :param str file_data: Contents of the file to lex.
        """
        self.lex = lex.lex(module=self, **kwargs)
        self.tokens_queue = []
        self.cur_indent = 0
        # Hack to avoid tokenization bugs caused by files that do not end in
        # a newline.
        self.lex.input(file_data + '\n')

    def token(self):
        """
        Returns the next LexToken. Returns None when all tokens have been
        exhausted.
        """

        if self.tokens_queue:
            self.last_token = self.tokens_queue.pop(0)
        else:
            r = self.lex.token()
            if isinstance(r, MultiToken):
                self.tokens_queue.extend(r.tokens)
                self.last_token = self.tokens_queue.pop(0)
            else:
                if r is None and self.cur_indent > 0:
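                    # ply has run out of input while we are still inside an
                    # indented block: synthesize a trailing NEWLINE (unless
                    # the last real token already ended a line) plus one
                    # DEDENT per open indentation level so the parser sees
                    # balanced blocks.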
                    if (self.last_token and
                            self.last_token.type not in ('NEWLINE', 'LINE')):
                        newline_token = _create_token(
                            'NEWLINE', '\n', self.lex.lineno, self.lex.lexpos)
                        self.tokens_queue.append(newline_token)
                    dedent_count = self.cur_indent
                    dedent_token = _create_token(
                        'DEDENT', '\t', self.lex.lineno, self.lex.lexpos)
                    self.tokens_queue.extend([dedent_token] * dedent_count)

                    self.cur_indent = 0
                    self.last_token = self.tokens_queue.pop(0)
                else:
                    self.last_token = r
        return self.last_token

    def test(self, data):
        """Logs all tokens for human inspection. Useful for debugging."""
        self.input(data)
        while True:
            token = self.token()
            if not token:
                break
            self._logger.debug('Token %r', token)
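
    # Illustrative usage sketch (not part of the original module); the exact
    # token stream depends on the grammar, so treat the expected output below
    # as an approximation:
    #
    #   lexer = Lexer()
    #   lexer.input('namespace files\n\nalias Path = String\n')
    #   while True:
    #       tok = lexer.token()
    #       if tok is None:
    #           break
    #       print(tok.type, tok.value)
    #   # -> KEYWORD 'namespace', ID 'files', NEWLINE, KEYWORD 'alias',
    #   #    ID 'Path', EQ '=', ID 'String', NEWLINE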

    # List of token names
    tokens = (
        'ID',
        'KEYWORD',
        'PATH',
        'DOT',
    )  # type: typing.Tuple[typing.Text, ...]

    # Whitespace tokens
    tokens += (
        'DEDENT',
        'INDENT',
        'NEWLINE',
    )

    # Attribute lists, aliases
    tokens += (
        'COMMA',
        'EQ',
        'LPAR',
        'RPAR',
    )

    # Primitive types
    tokens += (
        'BOOLEAN',
        'FLOAT',
        'INTEGER',
        'NULL',
        'STRING',
    )

    # List notation
    tokens += (
        'LBRACKET',
        'RBRACKET',
    )

    # Map notation
    tokens += (
        'LBRACE',
        'RBRACE',
        'COLON',
    )

    tokens += (
        'Q',
    )

    # Annotation notation
    tokens += (
        'AT',
    )

    # Regular expression rules for simple tokens
    t_DOT = r'\.'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_EQ = r'='
    t_COMMA = r','
    t_Q = r'\?'
    t_LBRACE = r'\{'
    t_RBRACE = r'\}'
    t_COLON = r'\:'
    t_AT = r'@'

    # TODO(kelkabany): Use scoped/conditional lexing to restrict where keywords
    # are identified as such.
    KEYWORDS = [
        'alias',
        'annotation',
        'annotation_type',
        'attrs',
        'by',
        'deprecated',
        'doc',
        'example',
        'error',
        'extends',
        'import',
        'namespace',
        'patch',
        'route',
        'struct',
        'union',
        'union_closed',
    ]

    RESERVED = {
        'annotation': 'ANNOTATION',
        'annotation_type': 'ANNOTATION_TYPE',
        'attrs': 'ATTRS',
        'deprecated': 'DEPRECATED',
        'by': 'BY',
        'extends': 'EXTENDS',
        'import': 'IMPORT',
        'patch': 'PATCH',
        'route': 'ROUTE',
        'struct': 'STRUCT',
        'union': 'UNION',
        'union_closed': 'UNION_CLOSED',
    }

    tokens += tuple(RESERVED.values())
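    # Only the words in RESERVED get a dedicated token type from t_ANY_ID
    # below (e.g. 'struct' lexes as STRUCT); the remaining KEYWORDS entries
    # (e.g. 'alias', 'doc', 'namespace') lex as the generic KEYWORD token.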

    def t_LPAR(self, token):
        r'\('
        token.lexer.push_state('WSIGNORE')
        return token

    def t_RPAR(self, token):
        r'\)'
        token.lexer.pop_state()
        return token

    def t_ANY_BOOLEAN(self, token):
        r'\btrue\b|\bfalse\b'
        token.value = (token.value == 'true')
        return token

    def t_ANY_NULL(self, token):
        r'\bnull\b'
        token.value = NullToken
        return token

    # No leading digits
    def t_ANY_ID(self, token):
        r'[a-zA-Z_][a-zA-Z0-9_-]*'
        if token.value in self.KEYWORDS:
            if (token.value == 'annotation_type') and self.cur_indent:
                # annotation_type was added as a reserved keyword relatively
                # late, when existing specs could already contain identifiers
                # with the same name. Because annotation_type-the-keyword can
                # only be used at the beginning of a non-indented line, this
                # check lets the keyword and the identifier coexist and
                # maintains backward compatibility.
                # Note: this is kind of a hack; we should get rid of it if the
                # lexer gets better at telling keywords from identifiers in
                # general.
                return token
            token.type = self.RESERVED.get(token.value, 'KEYWORD')
            return token
        else:
            return token
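
    # For example, at the top level of a spec the word annotation_type lexes
    # as ANNOTATION_TYPE, while an indented identifier with the same spelling
    # keeps the default ID type because of the cur_indent check above.
    # (Hypothetical spec usage, for illustration only.)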

    def t_ANY_PATH(self, token):
        r'\/[/a-zA-Z0-9_-]*'
        return token

    def t_ANY_FLOAT(self, token):
        r'-?\d+(\.\d*(e-?\d+)?|e-?\d+)'
        token.value = float(token.value)
        return token

    def t_ANY_INTEGER(self, token):
        r'-?\d+'
        token.value = int(token.value)
        return token

    # Read in a string while respecting the following escape sequences:
    # \", \\, \n, and \t.
    def t_ANY_STRING(self, t):
        r'\"([^\\"]|(\\.))*\"'
        escaped = 0
        t.lexer.lineno += t.value.count('\n')
        s = t.value[1:-1]
        new_str = ""
        for c in s:
            if escaped:
                if c == 'n':
                    c = '\n'
                elif c == 't':
                    c = '\t'
                new_str += c
                escaped = 0
            else:
                if c == '\\':
                    escaped = 1
                else:
                    new_str += c
        # Strip the current indentation level from each line.
        indentation_str = ' ' * _indent_level_to_spaces_count(self.cur_indent)
        lines_without_indentation = [
            line.replace(indentation_str, '', 1)
            for line in new_str.splitlines()]
        t.value = '\n'.join(lines_without_indentation)
        return t
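
    # Illustrative example (hypothetical spec text): with cur_indent == 1
    # (4 spaces), a raw match of '"Multi-line doc\n    second line"' yields
    # the value 'Multi-line doc\nsecond line'. Escape sequences (\n, \t,
    # \", \\) are decoded and the current indentation is stripped from each
    # continuation line.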

    # Ignore comments.
    # There are two types of comments.
    # 1. Comments that take up a full line. These lines are ignored entirely.
    # 2. Comments that come after tokens on the same line. These comments
    #    are ignored, but we still need to emit a NEWLINE since this rule
    #    consumes all trailing newlines.
    # Regardless of comment type, the following line must be checked for a
    # DEDENT or INDENT.
    def t_INITIAL_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        # Scan backwards from the comment hash to figure out which type of
        # comment this is. If we find a non-whitespace character first, the
        # comment only took up part of the line. If we find a newline before
        # any non-whitespace character, the entire line was a comment.
        i = token.lexpos - 1
        while i >= 0:
            is_full_line_comment = token.lexer.lexdata[i] == '\n'
            is_partial_line_comment = (not is_full_line_comment and
                                       token.lexer.lexdata[i] != ' ')
            if is_full_line_comment or is_partial_line_comment:
                newline_token = _create_token('NEWLINE', '\n',
                    token.lineno, token.lexpos + len(token.value) - 1)
                newline_token.lexer = token.lexer
                dent_tokens = self._create_tokens_for_next_line_dent(
                    newline_token)
                if is_full_line_comment:
                    # Comment takes the full line so ignore entirely.
                    return dent_tokens
                elif is_partial_line_comment:
                    # Comment is only a partial line. Preserve newline token.
                    if dent_tokens:
                        dent_tokens.tokens.insert(0, newline_token)
                        return dent_tokens
                    else:
                        return newline_token
            i -= 1

    def t_WSIGNORE_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        newline_token = _create_token('NEWLINE', '\n',
            token.lineno, token.lexpos + len(token.value) - 1)
        newline_token.lexer = token.lexer
        self._check_for_indent(newline_token)

    # Define a rule so we can track line numbers
    def t_INITIAL_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        dent_tokens = self._create_tokens_for_next_line_dent(newline_token)
        if dent_tokens:
            dent_tokens.tokens.insert(0, newline_token)
            return dent_tokens
        else:
            return newline_token

    def t_WSIGNORE_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        self._check_for_indent(newline_token)

    def _create_tokens_for_next_line_dent(self, newline_token):
        """
        Starting from a newline token that isn't followed by another newline
        token, returns any indent or dedent tokens that immediately follow.
        If indentation doesn't change, returns None.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 0:
            # Next line's indent isn't relevant OR there was no change in
            # indentation.
            return None

        dent_type = 'INDENT' if indent_delta > 0 else 'DEDENT'
        dent_token = _create_token(
            dent_type, '\t', newline_token.lineno + 1,
            newline_token.lexpos + len(newline_token.value))

        tokens = [dent_token] * abs(indent_delta)
        self.cur_indent += indent_delta
        return MultiToken(tokens)
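
    # For example, if indentation drops from level 2 back to level 0, the
    # delta is -2 and this returns a MultiToken wrapping two DEDENT tokens,
    # which Lexer.token() then hands out one at a time via tokens_queue.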

    def _check_for_indent(self, newline_token):
        """
        Checks that a continuation line (the line following a newline while in
        the WSIGNORE state) is indented exactly one additional level;
        otherwise a parsing error is recorded.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 1:
            # Next line's indent isn't relevant (e.g. it's a comment) OR
            # next line is correctly indented.
            return None
        else:
            self.errors.append(
                ('Line continuation must increment indent by 1.',
                 newline_token.lexer.lineno))

    def _get_next_line_indent_delta(self, newline_token):
        """
        Returns the change in indentation, measured in indentation levels
        rather than in spaces or tabs.

        If the next line's indent isn't relevant (e.g. it's a comment),
        returns None. Since the return value might be 0, the caller should
        explicitly compare the result against None rather than rely on
        truthiness.
        """
        assert newline_token.type == 'NEWLINE', \
            'Can only search for a dent starting from a newline.'
        next_line_pos = newline_token.lexpos + len(newline_token.value)
        if next_line_pos == len(newline_token.lexer.lexdata):
            # Reached end of file
            return None

        line = newline_token.lexer.lexdata[next_line_pos:].split(os.linesep, 1)[0]
        if not line:
            return None
        lstripped_line = line.lstrip()
        lstripped_line_length = len(lstripped_line)
        if lstripped_line_length == 0:
            # If the next line is composed of only spaces, ignore indentation.
            return None
        if lstripped_line[0] == '#':
            # If it's a comment line, ignore indentation.
            return None

        indent = len(line) - lstripped_line_length
        if indent % 4 > 0:
            self.errors.append(
                ('Indent is not divisible by 4.', newline_token.lexer.lineno))
            return None

        indent_delta = indent - _indent_level_to_spaces_count(self.cur_indent)
        return indent_delta // 4
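
    # Worked example: with cur_indent == 1 (4 spaces), a next line that
    # starts with 8 spaces gives indent == 8, so the delta is
    # (8 - 4) // 4 == 1 and the caller emits a single INDENT token.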

    # A string containing ignored characters (spaces and tabs)
    t_ignore = ' \t'

    # Error handling rule
    def t_ANY_error(self, token):
        self._logger.debug('Illegal character %r at line %d',
                           token.value[0], token.lexer.lineno)
        self.errors.append(
            ('Illegal character %s.' % repr(token.value[0]).lstrip('u'),
             token.lexer.lineno))
        token.lexer.skip(1)


def _create_token(token_type, value, lineno, lexpos):
    """
    Helper for creating ply.lex.LexToken objects. Unfortunately, LexToken
    does not define a constructor that makes setting these values easy.
    """
    token = lex.LexToken()
    token.type = token_type
    token.value = value
    token.lineno = lineno
    token.lexpos = lexpos
    return token

def _indent_level_to_spaces_count(indent):
    return indent * 4
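

# Illustrative manual check, not part of the original module: tokenize a spec
# file passed on the command line and log every token (assumes DEBUG logging
# output is what you want to inspect).
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.DEBUG)
    with open(sys.argv[1]) as spec_file:
        Lexer().test(spec_file.read())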