File: annotation_tokenizer.py

# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
A tokenizer for traffic annotation definitions.
"""

from typing import NamedTuple, Optional

import re
import textwrap

# Regexen that match a token inside the annotation definition arguments. Stored
# as a list instead of a dict, to preserve order.
#
# Order matters because otherwise, 'symbol' could be parsed before
# 'string_literal' (i.e., R"(...)" would be misinterpreted as the symbol 'R',
# followed by a string with parentheses in it).
TOKEN_REGEXEN = [
    # Comma for separating args.
    ('comma', re.compile(r'(,)')),
    # Java text blocks (must come before string_literal).
    ('text_block', re.compile(r'"""\n(.*?)"""', re.DOTALL)),
    # String literal. "string" or R"(string)". In Java, this will incorrectly
    # accept R-strings, which aren't part of the language's syntax. But since
    # that wouldn't compile anyway, we can just ignore this issue.
    ('string_literal', re.compile(r'"((?:\\.|[^"])*?)"|R"\((.*?)\)"',
                                  re.DOTALL)),
    # C++ or Java identifier.
    ('symbol', re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*)')),
    # Left parenthesis.
    ('left_paren', re.compile(r'(\()')),
    # Right parenthesis.
    ('right_paren', re.compile(r'(\))')),
]
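
# For example, given the input R"(a, b)", the ordering above produces a single
# 'string_literal' token with value 'a, b'. If 'symbol' were tried first, the
# same input would instead tokenize as the symbol 'R' followed by the string
# '(a, b)'.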

# Number of characters to include in the context (for error reporting).
CONTEXT_LENGTH = 20


class Token(NamedTuple):
  type: str   # A key from TOKEN_REGEXEN, e.g. 'symbol' or 'string_literal'.
  value: str  # Token contents (escape sequences already processed).
  pos: int    # Offset into the tokenized string just past this token.


def _process_backslashes(string):
  # https://stackoverflow.com/questions/4020539
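  # e.g. the two-character sequence '\' + 'n' becomes a real newline, so the
  # tokenizer sees the string roughly as the C++/Java compiler would.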
  return bytes(string, 'utf-8').decode('unicode_escape')


class SourceCodeParsingError(Exception):
  """An error during C++ or Java parsing/tokenizing."""

  def __init__(self, expected_type, body, pos, file_path, line_number):
    context = body[pos:pos + CONTEXT_LENGTH]
    msg = ("Expected {} in annotation definition at {}:{}.\n" +
           "near '{}'").format(expected_type, file_path, line_number, context)
    super().__init__(msg)


class Tokenizer:
  """Simple tokenizer with basic error reporting.

  Use advance() or maybe_advance() to take tokens from the string, one at a
  time.
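
  A minimal doctest-style sketch (the input string, file name, and line number
  are made up for illustration):

    >>> t = Tokenizer('foo("bar")', 'file.cc', 1)
    >>> t.advance('symbol')
    'foo'
    >>> t.maybe_advance('comma') is None
    True
    >>> t.advance('left_paren')
    '('
    >>> t.advance('string_literal')
    'bar'
    >>> t.advance('right_paren')
    ')'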
  """

  def __init__(self, body, file_path, line_number):
    self.body = body
    self.pos = 0
    self.file_path = file_path
    self.line_number = line_number

  def _assert_token_type(self, token, expected_type):
    """Like assert(), but reports errors in a _somewhat_ useful way."""
    if token and token.type == expected_type:
      return
    # Skip whitespace to make the error message more useful.
    pos = self._skip_whitespace()
    raise SourceCodeParsingError(expected_type, self.body, pos, self.file_path,
                                 self.line_number)

  def _skip_whitespace(self):
    """Return the position of the first non-whitespace character from here."""
    whitespace_re = re.compile(r'\s*')
    return whitespace_re.match(self.body, self.pos).end()

  def _get_token(self):
    """Return the token here, or None on failure."""
    # Skip initial whitespace.
    pos = self._skip_whitespace()

    # Find the token here, if there's one.
    token = None

    for (token_type, regex) in TOKEN_REGEXEN:
      re_match = regex.match(self.body, pos)
      if re_match:
        raw_token = re_match.group(0)
        token_content = next(g for g in re_match.groups() if g is not None)
        if token_type == 'string_literal' and not raw_token.startswith('R"'):
          # Remove the extra backslash in backslash sequences, but only in
          # non-R strings. R-strings don't need escaping.
          token_content = _process_backslashes(token_content)
        elif token_type == 'text_block':
          token_type = 'string_literal'
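          # Java text blocks strip the common incidental indentation;
          # textwrap.dedent() approximates that behavior here.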
          token_content = _process_backslashes(textwrap.dedent(token_content))
        token = Token(token_type, token_content, re_match.end())
        break

    return token

  def maybe_advance(self, expected_type: str) -> Optional[str]:
    """Advance the tokenizer by one token if it has |expected_type|.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token if it has the right type, or None if it
      has another type or no token could be read at all.
    """
    token = self._get_token()
    if token and token.type == expected_type:
      self.pos = token.pos
      return token.value
    return None

  def advance(self, expected_type: str) -> str:
    """Advance the tokenizer by one token, asserting its type.

    Raises SourceCodeParsingError if the token at point has the wrong type.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token at point.
    """
    token = self._get_token()
    self._assert_token_type(token, expected_type)
    self.pos = token.pos
    return token.value
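

# A minimal usage sketch, assuming an annotation-style argument list as input.
# The input string, file name, and line number below are made up for
# illustration; the real tool feeds in text extracted from C++/Java sources.
if __name__ == '__main__':
  tokenizer = Tokenizer('("my_id", R"(semantics)")', 'example.cc', 10)
  tokenizer.advance('left_paren')
  print(tokenizer.advance('string_literal'))  # my_id
  tokenizer.advance('comma')
  print(tokenizer.advance('string_literal'))  # semantics
  tokenizer.advance('right_paren')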