File: tokenizer.py (python-baron 0.10.1-1)

import re
from .utils import BaronError


class UnknowItem(BaronError):
    # Raised when an item matches neither a keyword nor any TOKENS pattern.
    # The name (sic) is kept as-is: "UnknowItem" is part of baron's public API.
    pass


KEYWORDS = ("and", "as", "assert", "break", "class", "continue", "def", "del",
            "elif", "else", "except", "exec", "finally", "for", "from",
            "global", "nonlocal", "if", "import", "in", "is", "lambda", "not",
            "or", "pass", "print", "raise", "return", "try", "while", "with",
            "yield")

TOKENS = (
    (r'[a-zA-Z_]\w*', 'NAME'),
    (r'0', 'INT'),
    (r'[-+]?\d+[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'[-+]?\d+\.\d?[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'[-+]?\d?\.\d+[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'\d+[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\d+\.\d*[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\.\d+[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\d*\.\d+[jJ]', 'COMPLEX'),
    (r'\d+\.[jJ]', 'COMPLEX'),
    (r'\d+[jJ]', 'COMPLEX'),
    (r'\d+\.', 'FLOAT'),
    (r'\d*[_\d]*\.[_\d]+[lL]?', 'FLOAT'),
    (r'\d+[_\d]+\.[_\d]*[lL]?', 'FLOAT'),
    (r'\.', 'DOT'),
    (r'[1-9]+[_\d]*[lL]', 'LONG'),
    (r'[1-9]+[_\d]*', 'INT'),
    (r'0[xX][\d_a-fA-F]+[lL]?', 'HEXA'),
    (r'(0[oO][0-7]+)|(0[0-7_]*)[lL]?', 'OCTA'),
    (r'0[bB][01_]+[lL]?', 'BINARY'),
    (r'\(', 'LEFT_PARENTHESIS'),
    (r'\)', 'RIGHT_PARENTHESIS'),
    (r':', 'COLON'),
    (r',', 'COMMA'),
    (r';', 'SEMICOLON'),
    (r'@', 'AT'),
    (r'\+', 'PLUS'),
    (r'-', 'MINUS'),
    (r'\*', 'STAR'),
    (r'/', 'SLASH'),
    (r'\|', 'VBAR'),
    (r'&', 'AMPER'),
    (r'<', 'LESS'),
    (r'>', 'GREATER'),
    (r'=', 'EQUAL'),
    (r'%', 'PERCENT'),
    (r'\[', 'LEFT_SQUARE_BRACKET'),
    (r'\]', 'RIGHT_SQUARE_BRACKET'),
    (r'\{', 'LEFT_BRACKET'),
    (r'\}', 'RIGHT_BRACKET'),
    (r'`', 'BACKQUOTE'),
    (r'==', 'EQUAL_EQUAL'),
    (r'<>', 'NOT_EQUAL'),
    (r'!=', 'NOT_EQUAL'),
    (r'<=', 'LESS_EQUAL'),
    (r'>=', 'GREATER_EQUAL'),
    (r'~', 'TILDE'),
    (r'\^', 'CIRCUMFLEX'),
    (r'<<', 'LEFT_SHIFT'),
    (r'>>', 'RIGHT_SHIFT'),
    (r'\*\*', 'DOUBLE_STAR'),
    (r'\+=', 'PLUS_EQUAL'),
    (r'-=', 'MINUS_EQUAL'),
    (r'@=', 'AT_EQUAL'),
    (r'\*=', 'STAR_EQUAL'),
    (r'/=', 'SLASH_EQUAL'),
    (r'%=', 'PERCENT_EQUAL'),
    (r'&=', 'AMPER_EQUAL'),
    (r'\|=', 'VBAR_EQUAL'),
    (r'\^=', 'CIRCUMFLEX_EQUAL'),
    (r'<<=', 'LEFT_SHIFT_EQUAL'),
    (r'>>=', 'RIGHT_SHIFT_EQUAL'),
    (r'\.\.\.', 'ELLIPSIS'),
    (r'->', 'RIGHT_ARROW'),
    (r'\*\*=', 'DOUBLE_STAR_EQUAL'),
    (r'//', 'DOUBLE_SLASH'),
    (r'//=', 'DOUBLE_SLASH_EQUAL'),
    (r'\n', 'ENDL'),
    (r'\r\n', 'ENDL'),
    (r'#.*', 'COMMENT'),
    (r'(\s|\\\n|\\\r\n)+', 'SPACE'),
    (r'["\'](.|\n|\r)*["\']', 'STRING'),
    (r'[uU]["\'](.|\n|\r)*["\']', 'UNICODE_STRING'),
    (r'[fF]["\'](.|\n|\r)*["\']', 'INTERPOLATED_STRING'),
    (r'[rR]["\'](.|\n|\r)*["\']', 'RAW_STRING'),
    (r'[bB]["\'](.|\n|\r)*["\']', 'BINARY_STRING'),
    (r'[uU][rR]["\'](.|\n|\r)*["\']', 'UNICODE_RAW_STRING'),
    (r'[bB][rR]["\'](.|\n|\r)*["\']', 'BINARY_RAW_STRING'),
    (r'[fF][rR]["\'](.|\n|\r)*["\']', 'INTERPOLATED_RAW_STRING'),
    (r'[rR][fF]["\'](.|\n|\r)*["\']', 'INTERPOLATED_RAW_STRING'),
)
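
# Each entry above pairs an uncompiled pattern with a token name. The greedy
# string patterns such as (.|\n|\r)* are safe only because items arrive
# already split into atomic strings by baron's grouper, so every pattern is
# tested against exactly one candidate token (an assumption about the
# upstream pipeline, not something enforced in this file).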


# Compile each pattern anchored at both ends so it can only match a whole
# item, never a prefix or suffix of one.
TOKENS = [(re.compile('^' + pattern + '$'), name) for pattern, name in TOKENS]
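

# A minimal sketch of what the '^'/'$' anchoring buys (illustration only;
# _demo_anchoring is a hypothetical helper, not part of baron):
def _demo_anchoring():
    name_re = TOKENS[0][0]                   # compiled r'^[a-zA-Z_]\w*$'
    assert name_re.match("foo") is not None  # the whole item is an identifier
    assert name_re.match("foo+") is None     # '$' rejects partial matches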


def tokenize(sequence, print_function=False):
    # `sequence` is an iterable of already-split items, not a raw string.
    return list(tokenize_generator(sequence, print_function))


def tokenize_current_keywords(print_function=False):
    # With print_function=True, "print" and "exec" are treated as plain
    # names rather than keywords (Python 2's print_function semantics).
    if print_function:
        return [x for x in KEYWORDS if x not in ("print", "exec")]
    return KEYWORDS
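

# Usage sketch (_demo_keywords is a hypothetical helper, illustration only):
def _demo_keywords():
    assert "print" in tokenize_current_keywords()
    assert "print" not in tokenize_current_keywords(print_function=True)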


def tokenize_generator(sequence, print_function=False):
    current_keywords = tokenize_current_keywords(print_function)

    for item in sequence:
        # Keywords take precedence over the generic NAME pattern.
        if item in current_keywords:
            yield (item.upper(), item)
            continue

        # Otherwise the first pattern that fully matches the item wins.
        for candidate, token_name in TOKENS:
            if candidate.match(item):
                yield (token_name, item)
                break
        else:
            raise UnknowItem("Can't find a matching token for this item: '%s'" % item)
    yield ('ENDMARKER', '')
    yield  # bare yield: emits a final None after ENDMARKER (kept; consumers may rely on it)
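

# End-to-end sketch (illustration only, not part of baron's public API):
# tokenize() takes pre-split items rather than a raw source string, and the
# result ends with the ENDMARKER pair plus the trailing None sentinel.
if __name__ == "__main__":
    print(tokenize(["a", " ", "=", " ", "42"]))
    # [('NAME', 'a'), ('SPACE', ' '), ('EQUAL', '='), ('SPACE', ' '),
    #  ('INT', '42'), ('ENDMARKER', ''), None]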