File: wrapped_tokenize.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
module performs a small number of transformations to the token stream:

- `end_pos` is precomputed as a property, instead of lazily as a method, for more
  efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
  indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
  error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.

If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""

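# A minimal usage sketch (illustrative only; assumes the vendored parso utils
# expose parse_version_string, as in upstream libcst):
#
#     from libcst._parser.parso.utils import parse_version_string
#
#     for tok in tokenize("x = 1\n", parse_version_string("3.8")):
#         # end_pos is a plain attribute here, and the whitespace states carry
#         # the indentation context described above.
#         print(tok.type, tok.string, tok.end_pos, tok.whitespace_before)
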
from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState

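# Module-level aliases for the parso token types matched against below.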
_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT

_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER

_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END

_OP: TokenType = PythonTokenTypes.OP


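# Entries for the combined parenthesis/f-string nesting stack. Inside parentheses,
# newlines act as implicit line continuations, but an enclosing f-string suspends
# that behavior, so both kinds of nesting share a single stack (see _convert_token).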
class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    FSTRING = 1  # Must differ from PARENTHESIS; equal values would make this an alias.


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)


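# Mutable state threaded through a single tokenize run: the source lines, the
# whitespace state produced by the previous token, the stack of indentation
# prefixes (one entry per open indent level), and the parenthesis/f-string stack.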
@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
    indents: List[str] = field(default_factory=lambda: [""])
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


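# Both entry points below prefer the native tokenizer when the optional
# libcst_native extension is importable, falling back to the pure-Python
# wrapper otherwise.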
def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]

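    # Maintain the indent stack using the one-token lookahead: an INDENT token's new
    # indentation prefix is the text on its line before its start column, and a
    # DEDENT pops back to the enclosing prefix.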
    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1])
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) is allowed where it would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have to
        # duplicate that logic here.

        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )

        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the next
    # node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )