File: string_tokenizer.c

package info (click to toggle)
python3.14 3.14.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 169,680 kB
  • sloc: python: 751,968; ansic: 717,163; xml: 31,250; sh: 5,989; cpp: 4,063; makefile: 1,995; objc: 787; lisp: 502; javascript: 136; asm: 75; csh: 12
file content (135 lines) | stat: -rw-r--r-- 3,879 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

/* Underflow routine installed by _PyTokenizer_FromString(): make the next
   line of the in-memory text available to the tokenizer.

   The line starts at tok->inp and extends through the next '\n'
   (inclusive), or up to the terminating NUL when no newline is left.
   Returns 1 after moving tok->inp past that line.  Returns 0 and sets
   tok->done to E_EOF when tok->inp already points at the terminating NUL. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *line_end;
    char *newline = strchr(tok->inp, '\n');

    if (newline == NULL) {
        /* No newline left: the final line ends at the NUL terminator. */
        line_end = strchr(tok->inp, '\0');
        if (line_end == tok->inp) {
            /* Not a single byte remains. */
            tok->done = E_EOF;
            return 0;
        }
    }
    else {
        /* Keep the '\n' as part of the line. */
        line_end = newline + 1;
    }

    /* NOTE(review): presumably tok->start == NULL means no token is in
       progress, so the buffer start may catch up with the cursor --
       confirm against the definition of struct tok_state. */
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = line_end;
    return 1;
}

/* Read one byte from TOK's string buffer: return the byte at tok->str,
   passed through Py_CHARMASK, and advance tok->str by one.  No end-of-input
   check is made here. */
static int
buf_getc(struct tok_state *tok) {
    char byte = *tok->str;
    tok->str++;
    return Py_CHARMASK(byte);
}

/* Undo the most recent buf_getc() on TOK.  Nothing is written to the
   buffer: tok->str is only moved back by one byte.  C is used solely by
   the assertion, which checks that it equals the byte now under tok->str. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    --tok->str;
    assert(c == Py_CHARMASK(*tok->str));
}

/* "Set readline" hook for the string-based tokenizer.  There is no
   readline function to install when tokenizing from a string, so this
   only records ENC in tok->enc.  The pointer is stored as-is (no copy is
   made).  Always returns 1. */
static int
buf_setreadl(struct tok_state *tok, const char *enc)
{
    tok->enc = enc;

    return 1;
}

/* Decode the byte string INPUT for use as the buffer of TOK.
   Look for encoding declarations inside INPUT, and record them
   inside TOK.

   SINGLE and PRESERVE_CRLF are passed through unchanged to
   _PyTokenizer_translate_newlines(), whose result is stored in tok->input.

   Returns the text to tokenize, or NULL on error.  On success the returned
   pointer refers either into tok->input (after the BOM, if any) or into
   the bytes object stored in tok->decoding_buffer.

   Ownership: a bytes object created here is handed to tok->decoding_buffer
   on every exit path that follows its creation, including the error exits,
   so that the reference is not dropped on the floor.  This assumes that
   _PyTokenizer_Free(), which the caller invokes on failure, releases
   tok->decoding_buffer -- as the success path already relies on.  */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};     /* first two '\n' positions */
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Record where the first and second lines end; stop at the NUL
       terminator or as soon as two newlines have been seen. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        tok->lineno = 1;
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            /* Do not leak utf8 (may be NULL): let TOK own it. */
            assert(tok->decoding_buffer == NULL);
            tok->decoding_buffer = utf8;
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            tok->lineno = 2;
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl)) {
                /* Do not leak utf8 (may be NULL): let TOK own it. */
                assert(tok->decoding_buffer == NULL);
                tok->decoding_buffer = utf8;
                return NULL;
            }
        }
    }
    tok->lineno = 0;
    if (tok->enc != NULL) {
        /* An encoding declaration was found; the text cannot already have
           been translated above. */
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
        /* Do not leak utf8 (may be NULL): let TOK own it. */
        assert(tok->decoding_buffer == NULL);
        tok->decoding_buffer = utf8;
        return _PyTokenizer_error_ret(tok);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION: str may point into this object */
    return str;
}

/* Create a tokenizer state that reads from the NUL-terminated string STR.

   EXEC_INPUT and PRESERVE_CRLF are forwarded unchanged to decode_str().
   Returns the new state, or NULL when _PyTokenizer_tok_new() or
   decode_str() fails; in the latter case the state is released with
   _PyTokenizer_Free() before returning. */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL) {
        return NULL;
    }

    char *text = decode_str(str, exec_input, tok, preserve_crlf);
    if (text == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* All four buffer pointers start at the beginning of the decoded text;
       tok_underflow_string() later moves tok->inp forward line by line. */
    tok->inp = text;
    tok->cur = text;
    tok->buf = text;
    tok->end = text;
    tok->underflow = &tok_underflow_string;
    return tok;
}