/* String tokenizer: feeds the tokenizer from an in-memory, NUL-terminated
   byte string, handling BOM and coding-spec (PEP 263) detection. */
#include "Python.h"
#include "errcode.h"
#include "helpers.h"
#include "../lexer/state.h"
/* Underflow handler for string-based input: make the next source line
   available between tok->cur and tok->inp.  Returns 1 on success, 0 when
   the buffer is exhausted (tok->done is set to E_EOF). */
static int
tok_underflow_string(struct tok_state *tok) {
    /* The next line ends just past the next '\n', or at the terminating
       NUL when no newline remains. */
    char *next = strchr(tok->inp, '\n');
    if (next == NULL) {
        next = tok->inp + strlen(tok->inp);
        if (next == tok->inp) {
            /* Nothing left to read. */
            tok->done = E_EOF;
            return 0;
        }
    }
    else {
        next++;  /* keep the newline as part of the line */
    }
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = next;
    return 1;
}
/* Fetch a byte from TOK, using the string buffer.  Advances tok->str. */
static int
buf_getc(struct tok_state *tok) {
    int ch = Py_CHARMASK(*tok->str);
    tok->str++;
    return ch;
}
/* Unfetch a byte from TOK, using the string buffer.  C must be the byte
   that was last fetched. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    --tok->str;
    /* tok->cur may point to a read-only segment, so only verify the byte,
       never write it back. */
    assert(Py_CHARMASK(*tok->str) == c);
}
/* Set the readline function for TOK to ENC.  The string-based tokenizer
   has no real readline function to install, so this just records the
   encoding.  Always succeeds (returns 1). */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
/* Decode a byte string STR for use as the buffer of TOK.
Look for encoding declarations inside STR, and record them
inside TOK. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
PyObject* utf8 = NULL;
char *str;
const char *s;
const char *newl[2] = {NULL, NULL};
int lineno = 0;
tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
if (str == NULL)
return NULL;
tok->enc = NULL;
tok->str = str;
if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
return _PyTokenizer_error_ret(tok);
str = tok->str; /* string after BOM if any */
assert(str);
if (tok->enc != NULL) {
utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return _PyTokenizer_error_ret(tok);
str = PyBytes_AsString(utf8);
}
for (s = str;; s++) {
if (*s == '\0') break;
else if (*s == '\n') {
assert(lineno < 2);
newl[lineno] = s;
lineno++;
if (lineno == 2) break;
}
}
tok->enc = NULL;
/* need to check line 1 and 2 separately since check_coding_spec
assumes a single line as input */
if (newl[0]) {
tok->lineno = 1;
if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
return NULL;
}
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
tok->lineno = 2;
if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
tok, buf_setreadl))
return NULL;
}
}
tok->lineno = 0;
if (tok->enc != NULL) {
assert(utf8 == NULL);
utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return _PyTokenizer_error_ret(tok);
str = PyBytes_AS_STRING(utf8);
}
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
return _PyTokenizer_error_ret(tok);
}
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;
}
/* Set up tokenizer for string.  Returns a fresh tok_state on success,
   or NULL on allocation/decoding failure. */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL) {
        return NULL;
    }
    char *decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    /* Point every buffer cursor at the decoded string; underflow will
       advance tok->inp one line at a time. */
    tok->buf = decoded;
    tok->cur = decoded;
    tok->inp = decoded;
    tok->end = decoded;
    tok->underflow = &tok_underflow_string;
    return tok;
}