Line data Source code
1 :
2 : /* Tokenizer implementation */
3 :
4 : #define PY_SSIZE_T_CLEAN
5 : #include "Python.h"
6 : #include "pycore_call.h" // _PyObject_CallNoArgs()
7 :
8 : #include <ctype.h>
9 : #include <assert.h>
10 :
11 : #include "tokenizer.h"
12 : #include "errcode.h"
13 :
14 : #include "unicodeobject.h"
15 : #include "bytesobject.h"
16 : #include "fileobject.h"
17 : #include "abstract.h"
18 :
19 : /* Alternate tab spacing */
20 : #define ALTTABSIZE 1
21 :
22 : #define is_potential_identifier_start(c) (\
23 : (c >= 'a' && c <= 'z')\
24 : || (c >= 'A' && c <= 'Z')\
25 : || c == '_'\
26 : || (c >= 128))
27 :
28 : #define is_potential_identifier_char(c) (\
29 : (c >= 'a' && c <= 'z')\
30 : || (c >= 'A' && c <= 'Z')\
31 : || (c >= '0' && c <= '9')\
32 : || c == '_'\
33 : || (c >= 128))
34 :
35 :
36 : /* Don't ever change this -- it would break the portability of Python code */
37 : #define TABSIZE 8
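
/* A worked example of why two tab sizes are kept (a sketch based on the
   indentation code in tok_get below): one leading tab gives col = 8 under
   TABSIZE but altcol = 1 under ALTTABSIZE, while eight leading spaces give
   col = 8 and altcol = 8.  The two indentations compare equal at TABSIZE but
   unequal at ALTTABSIZE, which is how ambiguous tab/space mixes are detected
   and reported via indenterror() as E_TABSPACE. */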
38 :
39 : /* Forward */
40 : static struct tok_state *tok_new(void);
41 : static int tok_nextc(struct tok_state *tok);
42 : static void tok_backup(struct tok_state *tok, int c);
43 : static int syntaxerror(struct tok_state *tok, const char *format, ...);
44 :
45 : /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 : tokenizing. */
47 : static const char* type_comment_prefix = "# type: ";
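
/* Illustrative matches for the prefix scan in tok_get (only consulted when
   tok->type_comments is enabled): "# type: int" and "#type:int" both match,
   because each space in the prefix absorbs any run of spaces or tabs,
   including an empty one, while "# types: int" does not match. */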
48 :
49 : /* Create and initialize a new tok_state structure */
50 :
51 : static struct tok_state *
52 224563 : tok_new(void)
53 : {
54 224563 : struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55 : sizeof(struct tok_state));
56 224563 : if (tok == NULL)
57 0 : return NULL;
58 224563 : tok->buf = tok->cur = tok->inp = NULL;
59 224563 : tok->fp_interactive = 0;
60 224563 : tok->interactive_src_start = NULL;
61 224563 : tok->interactive_src_end = NULL;
62 224563 : tok->start = NULL;
63 224563 : tok->end = NULL;
64 224563 : tok->done = E_OK;
65 224563 : tok->fp = NULL;
66 224563 : tok->input = NULL;
67 224563 : tok->tabsize = TABSIZE;
68 224563 : tok->indent = 0;
69 224563 : tok->indstack[0] = 0;
70 224563 : tok->atbol = 1;
71 224563 : tok->pendin = 0;
72 224563 : tok->prompt = tok->nextprompt = NULL;
73 224563 : tok->lineno = 0;
74 224563 : tok->level = 0;
75 224563 : tok->altindstack[0] = 0;
76 224563 : tok->decoding_state = STATE_INIT;
77 224563 : tok->decoding_erred = 0;
78 224563 : tok->enc = NULL;
79 224563 : tok->encoding = NULL;
80 224563 : tok->cont_line = 0;
81 224563 : tok->filename = NULL;
82 224563 : tok->decoding_readline = NULL;
83 224563 : tok->decoding_buffer = NULL;
84 224563 : tok->type_comments = 0;
85 224563 : tok->async_hacks = 0;
86 224563 : tok->async_def = 0;
87 224563 : tok->async_def_indent = 0;
88 224563 : tok->async_def_nl = 0;
89 224563 : tok->interactive_underflow = IUNDERFLOW_NORMAL;
90 224563 : tok->str = NULL;
91 : #ifdef Py_DEBUG
92 224563 : tok->debug = _Py_GetConfig()->parser_debug;
93 : #endif
94 224563 : return tok;
95 : }
96 :
97 : static char *
98 116446 : new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
99 : {
100 116446 : char* result = (char *)PyMem_Malloc(len + 1);
101 116446 : if (!result) {
102 0 : tok->done = E_NOMEM;
103 0 : return NULL;
104 : }
105 116446 : memcpy(result, s, len);
106 116446 : result[len] = '\0';
107 116446 : return result;
108 : }
109 :
110 : static char *
111 29 : error_ret(struct tok_state *tok) /* XXX */
112 : {
113 29 : tok->decoding_erred = 1;
114 29 : if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
115 3 : PyMem_Free(tok->buf);
116 29 : tok->buf = tok->cur = tok->inp = NULL;
117 29 : tok->start = NULL;
118 29 : tok->end = NULL;
119 29 : tok->done = E_DECODE;
120 29 : return NULL; /* as if it were EOF */
121 : }
122 :
123 :
124 : static const char *
125 666 : get_normal_name(const char *s) /* for utf-8 and latin-1 */
126 : {
127 : char buf[13];
128 : int i;
129 4351 : for (i = 0; i < 12; i++) {
130 4351 : int c = s[i];
131 4351 : if (c == '\0')
132 666 : break;
133 3685 : else if (c == '_')
134 0 : buf[i] = '-';
135 : else
136 3685 : buf[i] = tolower(c);
137 : }
138 666 : buf[i] = '\0';
139 666 : if (strcmp(buf, "utf-8") == 0 ||
140 132 : strncmp(buf, "utf-8-", 6) == 0)
141 534 : return "utf-8";
142 132 : else if (strcmp(buf, "latin-1") == 0 ||
143 118 : strcmp(buf, "iso-8859-1") == 0 ||
144 73 : strcmp(buf, "iso-latin-1") == 0 ||
145 73 : strncmp(buf, "latin-1-", 8) == 0 ||
146 73 : strncmp(buf, "iso-8859-1-", 11) == 0 ||
147 73 : strncmp(buf, "iso-latin-1-", 12) == 0)
148 59 : return "iso-8859-1";
149 : else
150 73 : return s;
151 : }
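
/* Examples of the normalization above: "UTF-8-SIG" and "utf_8" both map to
   "utf-8" ('_' becomes '-', letters are lowercased), "Latin-1" and
   "ISO-8859-1" map to "iso-8859-1", and anything unrecognized (for example
   "cp1252") is returned unchanged.  Only the first 12 characters of the name
   are inspected. */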
152 :
153 : /* Look for a coding spec in S and store a copy in *spec (NULL if none is found); return 1 on success, 0 on a memory error. */
154 :
155 : static int
156 112268 : get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
157 : {
158 : Py_ssize_t i;
159 112268 : *spec = NULL;
160 : /* Coding spec must be in a comment, and that comment must be
161 : * the only statement on the source code line. */
162 112268 : for (i = 0; i < size - 6; i++) {
163 33450 : if (s[i] == '#')
164 4536 : break;
165 28914 : if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
166 28914 : return 1;
167 : }
168 252728 : for (; i < size - 6; i++) { /* XXX inefficient search */
169 170040 : const char* t = s + i;
170 170040 : if (memcmp(t, "coding", 6) == 0) {
171 674 : const char* begin = NULL;
172 674 : t += 6;
173 674 : if (t[0] != ':' && t[0] != '=')
174 8 : continue;
175 : do {
176 1287 : t++;
177 1287 : } while (t[0] == ' ' || t[0] == '\t');
178 :
179 666 : begin = t;
180 4351 : while (Py_ISALNUM(t[0]) ||
181 1342 : t[0] == '-' || t[0] == '_' || t[0] == '.')
182 3685 : t++;
183 :
184 666 : if (begin < t) {
185 666 : char* r = new_string(begin, t - begin, tok);
186 : const char* q;
187 666 : if (!r)
188 0 : return 0;
189 666 : q = get_normal_name(r);
190 666 : if (r != q) {
191 593 : PyMem_Free(r);
192 593 : r = new_string(q, strlen(q), tok);
193 593 : if (!r)
194 0 : return 0;
195 : }
196 666 : *spec = r;
197 666 : break;
198 : }
199 : }
200 : }
201 83354 : return 1;
202 : }
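
/* Illustration of what the search above accepts (cf. PEP 263): the line must
   start, apart from whitespace, with '#', and the first occurrence of
   "coding:" or "coding=" is used.  So "# -*- coding: Latin-1 -*-" yields the
   spec "iso-8859-1" (after get_normal_name) and
   "# vim: set fileencoding=utf-8 :" yields "utf-8".  A line beginning with
   anything else leaves *spec as NULL. */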
203 :
204 : /* Check whether the line contains a coding spec. If it does,
205 : invoke the set_readline function for the new encoding.
206 : The set_readline callback receives the tok_state and the new encoding.
207 : Return 1 on success, 0 on failure. */
208 :
209 : static int
210 112268 : check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
211 : int set_readline(struct tok_state *, const char *))
212 : {
213 : char *cs;
214 112268 : if (tok->cont_line) {
215 : /* It's a continuation line, so it can't be a coding spec. */
216 0 : tok->decoding_state = STATE_NORMAL;
217 0 : return 1;
218 : }
219 112268 : if (!get_coding_spec(line, &cs, size, tok)) {
220 0 : return 0;
221 : }
222 112268 : if (!cs) {
223 : Py_ssize_t i;
224 111626 : for (i = 0; i < size; i++) {
225 110803 : if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
226 : break;
227 105943 : if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
228 : /* Stop checking coding spec after a line containing
229 : * anything except a comment. */
230 105919 : tok->decoding_state = STATE_NORMAL;
231 105919 : break;
232 : }
233 : }
234 111602 : return 1;
235 : }
236 666 : tok->decoding_state = STATE_NORMAL;
237 666 : if (tok->encoding == NULL) {
238 639 : assert(tok->decoding_readline == NULL);
239 639 : if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
240 0 : error_ret(tok);
241 0 : PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
242 0 : PyMem_Free(cs);
243 0 : return 0;
244 : }
245 639 : tok->encoding = cs;
246 : } else { /* then, compare cs with BOM */
247 27 : if (strcmp(tok->encoding, cs) != 0) {
248 20 : error_ret(tok);
249 20 : PyErr_Format(PyExc_SyntaxError,
250 : "encoding problem: %s with BOM", cs);
251 20 : PyMem_Free(cs);
252 20 : return 0;
253 : }
254 7 : PyMem_Free(cs);
255 : }
256 646 : return 1;
257 : }
258 :
259 : /* See whether the file starts with a UTF-8 BOM. If it does, consume it
260 : and record "utf-8" as the encoding (set_readline is not needed, since
261 : the input is already UTF-8). Return 1 on success, 0 on failure. */
262 :
263 : static int
264 109412 : check_bom(int get_char(struct tok_state *),
265 : void unget_char(int, struct tok_state *),
266 : int set_readline(struct tok_state *, const char *),
267 : struct tok_state *tok)
268 : {
269 : int ch1, ch2, ch3;
270 109412 : ch1 = get_char(tok);
271 109412 : tok->decoding_state = STATE_SEEK_CODING;
272 109412 : if (ch1 == EOF) {
273 8 : return 1;
274 109404 : } else if (ch1 == 0xEF) {
275 39 : ch2 = get_char(tok);
276 39 : if (ch2 != 0xBB) {
277 1 : unget_char(ch2, tok);
278 1 : unget_char(ch1, tok);
279 1 : return 1;
280 : }
281 38 : ch3 = get_char(tok);
282 38 : if (ch3 != 0xBF) {
283 2 : unget_char(ch3, tok);
284 2 : unget_char(ch2, tok);
285 2 : unget_char(ch1, tok);
286 2 : return 1;
287 : }
288 : } else {
289 109365 : unget_char(ch1, tok);
290 109365 : return 1;
291 : }
292 36 : if (tok->encoding != NULL)
293 0 : PyMem_Free(tok->encoding);
294 36 : tok->encoding = new_string("utf-8", 5, tok);
295 36 : if (!tok->encoding)
296 0 : return 0;
297 : /* No need to set_readline: input is already utf-8 */
298 36 : return 1;
299 : }
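
/* Example: a file starting with the bytes EF BB BF (the UTF-8 BOM) has those
   three bytes consumed and tok->encoding set to "utf-8"; any other leading
   byte, or an incomplete BOM such as EF 20, is pushed back byte by byte and
   the stream is left as it was. */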
300 :
301 : static int
302 87 : tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
303 87 : assert(tok->fp_interactive);
304 :
305 87 : if (!line) {
306 0 : return 0;
307 : }
308 :
309 87 : Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
310 87 : Py_ssize_t line_size = strlen(line);
311 87 : char last_char = line[line_size > 0 ? line_size - 1 : line_size];
312 87 : if (last_char != '\n') {
313 9 : line_size += 1;
314 : }
315 87 : char* new_str = tok->interactive_src_start;
316 :
317 87 : new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
318 87 : if (!new_str) {
319 0 : if (tok->interactive_src_start) {
320 0 : PyMem_Free(tok->interactive_src_start);
321 : }
322 0 : tok->interactive_src_start = NULL;
323 0 : tok->interactive_src_end = NULL;
324 0 : tok->done = E_NOMEM;
325 0 : return -1;
326 : }
327 87 : strcpy(new_str + current_size, line);
328 87 : if (last_char != '\n') {
329 : /* Last line does not end in \n, fake one */
330 9 : new_str[current_size + line_size - 1] = '\n';
331 9 : new_str[current_size + line_size] = '\0';
332 : }
333 87 : tok->interactive_src_start = new_str;
334 87 : tok->interactive_src_end = new_str + current_size + line_size;
335 87 : return 0;
336 : }
337 :
338 :
339 : /* Read a line of text from TOK's stream into TOK's buffer.
340 : Return 0 on failure, else 1.
341 :
342 : On entry, tok->decoding_buffer will be one of:
343 : 1) NULL: need to call tok->decoding_readline to get a new line
344 : 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
345 : stored the result in tok->decoding_buffer
346 : 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
347 : (in the s buffer) to copy entire contents of the line read
348 : by tok->decoding_readline. tok->decoding_buffer has the overflow.
349 : In this case, tok_readline_recode is called in a loop (with an expanded buffer)
350 : until the buffer ends with a '\n' (or until the end of the file is
351 : reached): see tok_nextc and its calls to tok_reserve_buf.
352 : */
353 :
354 : static int
355 80119 : tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
356 : {
357 80119 : Py_ssize_t cur = tok->cur - tok->buf;
358 80119 : Py_ssize_t oldsize = tok->inp - tok->buf;
359 80119 : Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
360 80119 : if (newsize > tok->end - tok->buf) {
361 3334 : char *newbuf = tok->buf;
362 3334 : Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
363 3334 : Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
364 3334 : Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
365 3334 : newbuf = (char *)PyMem_Realloc(newbuf, newsize);
366 3334 : if (newbuf == NULL) {
367 0 : tok->done = E_NOMEM;
368 0 : return 0;
369 : }
370 3334 : tok->buf = newbuf;
371 3334 : tok->cur = tok->buf + cur;
372 3334 : tok->inp = tok->buf + oldsize;
373 3334 : tok->end = tok->buf + newsize;
374 3334 : tok->start = start < 0 ? NULL : tok->buf + start;
375 3334 : tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
376 3334 : tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
377 : }
378 80119 : return 1;
379 : }
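
/* Growth policy above, by example: with oldsize = 4096 bytes of data in the
   buffer and a request for one more byte, newsize = 4096 + Py_MAX(1, 2048)
   = 6144, so whenever a reallocation is needed the buffer grows by at least
   50%, keeping repeated small reads amortized. */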
380 :
381 : static int
382 23302 : tok_readline_recode(struct tok_state *tok) {
383 : PyObject *line;
384 : const char *buf;
385 : Py_ssize_t buflen;
386 23302 : line = tok->decoding_buffer;
387 23302 : if (line == NULL) {
388 23302 : line = PyObject_CallNoArgs(tok->decoding_readline);
389 23302 : if (line == NULL) {
390 0 : error_ret(tok);
391 0 : goto error;
392 : }
393 : }
394 : else {
395 0 : tok->decoding_buffer = NULL;
396 : }
397 23302 : buf = PyUnicode_AsUTF8AndSize(line, &buflen);
398 23302 : if (buf == NULL) {
399 0 : error_ret(tok);
400 0 : goto error;
401 : }
402 23302 : if (!tok_reserve_buf(tok, buflen + 1)) {
403 0 : goto error;
404 : }
405 23302 : memcpy(tok->inp, buf, buflen);
406 23302 : tok->inp += buflen;
407 23302 : *tok->inp = '\0';
408 23302 : if (tok->fp_interactive &&
409 0 : tok_concatenate_interactive_new_line(tok, buf) == -1) {
410 0 : goto error;
411 : }
412 23302 : Py_DECREF(line);
413 23302 : return 1;
414 0 : error:
415 0 : Py_XDECREF(line);
416 0 : return 0;
417 : }
418 :
419 : /* Set the readline function for TOK to a StreamReader's
420 : readline function. The StreamReader is named ENC.
421 :
422 : This function is called from check_bom and check_coding_spec.
423 :
424 : ENC is usually identical to the future value of tok->encoding,
425 : except for the (currently unsupported) case of UTF-16.
426 :
427 : Return 1 on success, 0 on failure. */
428 :
429 : static int
430 63 : fp_setreadl(struct tok_state *tok, const char* enc)
431 : {
432 : PyObject *readline, *open, *stream;
433 : int fd;
434 : long pos;
435 :
436 63 : fd = fileno(tok->fp);
437 : /* Due to buffering the file offset for fd can be different from the file
438 : * position of tok->fp. If tok->fp was opened in text mode on Windows,
439 : * its file position counts CRLF as one char and can't be directly mapped
440 : * to the file offset for fd. Instead we step back one byte and read to
441 : * the end of line.*/
442 63 : pos = ftell(tok->fp);
443 126 : if (pos == -1 ||
444 63 : lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
445 0 : PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
446 0 : return 0;
447 : }
448 :
449 63 : open = _PyImport_GetModuleAttrString("io", "open");
450 63 : if (open == NULL) {
451 0 : return 0;
452 : }
453 63 : stream = PyObject_CallFunction(open, "isisOOO",
454 : fd, "r", -1, enc, Py_None, Py_None, Py_False);
455 63 : Py_DECREF(open);
456 63 : if (stream == NULL) {
457 0 : return 0;
458 : }
459 :
460 63 : readline = PyObject_GetAttr(stream, &_Py_ID(readline));
461 63 : Py_DECREF(stream);
462 63 : if (readline == NULL) {
463 0 : return 0;
464 : }
465 63 : Py_XSETREF(tok->decoding_readline, readline);
466 :
467 63 : if (pos > 0) {
468 63 : PyObject *bufobj = _PyObject_CallNoArgs(readline);
469 63 : if (bufobj == NULL) {
470 0 : return 0;
471 : }
472 63 : Py_DECREF(bufobj);
473 : }
474 :
475 63 : return 1;
476 : }
477 :
478 : /* Fetch the next byte from TOK. */
479 :
480 804 : static int fp_getc(struct tok_state *tok) {
481 804 : return getc(tok->fp);
482 : }
483 :
484 : /* Unfetch the last byte back into TOK. */
485 :
486 790 : static void fp_ungetc(int c, struct tok_state *tok) {
487 790 : ungetc(c, tok->fp);
488 790 : }
489 :
490 : /* Check whether the bytes at s start a valid
491 : UTF-8 sequence. Return the number of bytes forming
492 : the sequence if yes, 0 if not. */
493 1943970 : static int valid_utf8(const unsigned char* s)
494 : {
495 1943970 : int expected = 0;
496 : int length;
497 1943970 : if (*s < 0x80)
498 : /* single-byte code */
499 1943960 : return 1;
500 6 : if (*s < 0xc0)
501 : /* following byte */
502 2 : return 0;
503 4 : if (*s < 0xE0)
504 3 : expected = 1;
505 1 : else if (*s < 0xF0)
506 0 : expected = 2;
507 1 : else if (*s < 0xF8)
508 0 : expected = 3;
509 : else
510 1 : return 0;
511 3 : length = expected + 1;
512 6 : for (; expected; expected--)
513 3 : if (s[expected] < 0x80 || s[expected] >= 0xC0)
514 0 : return 0;
515 3 : return length;
516 : }
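
/* Examples for the check above: an ASCII byte yields 1; the two-byte sequence
   C3 A9 (U+00E9, 'é') yields 2; a stray continuation byte such as A9 yields
   0; and F8, which would start a 5-byte form that UTF-8 forbids, also
   yields 0. */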
517 :
518 : static int
519 56382 : ensure_utf8(char *line, struct tok_state *tok)
520 : {
521 56382 : int badchar = 0;
522 : unsigned char *c;
523 : int length;
524 2000350 : for (c = (unsigned char *)line; *c; c += length) {
525 1943970 : if (!(length = valid_utf8(c))) {
526 3 : badchar = *c;
527 3 : break;
528 : }
529 : }
530 56382 : if (badchar) {
531 : /* Need to add 1 to the line number, since this line
532 : has not been counted, yet. */
533 3 : PyErr_Format(PyExc_SyntaxError,
534 : "Non-UTF-8 code starting with '\\x%.2x' "
535 : "in file %U on line %i, "
536 : "but no encoding declared; "
537 : "see https://peps.python.org/pep-0263/ for details",
538 3 : badchar, tok->filename, tok->lineno + 1);
539 3 : return 0;
540 : }
541 56379 : return 1;
542 : }
543 :
544 : /* Fetch a byte from TOK, using the string buffer. */
545 :
546 : static int
547 108685 : buf_getc(struct tok_state *tok) {
548 108685 : return Py_CHARMASK(*tok->str++);
549 : }
550 :
551 : /* Unfetch a byte from TOK, using the string buffer. */
552 :
553 : static void
554 108583 : buf_ungetc(int c, struct tok_state *tok) {
555 108583 : tok->str--;
556 108583 : assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
557 108583 : }
558 :
559 : /* Set up the readline function of TOK for the encoding ENC. For the
560 : string-based tokenizer, this just means recording the encoding. */
561 :
562 : static int
563 49 : buf_setreadl(struct tok_state *tok, const char* enc) {
564 49 : tok->enc = enc;
565 49 : return 1;
566 : }
567 :
568 : /* Return a UTF-8 encoded Python bytes object decoded from the
569 : C byte string STR, which is encoded with ENC. */
570 :
571 : static PyObject *
572 94 : translate_into_utf8(const char* str, const char* enc) {
573 : PyObject *utf8;
574 94 : PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
575 94 : if (buf == NULL)
576 6 : return NULL;
577 88 : utf8 = PyUnicode_AsUTF8String(buf);
578 88 : Py_DECREF(buf);
579 88 : return utf8;
580 : }
581 :
582 :
583 : static char *
584 223785 : translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
585 223785 : int skip_next_lf = 0;
586 223785 : size_t needed_length = strlen(s) + 2, final_length;
587 : char *buf, *current;
588 223785 : char c = '\0';
589 223785 : buf = PyMem_Malloc(needed_length);
590 223785 : if (buf == NULL) {
591 0 : tok->done = E_NOMEM;
592 0 : return NULL;
593 : }
594 222216000 : for (current = buf; *s; s++, current++) {
595 221992000 : c = *s;
596 221992000 : if (skip_next_lf) {
597 45980 : skip_next_lf = 0;
598 45980 : if (c == '\n') {
599 45963 : c = *++s;
600 45963 : if (!c)
601 16 : break;
602 : }
603 : }
604 221992000 : if (c == '\r') {
605 45987 : skip_next_lf = 1;
606 45987 : c = '\n';
607 : }
608 221992000 : *current = c;
609 : }
610 : /* If this is exec input, add a newline to the end of the string if
611 : there isn't one already. */
612 223785 : if (exec_input && c != '\n') {
613 127954 : *current = '\n';
614 127954 : current++;
615 : }
616 223785 : *current = '\0';
617 223785 : final_length = current - buf + 1;
618 223785 : if (final_length < needed_length && final_length) {
619 : /* should never fail */
620 95847 : char* result = PyMem_Realloc(buf, final_length);
621 95847 : if (result == NULL) {
622 0 : PyMem_Free(buf);
623 : }
624 95847 : buf = result;
625 : }
626 223785 : return buf;
627 : }
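
/* Examples of the translation above: "a\r\nb\r" becomes "a\nb\n" (CRLF and a
   lone CR both collapse to '\n'), and with exec_input set, "x" becomes "x\n"
   so that exec input always ends in a newline. */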
628 :
629 : /* Decode a byte string STR for use as the buffer of TOK.
630 : Look for encoding declarations inside STR, and record them
631 : inside TOK. */
632 :
633 : static char *
634 108612 : decode_str(const char *input, int single, struct tok_state *tok)
635 : {
636 108612 : PyObject* utf8 = NULL;
637 : char *str;
638 : const char *s;
639 108612 : const char *newl[2] = {NULL, NULL};
640 108612 : int lineno = 0;
641 108612 : tok->input = str = translate_newlines(input, single, tok);
642 108612 : if (str == NULL)
643 0 : return NULL;
644 108612 : tok->enc = NULL;
645 108612 : tok->str = str;
646 108612 : if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
647 0 : return error_ret(tok);
648 108612 : str = tok->str; /* string after BOM if any */
649 108612 : assert(str);
650 108612 : if (tok->enc != NULL) {
651 0 : utf8 = translate_into_utf8(str, tok->enc);
652 0 : if (utf8 == NULL)
653 0 : return error_ret(tok);
654 0 : str = PyBytes_AsString(utf8);
655 : }
656 1793920 : for (s = str;; s++) {
657 1793920 : if (*s == '\0') break;
658 1695980 : else if (*s == '\n') {
659 119267 : assert(lineno < 2);
660 119267 : newl[lineno] = s;
661 119267 : lineno++;
662 119267 : if (lineno == 2) break;
663 : }
664 : }
665 108612 : tok->enc = NULL;
666 : /* need to check line 1 and 2 separately since check_coding_spec
667 : assumes a single line as input */
668 108612 : if (newl[0]) {
669 108599 : if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
670 20 : return NULL;
671 : }
672 108579 : if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
673 2553 : if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
674 : tok, buf_setreadl))
675 0 : return NULL;
676 : }
677 : }
678 108592 : if (tok->enc != NULL) {
679 49 : assert(utf8 == NULL);
680 49 : utf8 = translate_into_utf8(str, tok->enc);
681 49 : if (utf8 == NULL)
682 6 : return error_ret(tok);
683 43 : str = PyBytes_AS_STRING(utf8);
684 : }
685 108586 : assert(tok->decoding_buffer == NULL);
686 108586 : tok->decoding_buffer = utf8; /* CAUTION */
687 108586 : return str;
688 : }
689 :
690 : /* Set up tokenizer for string */
691 :
692 : struct tok_state *
693 108612 : _PyTokenizer_FromString(const char *str, int exec_input)
694 : {
695 108612 : struct tok_state *tok = tok_new();
696 : char *decoded;
697 :
698 108612 : if (tok == NULL)
699 0 : return NULL;
700 108612 : decoded = decode_str(str, exec_input, tok);
701 108612 : if (decoded == NULL) {
702 26 : _PyTokenizer_Free(tok);
703 26 : return NULL;
704 : }
705 :
706 108586 : tok->buf = tok->cur = tok->inp = decoded;
707 108586 : tok->end = decoded;
708 108586 : return tok;
709 : }
710 :
711 : /* Set up tokenizer for UTF-8 string */
712 :
713 : struct tok_state *
714 115121 : _PyTokenizer_FromUTF8(const char *str, int exec_input)
715 : {
716 115121 : struct tok_state *tok = tok_new();
717 : char *translated;
718 115121 : if (tok == NULL)
719 0 : return NULL;
720 115121 : tok->input = translated = translate_newlines(str, exec_input, tok);
721 115121 : if (translated == NULL) {
722 0 : _PyTokenizer_Free(tok);
723 0 : return NULL;
724 : }
725 115121 : tok->decoding_state = STATE_NORMAL;
726 115121 : tok->enc = NULL;
727 115121 : tok->str = translated;
728 115121 : tok->encoding = new_string("utf-8", 5, tok);
729 115121 : if (!tok->encoding) {
730 0 : _PyTokenizer_Free(tok);
731 0 : return NULL;
732 : }
733 :
734 115121 : tok->buf = tok->cur = tok->inp = translated;
735 115121 : tok->end = translated;
736 115121 : return tok;
737 : }
738 :
739 : /* Set up tokenizer for file */
740 :
741 : struct tok_state *
742 830 : _PyTokenizer_FromFile(FILE *fp, const char* enc,
743 : const char *ps1, const char *ps2)
744 : {
745 830 : struct tok_state *tok = tok_new();
746 830 : if (tok == NULL)
747 0 : return NULL;
748 830 : if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
749 0 : _PyTokenizer_Free(tok);
750 0 : return NULL;
751 : }
752 830 : tok->cur = tok->inp = tok->buf;
753 830 : tok->end = tok->buf + BUFSIZ;
754 830 : tok->fp = fp;
755 830 : tok->prompt = ps1;
756 830 : tok->nextprompt = ps2;
757 830 : if (enc != NULL) {
758 : /* Must copy encoding declaration since it
759 : gets copied into the parse tree. */
760 30 : tok->encoding = new_string(enc, strlen(enc), tok);
761 30 : if (!tok->encoding) {
762 0 : _PyTokenizer_Free(tok);
763 0 : return NULL;
764 : }
765 30 : tok->decoding_state = STATE_NORMAL;
766 : }
767 830 : return tok;
768 : }
769 :
770 : /* Free a tok_state structure */
771 :
772 : void
773 224563 : _PyTokenizer_Free(struct tok_state *tok)
774 : {
775 224563 : if (tok->encoding != NULL) {
776 115826 : PyMem_Free(tok->encoding);
777 : }
778 224563 : Py_XDECREF(tok->decoding_readline);
779 224563 : Py_XDECREF(tok->decoding_buffer);
780 224563 : Py_XDECREF(tok->filename);
781 224563 : if (tok->fp != NULL && tok->buf != NULL) {
782 827 : PyMem_Free(tok->buf);
783 : }
784 224563 : if (tok->input) {
785 223733 : PyMem_Free(tok->input);
786 : }
787 224563 : if (tok->interactive_src_start != NULL) {
788 35 : PyMem_Free(tok->interactive_src_start);
789 : }
790 224563 : PyMem_Free(tok);
791 224563 : }
792 :
793 : static int
794 56797 : tok_readline_raw(struct tok_state *tok)
795 : {
796 : do {
797 56797 : if (!tok_reserve_buf(tok, BUFSIZ)) {
798 0 : return 0;
799 : }
800 56797 : char *line = Py_UniversalNewlineFgets(tok->inp,
801 56797 : (int)(tok->end - tok->inp),
802 : tok->fp, NULL);
803 56797 : if (line == NULL) {
804 325 : return 1;
805 : }
806 56507 : if (tok->fp_interactive &&
807 35 : tok_concatenate_interactive_new_line(tok, line) == -1) {
808 0 : return 0;
809 : }
810 56472 : tok->inp = strchr(tok->inp, '\0');
811 56472 : if (tok->inp == tok->buf) {
812 1 : return 0;
813 : }
814 56471 : } while (tok->inp[-1] != '\n');
815 56425 : return 1;
816 : }
817 :
818 : static int
819 8952620 : tok_underflow_string(struct tok_state *tok) {
820 8952620 : char *end = strchr(tok->inp, '\n');
821 8952620 : if (end != NULL) {
822 8754170 : end++;
823 : }
824 : else {
825 198448 : end = strchr(tok->inp, '\0');
826 198448 : if (end == tok->inp) {
827 123990 : tok->done = E_EOF;
828 123990 : return 0;
829 : }
830 : }
831 8828630 : if (tok->start == NULL) {
832 8106030 : tok->buf = tok->cur;
833 : }
834 8828630 : tok->line_start = tok->cur;
835 8828630 : tok->lineno++;
836 8828630 : tok->inp = end;
837 8828630 : return 1;
838 : }
839 :
840 : static int
841 52 : tok_underflow_interactive(struct tok_state *tok) {
842 52 : if (tok->interactive_underflow == IUNDERFLOW_STOP) {
843 0 : tok->done = E_INTERACT_STOP;
844 0 : return 1;
845 : }
846 52 : char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
847 52 : if (newtok != NULL) {
848 52 : char *translated = translate_newlines(newtok, 0, tok);
849 52 : PyMem_Free(newtok);
850 52 : if (translated == NULL) {
851 0 : return 0;
852 : }
853 52 : newtok = translated;
854 : }
855 52 : if (tok->encoding && newtok && *newtok) {
856 : /* Recode to UTF-8 */
857 : Py_ssize_t buflen;
858 : const char* buf;
859 45 : PyObject *u = translate_into_utf8(newtok, tok->encoding);
860 45 : PyMem_Free(newtok);
861 45 : if (u == NULL) {
862 0 : tok->done = E_DECODE;
863 0 : return 0;
864 : }
865 45 : buflen = PyBytes_GET_SIZE(u);
866 45 : buf = PyBytes_AS_STRING(u);
867 45 : newtok = PyMem_Malloc(buflen+1);
868 45 : if (newtok == NULL) {
869 0 : Py_DECREF(u);
870 0 : tok->done = E_NOMEM;
871 0 : return 0;
872 : }
873 45 : strcpy(newtok, buf);
874 45 : Py_DECREF(u);
875 : }
876 104 : if (tok->fp_interactive &&
877 52 : tok_concatenate_interactive_new_line(tok, newtok) == -1) {
878 0 : PyMem_Free(newtok);
879 0 : return 0;
880 : }
881 52 : if (tok->nextprompt != NULL) {
882 52 : tok->prompt = tok->nextprompt;
883 : }
884 52 : if (newtok == NULL) {
885 0 : tok->done = E_INTR;
886 : }
887 52 : else if (*newtok == '\0') {
888 7 : PyMem_Free(newtok);
889 7 : tok->done = E_EOF;
890 : }
891 45 : else if (tok->start != NULL) {
892 20 : Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
893 20 : size_t size = strlen(newtok);
894 20 : tok->lineno++;
895 20 : if (!tok_reserve_buf(tok, size + 1)) {
896 0 : PyMem_Free(tok->buf);
897 0 : tok->buf = NULL;
898 0 : PyMem_Free(newtok);
899 0 : return 0;
900 : }
901 20 : memcpy(tok->cur, newtok, size + 1);
902 20 : PyMem_Free(newtok);
903 20 : tok->inp += size;
904 20 : tok->multi_line_start = tok->buf + cur_multi_line_start;
905 : }
906 : else {
907 25 : tok->lineno++;
908 25 : PyMem_Free(tok->buf);
909 25 : tok->buf = newtok;
910 25 : tok->cur = tok->buf;
911 25 : tok->line_start = tok->buf;
912 25 : tok->inp = strchr(tok->buf, '\0');
913 25 : tok->end = tok->inp + 1;
914 : }
915 52 : if (tok->done != E_OK) {
916 7 : if (tok->prompt != NULL) {
917 7 : PySys_WriteStderr("\n");
918 : }
919 7 : return 0;
920 : }
921 45 : return 1;
922 : }
923 :
924 : static int
925 80053 : tok_underflow_file(struct tok_state *tok) {
926 80053 : if (tok->start == NULL) {
927 69569 : tok->cur = tok->inp = tok->buf;
928 : }
929 80053 : if (tok->decoding_state == STATE_INIT) {
930 : /* We have not yet determined the encoding.
931 : If an encoding is found, use the file-pointer
932 : reader functions from now on. */
933 800 : if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
934 0 : error_ret(tok);
935 0 : return 0;
936 : }
937 800 : assert(tok->decoding_state != STATE_INIT);
938 : }
939 : /* Read until '\n' or EOF */
940 80053 : if (tok->decoding_readline != NULL) {
941 : /* We already have a codec associated with this input. */
942 23302 : if (!tok_readline_recode(tok)) {
943 0 : return 0;
944 : }
945 : }
946 : else {
947 : /* We want a 'raw' read. */
948 56751 : if (!tok_readline_raw(tok)) {
949 1 : return 0;
950 : }
951 : }
952 80052 : if (tok->inp == tok->cur) {
953 335 : tok->done = E_EOF;
954 335 : return 0;
955 : }
956 79717 : if (tok->inp[-1] != '\n') {
957 : /* Last line does not end in \n, fake one */
958 53 : *tok->inp++ = '\n';
959 53 : *tok->inp = '\0';
960 : }
961 :
962 79717 : tok->lineno++;
963 79717 : if (tok->decoding_state != STATE_NORMAL) {
964 1350 : if (tok->lineno > 2) {
965 234 : tok->decoding_state = STATE_NORMAL;
966 : }
967 1116 : else if (!check_coding_spec(tok->cur, strlen(tok->cur),
968 : tok, fp_setreadl))
969 : {
970 0 : return 0;
971 : }
972 : }
973 : /* The default encoding is UTF-8, so make sure we don't have any
974 : non-UTF-8 sequences in it. */
975 79717 : if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
976 3 : error_ret(tok);
977 3 : return 0;
978 : }
979 79714 : assert(tok->done == E_OK);
980 79714 : return tok->done == E_OK;
981 : }
982 :
983 : #if defined(Py_DEBUG)
984 : static void
985 21 : print_escape(FILE *f, const char *s, Py_ssize_t size)
986 : {
987 21 : if (s == NULL) {
988 0 : fputs("NULL", f);
989 0 : return;
990 : }
991 21 : putc('"', f);
992 810 : while (size-- > 0) {
993 789 : unsigned char c = *s++;
994 789 : switch (c) {
995 18 : case '\n': fputs("\\n", f); break;
996 0 : case '\r': fputs("\\r", f); break;
997 0 : case '\t': fputs("\\t", f); break;
998 0 : case '\f': fputs("\\f", f); break;
999 0 : case '\'': fputs("\\'", f); break;
1000 0 : case '"': fputs("\\\"", f); break;
1001 771 : default:
1002 771 : if (0x20 <= c && c <= 0x7f)
1003 771 : putc(c, f);
1004 : else
1005 0 : fprintf(f, "\\x%02x", c);
1006 : }
1007 : }
1008 21 : putc('"', f);
1009 : }
1010 : #endif
1011 :
1012 : /* Get next char, updating state; error code goes into tok->done */
1013 :
1014 : static int
1015 318271000 : tok_nextc(struct tok_state *tok)
1016 : {
1017 : int rc;
1018 : for (;;) {
1019 318271000 : if (tok->cur != tok->inp) {
1020 308996000 : return Py_CHARMASK(*tok->cur++); /* Fast path */
1021 : }
1022 9274950 : if (tok->done != E_OK) {
1023 242228 : return EOF;
1024 : }
1025 9032720 : if (tok->fp == NULL) {
1026 8952620 : rc = tok_underflow_string(tok);
1027 : }
1028 80105 : else if (tok->prompt != NULL) {
1029 52 : rc = tok_underflow_interactive(tok);
1030 : }
1031 : else {
1032 80053 : rc = tok_underflow_file(tok);
1033 : }
1034 : #if defined(Py_DEBUG)
1035 9032720 : if (tok->debug) {
1036 21 : fprintf(stderr, "line[%d] = ", tok->lineno);
1037 21 : print_escape(stderr, tok->cur, tok->inp - tok->cur);
1038 21 : fprintf(stderr, " tok->done = %d\n", tok->done);
1039 : }
1040 : #endif
1041 9032720 : if (!rc) {
1042 124336 : tok->cur = tok->inp;
1043 124336 : return EOF;
1044 : }
1045 8908380 : tok->line_start = tok->cur;
1046 : }
1047 : Py_UNREACHABLE();
1048 : }
1049 :
1050 : /* Back-up one character */
1051 :
1052 : static void
1053 84640500 : tok_backup(struct tok_state *tok, int c)
1054 : {
1055 84640500 : if (c != EOF) {
1056 84398300 : if (--tok->cur < tok->buf) {
1057 0 : Py_FatalError("tokenizer beginning of buffer");
1058 : }
1059 84398300 : if ((int)(unsigned char)*tok->cur != c) {
1060 0 : Py_FatalError("tok_backup: wrong character");
1061 : }
1062 : }
1063 84640500 : }
1064 :
1065 : static int
1066 232 : _syntaxerror_range(struct tok_state *tok, const char *format,
1067 : int col_offset, int end_col_offset,
1068 : va_list vargs)
1069 : {
1070 : PyObject *errmsg, *errtext, *args;
1071 232 : errmsg = PyUnicode_FromFormatV(format, vargs);
1072 232 : if (!errmsg) {
1073 0 : goto error;
1074 : }
1075 :
1076 232 : errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1077 : "replace");
1078 232 : if (!errtext) {
1079 0 : goto error;
1080 : }
1081 :
1082 232 : if (col_offset == -1) {
1083 219 : col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1084 : }
1085 232 : if (end_col_offset == -1) {
1086 219 : end_col_offset = col_offset;
1087 : }
1088 :
1089 232 : Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1090 232 : if (line_len != tok->cur - tok->line_start) {
1091 156 : Py_DECREF(errtext);
1092 156 : errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1093 : "replace");
1094 : }
1095 232 : if (!errtext) {
1096 0 : goto error;
1097 : }
1098 :
1099 232 : args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1100 : col_offset, errtext, tok->lineno, end_col_offset);
1101 232 : if (args) {
1102 232 : PyErr_SetObject(PyExc_SyntaxError, args);
1103 232 : Py_DECREF(args);
1104 : }
1105 :
1106 0 : error:
1107 232 : Py_XDECREF(errmsg);
1108 232 : tok->done = E_ERROR;
1109 232 : return ERRORTOKEN;
1110 : }
1111 :
1112 : static int
1113 219 : syntaxerror(struct tok_state *tok, const char *format, ...)
1114 : {
1115 : va_list vargs;
1116 219 : va_start(vargs, format);
1117 219 : int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1118 219 : va_end(vargs);
1119 219 : return ret;
1120 : }
1121 :
1122 : static int
1123 13 : syntaxerror_known_range(struct tok_state *tok,
1124 : int col_offset, int end_col_offset,
1125 : const char *format, ...)
1126 : {
1127 : va_list vargs;
1128 13 : va_start(vargs, format);
1129 13 : int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1130 13 : va_end(vargs);
1131 13 : return ret;
1132 : }
1133 :
1134 :
1135 :
1136 : static int
1137 2 : indenterror(struct tok_state *tok)
1138 : {
1139 2 : tok->done = E_TABSPACE;
1140 2 : tok->cur = tok->inp;
1141 2 : return ERRORTOKEN;
1142 : }
1143 :
1144 : static int
1145 130 : parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1146 : {
1147 : PyObject *errmsg;
1148 : va_list vargs;
1149 130 : va_start(vargs, format);
1150 130 : errmsg = PyUnicode_FromFormatV(format, vargs);
1151 130 : va_end(vargs);
1152 130 : if (!errmsg) {
1153 0 : goto error;
1154 : }
1155 :
1156 130 : if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
1157 : tok->lineno, NULL, NULL) < 0) {
1158 64 : if (PyErr_ExceptionMatches(category)) {
1159 : /* Replace the warning exception with a SyntaxError
1160 : to get a more accurate error report */
1161 64 : PyErr_Clear();
1162 64 : syntaxerror(tok, "%U", errmsg);
1163 : }
1164 64 : goto error;
1165 : }
1166 66 : Py_DECREF(errmsg);
1167 66 : return 0;
1168 :
1169 64 : error:
1170 64 : Py_XDECREF(errmsg);
1171 64 : tok->done = E_ERROR;
1172 64 : return -1;
1173 : }
1174 :
1175 : static int
1176 91 : lookahead(struct tok_state *tok, const char *test)
1177 : {
1178 91 : const char *s = test;
1179 91 : int res = 0;
1180 158 : while (1) {
1181 249 : int c = tok_nextc(tok);
1182 249 : if (*s == 0) {
1183 82 : res = !is_potential_identifier_char(c);
1184 : }
1185 167 : else if (c == *s) {
1186 158 : s++;
1187 158 : continue;
1188 : }
1189 :
1190 91 : tok_backup(tok, c);
1191 249 : while (s != test) {
1192 158 : tok_backup(tok, *--s);
1193 : }
1194 91 : return res;
1195 : }
1196 : }
1197 :
1198 : static int
1199 2913270 : verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1200 : {
1201 : /* Emit a syntax warning only if the numeric literal is immediately
1202 : * followed by one of the keywords which can occur after a numeric literal
1203 : * in valid code: "and", "else", "for", "if", "in", "is", "not" and "or".
1204 : * This allows existing valid code to be deprecated gradually, without
1205 : * emitting a warning before the error in most cases of invalid numeric
1206 : * literals (which would be confusing and break existing tests).
1207 : * Raise a syntax error with a slightly better message than plain
1208 : * "invalid syntax" if the numeric literal is immediately followed by
1209 : * any other keyword or identifier.
1210 : */
1211 2913270 : int r = 0;
1212 2913270 : if (c == 'a') {
1213 14 : r = lookahead(tok, "nd");
1214 : }
1215 2913250 : else if (c == 'e') {
1216 24 : r = lookahead(tok, "lse");
1217 : }
1218 2913230 : else if (c == 'f') {
1219 14 : r = lookahead(tok, "or");
1220 : }
1221 2913220 : else if (c == 'i') {
1222 48 : int c2 = tok_nextc(tok);
1223 48 : if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1224 48 : r = 1;
1225 : }
1226 48 : tok_backup(tok, c2);
1227 : }
1228 2913170 : else if (c == 'o') {
1229 22 : r = lookahead(tok, "r");
1230 : }
1231 2913140 : else if (c == 'n') {
1232 17 : r = lookahead(tok, "ot");
1233 : }
1234 2913270 : if (r) {
1235 130 : tok_backup(tok, c);
1236 130 : if (parser_warn(tok, PyExc_SyntaxWarning,
1237 : "invalid %s literal", kind))
1238 : {
1239 64 : return 0;
1240 : }
1241 66 : tok_nextc(tok);
1242 : }
1243 : else /* In future releases, only error will remain. */
1244 2913140 : if (is_potential_identifier_char(c)) {
1245 24 : tok_backup(tok, c);
1246 24 : syntaxerror(tok, "invalid %s literal", kind);
1247 24 : return 0;
1248 : }
1249 2913180 : return 1;
1250 : }
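
/* Examples for the check above: "1if x else 2" takes the 'i' branch (the next
   character is 'f'), so a SyntaxWarning "invalid decimal literal" is emitted
   and tokenization continues, whereas "1abc" fails every lookahead and is
   rejected immediately with a SyntaxError carrying the same message.  If
   warnings are turned into errors, the first case becomes a SyntaxError too
   (see parser_warn above). */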
1251 :
1252 : /* Verify that the identifier follows PEP 3131.
1253 : All identifier strings are guaranteed to be "ready" unicode objects.
1254 : */
1255 : static int
1256 214 : verify_identifier(struct tok_state *tok)
1257 : {
1258 : PyObject *s;
1259 214 : if (tok->decoding_erred)
1260 0 : return 0;
1261 214 : s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1262 214 : if (s == NULL) {
1263 4 : if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1264 4 : tok->done = E_DECODE;
1265 : }
1266 : else {
1267 0 : tok->done = E_ERROR;
1268 : }
1269 4 : return 0;
1270 : }
1271 210 : Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1272 210 : if (invalid < 0) {
1273 0 : Py_DECREF(s);
1274 0 : tok->done = E_ERROR;
1275 0 : return 0;
1276 : }
1277 210 : assert(PyUnicode_GET_LENGTH(s) > 0);
1278 210 : if (invalid < PyUnicode_GET_LENGTH(s)) {
1279 10 : Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1280 10 : if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1281 : /* Determine the offset in UTF-8 encoded input */
1282 0 : Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1283 0 : if (s != NULL) {
1284 0 : Py_SETREF(s, PyUnicode_AsUTF8String(s));
1285 : }
1286 0 : if (s == NULL) {
1287 0 : tok->done = E_ERROR;
1288 0 : return 0;
1289 : }
1290 0 : tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1291 : }
1292 10 : Py_DECREF(s);
1293 : // PyUnicode_FromFormatV() does not support %X
1294 : char hex[9];
1295 10 : (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1296 10 : if (Py_UNICODE_ISPRINTABLE(ch)) {
1297 7 : syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1298 : }
1299 : else {
1300 3 : syntaxerror(tok, "invalid non-printable character U+%s", hex);
1301 : }
1302 10 : return 0;
1303 : }
1304 200 : Py_DECREF(s);
1305 200 : return 1;
1306 : }
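
/* Example: for the input "x€ = 1" the identifier loop in tok_get consumes the
   non-ASCII bytes, _PyUnicode_ScanIdentifier reports the offending code
   point, and the error raised here is "invalid character '€' (U+20AC)"
   (or the "invalid non-printable character" variant for non-printable code
   points). */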
1307 :
1308 : static int
1309 4658080 : tok_decimal_tail(struct tok_state *tok)
1310 : {
1311 : int c;
1312 :
1313 1871 : while (1) {
1314 : do {
1315 4658080 : c = tok_nextc(tok);
1316 4658080 : } while (isdigit(c));
1317 2045350 : if (c != '_') {
1318 2043450 : break;
1319 : }
1320 1898 : c = tok_nextc(tok);
1321 1898 : if (!isdigit(c)) {
1322 27 : tok_backup(tok, c);
1323 27 : syntaxerror(tok, "invalid decimal literal");
1324 27 : return 0;
1325 : }
1326 : }
1327 2043450 : return c;
1328 : }
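
/* Underscore handling above, by example: "1_000" is consumed as a single run
   of digits, while "1__0" or a trailing "1_" stop here with
   "invalid decimal literal", because each underscore must be followed by
   another digit. */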
1329 :
1330 : /* Get next token, after space stripping etc. */
1331 :
1332 : static inline int
1333 4489 : tok_continuation_line(struct tok_state *tok) {
1334 4489 : int c = tok_nextc(tok);
1335 4489 : if (c != '\n') {
1336 7 : tok->done = E_LINECONT;
1337 7 : return -1;
1338 : }
1339 4482 : c = tok_nextc(tok);
1340 4482 : if (c == EOF) {
1341 11 : tok->done = E_EOF;
1342 11 : tok->cur = tok->inp;
1343 11 : return -1;
1344 : } else {
1345 4471 : tok_backup(tok, c);
1346 : }
1347 4471 : return c;
1348 : }
1349 :
1350 : static int
1351 39583700 : tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1352 : {
1353 : int c;
1354 : int blankline, nonascii;
1355 :
1356 39583700 : *p_start = *p_end = NULL;
1357 44376100 : nextline:
1358 44376100 : tok->start = NULL;
1359 44376100 : blankline = 0;
1360 :
1361 : /* Get indentation level */
1362 44376100 : if (tok->atbol) {
1363 8225190 : int col = 0;
1364 8225190 : int altcol = 0;
1365 8225190 : tok->atbol = 0;
1366 8225190 : int cont_line_col = 0;
1367 : for (;;) {
1368 48308800 : c = tok_nextc(tok);
1369 48308800 : if (c == ' ') {
1370 40082400 : col++, altcol++;
1371 : }
1372 8226470 : else if (c == '\t') {
1373 915 : col = (col / tok->tabsize + 1) * tok->tabsize;
1374 915 : altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1375 : }
1376 8225550 : else if (c == '\014') {/* Control-L (formfeed) */
1377 334 : col = altcol = 0; /* For Emacs users */
1378 : }
1379 8225220 : else if (c == '\\') {
1380 : // Indentation cannot be split over multiple physical lines
1381 : // using backslashes. This means that if we found a backslash
1382 : // preceded by whitespace, **the first one we find** determines
1383 : // the level of indentation of whatever comes next.
1384 31 : cont_line_col = cont_line_col ? cont_line_col : col;
1385 31 : if ((c = tok_continuation_line(tok)) == -1) {
1386 2 : return ERRORTOKEN;
1387 : }
1388 : }
1389 : else {
1390 8225190 : break;
1391 : }
1392 : }
1393 8225190 : tok_backup(tok, c);
1394 8225190 : if (c == '#' || c == '\n') {
1395 : /* Lines with only whitespace and/or comments
1396 : shouldn't affect the indentation and are
1397 : not passed to the parser as NEWLINE tokens,
1398 : except *totally* empty lines in interactive
1399 : mode, which signal the end of a command group. */
1400 3628260 : if (col == 0 && c == '\n' && tok->prompt != NULL) {
1401 2 : blankline = 0; /* Let it through */
1402 : }
1403 3628260 : else if (tok->prompt != NULL && tok->lineno == 1) {
1404 : /* In interactive mode, if the first line contains
1405 : only spaces and/or a comment, let it through. */
1406 0 : blankline = 0;
1407 0 : col = altcol = 0;
1408 : }
1409 : else {
1410 3628260 : blankline = 1; /* Ignore completely */
1411 : }
1412 : /* We can't jump back right here since we still
1413 : may need to skip to the end of a comment */
1414 : }
1415 8225190 : if (!blankline && tok->level == 0) {
1416 3432800 : col = cont_line_col ? cont_line_col : col;
1417 3432800 : altcol = cont_line_col ? cont_line_col : altcol;
1418 3432800 : if (col == tok->indstack[tok->indent]) {
1419 : /* No change */
1420 1645700 : if (altcol != tok->altindstack[tok->indent]) {
1421 2 : return indenterror(tok);
1422 : }
1423 : }
1424 1787110 : else if (col > tok->indstack[tok->indent]) {
1425 : /* Indent -- always one */
1426 997324 : if (tok->indent+1 >= MAXINDENT) {
1427 0 : tok->done = E_TOODEEP;
1428 0 : tok->cur = tok->inp;
1429 0 : return ERRORTOKEN;
1430 : }
1431 997324 : if (altcol <= tok->altindstack[tok->indent]) {
1432 0 : return indenterror(tok);
1433 : }
1434 997324 : tok->pendin++;
1435 997324 : tok->indstack[++tok->indent] = col;
1436 997324 : tok->altindstack[tok->indent] = altcol;
1437 : }
1438 : else /* col < tok->indstack[tok->indent] */ {
1439 : /* Dedent -- any number, must be consistent */
1440 1787070 : while (tok->indent > 0 &&
1441 1652600 : col < tok->indstack[tok->indent]) {
1442 997281 : tok->pendin--;
1443 997281 : tok->indent--;
1444 : }
1445 789785 : if (col != tok->indstack[tok->indent]) {
1446 6 : tok->done = E_DEDENT;
1447 6 : tok->cur = tok->inp;
1448 6 : return ERRORTOKEN;
1449 : }
1450 789779 : if (altcol != tok->altindstack[tok->indent]) {
1451 0 : return indenterror(tok);
1452 : }
1453 : }
1454 : }
1455 : }
1456 :
1457 44376000 : tok->start = tok->cur;
1458 :
1459 : /* Return pending indents/dedents */
1460 44376000 : if (tok->pendin != 0) {
1461 1994610 : if (tok->pendin < 0) {
1462 997282 : tok->pendin++;
1463 997282 : return DEDENT;
1464 : }
1465 : else {
1466 997324 : tok->pendin--;
1467 997324 : return INDENT;
1468 : }
1469 : }
1470 :
1471 : /* Peek ahead at the next character */
1472 42381400 : c = tok_nextc(tok);
1473 42381400 : tok_backup(tok, c);
1474 : /* Check if we are closing an async function */
1475 42381400 : if (tok->async_def
1476 141 : && !blankline
1477 : /* Due to some implementation artifacts of type comments,
1478 : * a TYPE_COMMENT at the start of a function won't set an
1479 : * indentation level and it will produce a NEWLINE after it.
1480 : * To avoid spuriously ending an async function due to this,
1481 : * wait until we have some non-newline char in front of us. */
1482 135 : && c != '\n'
1483 114 : && tok->level == 0
1484 : /* There was a NEWLINE after ASYNC DEF,
1485 : so we're past the signature. */
1486 75 : && tok->async_def_nl
1487 : /* Current indentation level is less than where
1488 : the async function was defined */
1489 36 : && tok->async_def_indent >= tok->indent)
1490 : {
1491 9 : tok->async_def = 0;
1492 9 : tok->async_def_indent = 0;
1493 9 : tok->async_def_nl = 0;
1494 : }
1495 :
1496 42381400 : again:
1497 42385900 : tok->start = NULL;
1498 : /* Skip spaces */
1499 : do {
1500 52004300 : c = tok_nextc(tok);
1501 52004300 : } while (c == ' ' || c == '\t' || c == '\014');
1502 :
1503 : /* Set start of current token */
1504 42385900 : tok->start = tok->cur - 1;
1505 :
1506 : /* Skip comment, unless it's a type comment */
1507 42385900 : if (c == '#') {
1508 : const char *prefix, *p, *type_start;
1509 :
1510 22033900 : while (c != EOF && c != '\n') {
1511 21228300 : c = tok_nextc(tok);
1512 : }
1513 :
1514 805673 : if (tok->type_comments) {
1515 611 : p = tok->start;
1516 611 : prefix = type_comment_prefix;
1517 5499 : while (*prefix && p < tok->cur) {
1518 4888 : if (*prefix == ' ') {
1519 2443 : while (*p == ' ' || *p == '\t') {
1520 1221 : p++;
1521 : }
1522 3666 : } else if (*prefix == *p) {
1523 3666 : p++;
1524 : } else {
1525 0 : break;
1526 : }
1527 :
1528 4888 : prefix++;
1529 : }
1530 :
1531 : /* This is a type comment if we matched all of type_comment_prefix. */
1532 611 : if (!*prefix) {
1533 611 : int is_type_ignore = 1;
1534 611 : const char *ignore_end = p + 6;
1535 611 : tok_backup(tok, c); /* don't eat the newline or EOF */
1536 :
1537 611 : type_start = p;
1538 :
1539 : /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1540 : * or anything ASCII and non-alphanumeric. */
1541 611 : is_type_ignore = (
1542 165 : tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1543 836 : && !(tok->cur > ignore_end
1544 60 : && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1545 :
1546 611 : if (is_type_ignore) {
1547 74 : *p_start = ignore_end;
1548 74 : *p_end = tok->cur;
1549 :
1550 : /* If this type ignore is the only thing on the line, consume the newline also. */
1551 74 : if (blankline) {
1552 0 : tok_nextc(tok);
1553 0 : tok->atbol = 1;
1554 : }
1555 74 : return TYPE_IGNORE;
1556 : } else {
1557 537 : *p_start = type_start; /* after type_comment_prefix */
1558 537 : *p_end = tok->cur;
1559 537 : return TYPE_COMMENT;
1560 : }
1561 : }
1562 : }
1563 : }
1564 :
1565 42385300 : if (tok->done == E_INTERACT_STOP) {
1566 0 : return ENDMARKER;
1567 : }
1568 :
1569 : /* Check for EOF and errors now */
1570 42385300 : if (c == EOF) {
1571 124369 : if (tok->level) {
1572 197 : return ERRORTOKEN;
1573 : }
1574 124172 : return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1575 : }
1576 :
1577 : /* Identifier (most frequent token!) */
1578 42260900 : nonascii = 0;
1579 42260900 : if (is_potential_identifier_start(c)) {
1580 : /* Process the various legal combinations of b"", r"", u"", and f"". */
1581 12536100 : int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1582 : while (1) {
1583 14714200 : if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1584 764889 : saw_b = 1;
1585 : /* Since this is a backwards compatibility support literal we don't
1586 : want to support it in arbitrary order like byte literals. */
1587 13949300 : else if (!(saw_b || saw_u || saw_r || saw_f)
1588 11771400 : && (c == 'u'|| c == 'U')) {
1589 130866 : saw_u = 1;
1590 : }
1591 : /* ur"" and ru"" are not supported */
1592 13818500 : else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1593 774232 : saw_r = 1;
1594 : }
1595 13044200 : else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1596 599208 : saw_f = 1;
1597 : }
1598 : else {
1599 : break;
1600 : }
1601 2269200 : c = tok_nextc(tok);
1602 2269200 : if (c == '"' || c == '\'') {
1603 91048 : goto letter_quote;
1604 : }
1605 : }
1606 79156100 : while (is_potential_identifier_char(c)) {
1607 66711000 : if (c >= 128) {
1608 1999 : nonascii = 1;
1609 : }
1610 66711000 : c = tok_nextc(tok);
1611 : }
1612 12445000 : tok_backup(tok, c);
1613 12445000 : if (nonascii && !verify_identifier(tok)) {
1614 14 : return ERRORTOKEN;
1615 : }
1616 :
1617 12445000 : *p_start = tok->start;
1618 12445000 : *p_end = tok->cur;
1619 :
1620 : /* async/await parsing block. */
1621 12445000 : if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1622 : /* May be an 'async' or 'await' token. For Python 3.7 or
1623 : later we recognize them unconditionally. For Python
1624 : 3.5 or 3.6 we recognize 'async' in front of 'def', and
1625 : either one inside of 'async def'. (Technically we
1626 : shouldn't recognize these at all for 3.4 or earlier,
1627 : but there's no *valid* Python 3.4 code that would be
1628 : rejected, and async functions will be rejected in a
1629 : later phase.) */
1630 22523 : if (!tok->async_hacks || tok->async_def) {
1631 : /* Always recognize the keywords. */
1632 22508 : if (memcmp(tok->start, "async", 5) == 0) {
1633 4306 : return ASYNC;
1634 : }
1635 18202 : if (memcmp(tok->start, "await", 5) == 0) {
1636 2426 : return AWAIT;
1637 : }
1638 : }
1639 15 : else if (memcmp(tok->start, "async", 5) == 0) {
1640 : /* The current token is 'async'.
1641 : Look ahead one token to see if that is 'def'. */
1642 :
1643 : struct tok_state ahead_tok;
1644 12 : const char *ahead_tok_start = NULL;
1645 12 : const char *ahead_tok_end = NULL;
1646 : int ahead_tok_kind;
1647 :
1648 12 : memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1649 12 : ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1650 : &ahead_tok_end);
1651 :
1652 12 : if (ahead_tok_kind == NAME
1653 9 : && ahead_tok.cur - ahead_tok.start == 3
1654 9 : && memcmp(ahead_tok.start, "def", 3) == 0)
1655 : {
1656 : /* The next token is going to be 'def', so instead of
1657 : returning a plain NAME token, return ASYNC. */
1658 9 : tok->async_def_indent = tok->indent;
1659 9 : tok->async_def = 1;
1660 9 : return ASYNC;
1661 : }
1662 : }
1663 : }
1664 :
1665 12438300 : return NAME;
1666 : }
1667 :
1668 : /* Newline */
1669 29724800 : if (c == '\n') {
1670 8100170 : tok->atbol = 1;
1671 8100170 : if (blankline || tok->level > 0) {
1672 4792350 : goto nextline;
1673 : }
1674 3307820 : *p_start = tok->start;
1675 3307820 : *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1676 3307820 : tok->cont_line = 0;
1677 3307820 : if (tok->async_def) {
1678 : /* We're somewhere inside an 'async def' function, and
1679 : we've encountered a NEWLINE after its signature. */
1680 21 : tok->async_def_nl = 1;
1681 : }
1682 3307820 : return NEWLINE;
1683 : }
1684 :
1685 : /* Period or number starting with period? */
1686 21624700 : if (c == '.') {
1687 2181670 : c = tok_nextc(tok);
1688 2181670 : if (isdigit(c)) {
1689 168 : goto fraction;
1690 2181500 : } else if (c == '.') {
1691 5886 : c = tok_nextc(tok);
1692 5886 : if (c == '.') {
1693 5233 : *p_start = tok->start;
1694 5233 : *p_end = tok->cur;
1695 5233 : return ELLIPSIS;
1696 : }
1697 : else {
1698 653 : tok_backup(tok, c);
1699 : }
1700 653 : tok_backup(tok, '.');
1701 : }
1702 : else {
1703 2175620 : tok_backup(tok, c);
1704 : }
1705 2176270 : *p_start = tok->start;
1706 2176270 : *p_end = tok->cur;
1707 2176270 : return DOT;
1708 : }
1709 :
1710 : /* Number */
1711 19443000 : if (isdigit(c)) {
1712 2913200 : if (c == '0') {
1713 : /* Hex, octal or binary -- maybe. */
1714 898262 : c = tok_nextc(tok);
1715 898262 : if (c == 'x' || c == 'X') {
1716 : /* Hex */
1717 96464 : c = tok_nextc(tok);
1718 : do {
1719 96570 : if (c == '_') {
1720 110 : c = tok_nextc(tok);
1721 : }
1722 96570 : if (!isxdigit(c)) {
1723 16 : tok_backup(tok, c);
1724 16 : return syntaxerror(tok, "invalid hexadecimal literal");
1725 : }
1726 : do {
1727 518134 : c = tok_nextc(tok);
1728 518134 : } while (isxdigit(c));
1729 96554 : } while (c == '_');
1730 96448 : if (!verify_end_of_number(tok, c, "hexadecimal")) {
1731 12 : return ERRORTOKEN;
1732 : }
1733 : }
1734 801798 : else if (c == 'o' || c == 'O') {
1735 : /* Octal */
1736 2044 : c = tok_nextc(tok);
1737 : do {
1738 2054 : if (c == '_') {
1739 12 : c = tok_nextc(tok);
1740 : }
1741 2054 : if (c < '0' || c >= '8') {
1742 12 : if (isdigit(c)) {
1743 5 : return syntaxerror(tok,
1744 : "invalid digit '%c' in octal literal", c);
1745 : }
1746 : else {
1747 7 : tok_backup(tok, c);
1748 7 : return syntaxerror(tok, "invalid octal literal");
1749 : }
1750 : }
1751 : do {
1752 8450 : c = tok_nextc(tok);
1753 8450 : } while ('0' <= c && c < '8');
1754 2042 : } while (c == '_');
1755 2032 : if (isdigit(c)) {
1756 3 : return syntaxerror(tok,
1757 : "invalid digit '%c' in octal literal", c);
1758 : }
1759 2029 : if (!verify_end_of_number(tok, c, "octal")) {
1760 11 : return ERRORTOKEN;
1761 : }
1762 : }
1763 799754 : else if (c == 'b' || c == 'B') {
1764 : /* Binary */
1765 387 : c = tok_nextc(tok);
1766 : do {
1767 413 : if (c == '_') {
1768 28 : c = tok_nextc(tok);
1769 : }
1770 413 : if (c != '0' && c != '1') {
1771 12 : if (isdigit(c)) {
1772 5 : return syntaxerror(tok,
1773 : "invalid digit '%c' in binary literal", c);
1774 : }
1775 : else {
1776 7 : tok_backup(tok, c);
1777 7 : return syntaxerror(tok, "invalid binary literal");
1778 : }
1779 : }
1780 : do {
1781 4927 : c = tok_nextc(tok);
1782 4927 : } while (c == '0' || c == '1');
1783 401 : } while (c == '_');
1784 375 : if (isdigit(c)) {
1785 2 : return syntaxerror(tok,
1786 : "invalid digit '%c' in binary literal", c);
1787 : }
1788 373 : if (!verify_end_of_number(tok, c, "binary")) {
1789 11 : return ERRORTOKEN;
1790 : }
1791 : }
1792 : else {
1793 799367 : int nonzero = 0;
1794 : /* maybe old-style octal; c is first char of it */
1795 : /* in any case, allow '0' as a literal */
1796 : while (1) {
1797 799574 : if (c == '_') {
1798 13 : c = tok_nextc(tok);
1799 13 : if (!isdigit(c)) {
1800 4 : tok_backup(tok, c);
1801 4 : return syntaxerror(tok, "invalid decimal literal");
1802 : }
1803 : }
1804 799570 : if (c != '0') {
1805 799363 : break;
1806 : }
1807 207 : c = tok_nextc(tok);
1808 : }
1809 799363 : char* zeros_end = tok->cur;
1810 799363 : if (isdigit(c)) {
1811 26 : nonzero = 1;
1812 26 : c = tok_decimal_tail(tok);
1813 26 : if (c == 0) {
1814 0 : return ERRORTOKEN;
1815 : }
1816 : }
1817 799363 : if (c == '.') {
1818 8839 : c = tok_nextc(tok);
1819 8839 : goto fraction;
1820 : }
1821 790524 : else if (c == 'e' || c == 'E') {
1822 13 : goto exponent;
1823 : }
1824 790511 : else if (c == 'j' || c == 'J') {
1825 509 : goto imaginary;
1826 : }
1827 790002 : else if (nonzero) {
1828 : /* Old-style octal: now disallowed. */
1829 13 : tok_backup(tok, c);
1830 13 : return syntaxerror_known_range(
1831 13 : tok, (int)(tok->start + 1 - tok->line_start),
1832 13 : (int)(zeros_end - tok->line_start),
1833 : "leading zeros in decimal integer "
1834 : "literals are not permitted; "
1835 : "use an 0o prefix for octal integers");
1836 : }
1837 789989 : if (!verify_end_of_number(tok, c, "decimal")) {
1838 7 : return ERRORTOKEN;
1839 : }
1840 : }
1841 : }
1842 : else {
1843 : /* Decimal */
1844 2014930 : c = tok_decimal_tail(tok);
1845 2014930 : if (c == 0) {
1846 11 : return ERRORTOKEN;
1847 : }
1848 : {
1849 : /* Accept floating point numbers. */
1850 2014920 : if (c == '.') {
1851 16316 : c = tok_nextc(tok);
1852 25323 : fraction:
1853 : /* Fraction */
1854 25323 : if (isdigit(c)) {
1855 24105 : c = tok_decimal_tail(tok);
1856 24105 : if (c == 0) {
1857 10 : return ERRORTOKEN;
1858 : }
1859 : }
1860 : }
1861 2023920 : if (c == 'e' || c == 'E') {
1862 : int e;
1863 4425 : exponent:
1864 4438 : e = c;
1865 : /* Exponent part */
1866 4438 : c = tok_nextc(tok);
1867 4438 : if (c == '+' || c == '-') {
1868 3131 : c = tok_nextc(tok);
1869 3131 : if (!isdigit(c)) {
1870 8 : tok_backup(tok, c);
1871 8 : return syntaxerror(tok, "invalid decimal literal");
1872 : }
1873 1307 : } else if (!isdigit(c)) {
1874 15 : tok_backup(tok, c);
1875 15 : if (!verify_end_of_number(tok, e, "decimal")) {
1876 10 : return ERRORTOKEN;
1877 : }
1878 5 : tok_backup(tok, e);
1879 5 : *p_start = tok->start;
1880 5 : *p_end = tok->cur;
1881 5 : return NUMBER;
1882 : }
1883 4415 : c = tok_decimal_tail(tok);
1884 4415 : if (c == 0) {
1885 6 : return ERRORTOKEN;
1886 : }
1887 : }
1888 2023900 : if (c == 'j' || c == 'J') {
1889 : /* Imaginary part */
1890 1639 : imaginary:
1891 2148 : c = tok_nextc(tok);
1892 2148 : if (!verify_end_of_number(tok, c, "imaginary")) {
1893 10 : return ERRORTOKEN;
1894 : }
1895 : }
1896 2022260 : else if (!verify_end_of_number(tok, c, "decimal")) {
1897 27 : return ERRORTOKEN;
1898 : }
1899 : }
1900 : }
1901 2913170 : tok_backup(tok, c);
1902 2913170 : *p_start = tok->start;
1903 2913170 : *p_end = tok->cur;
1904 2913170 : return NUMBER;
1905 : }
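             /* Illustrative note (not upstream commentary): the branches above
                accept the prefixed forms 0xFF, 0o17 and 0b1010, underscore
                grouping such as 1_000_000 or 0x_FF, and the float/imaginary
                forms 3.14, 1e-9 and 2j.  They reject a bare prefix (0x), a
                digit of the wrong base (0o8, 0b2), a dangling underscore (1_),
                and leading zeros on a decimal integer (012), each through a
                dedicated syntaxerror() message. */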
1906 :
1907 16529800 : letter_quote:
1908 : /* String */
1909 16620800 : if (c == '\'' || c == '"') {
1910 1691840 : int quote = c;
1911 1691840 : int quote_size = 1; /* 1 or 3 */
1912 1691840 : int end_quote_size = 0;
1913 :
1914 : /* Nodes of type STRING, especially multi line strings
1915 : must be handled differently in order to get both
1916 : the starting line number and the column offset right.
1917 : (cf. issue 16806) */
1918 1691840 : tok->first_lineno = tok->lineno;
1919 1691840 : tok->multi_line_start = tok->line_start;
1920 :
1921 : /* Find the quote size and start of string */
1922 1691840 : c = tok_nextc(tok);
1923 1691840 : if (c == quote) {
1924 229617 : c = tok_nextc(tok);
1925 229617 : if (c == quote) {
1926 122483 : quote_size = 3;
1927 : }
1928 : else {
1929 107134 : end_quote_size = 1; /* empty string found */
1930 : }
1931 : }
1932 1691840 : if (c != quote) {
1933 1569360 : tok_backup(tok, c);
1934 : }
1935 :
1936 : /* Get rest of string */
1937 52327000 : while (end_quote_size != quote_size) {
1938 50635200 : c = tok_nextc(tok);
1939 50635200 : if (c == EOF || (quote_size == 1 && c == '\n')) {
1940 18 : assert(tok->multi_line_start != NULL);
1941 : // shift the tok_state's location into
1942 : // the start of string, and report the error
1943 : // from the initial quote character
1944 18 : tok->cur = (char *)tok->start;
1945 18 : tok->cur++;
1946 18 : tok->line_start = tok->multi_line_start;
1947 18 : int start = tok->lineno;
1948 18 : tok->lineno = tok->first_lineno;
1949 18 : if (quote_size == 3) {
1950 6 : syntaxerror(tok, "unterminated triple-quoted string literal"
1951 : " (detected at line %d)", start);
1952 6 : if (c != '\n') {
1953 6 : tok->done = E_EOFS;
1954 : }
1955 6 : return ERRORTOKEN;
1956 : }
1957 : else {
1958 12 : syntaxerror(tok, "unterminated string literal (detected at"
1959 : " line %d)", start);
1960 12 : if (c != '\n') {
1961 6 : tok->done = E_EOLS;
1962 : }
1963 12 : return ERRORTOKEN;
1964 : }
1965 : }
1966 50635200 : if (c == quote) {
1967 1898940 : end_quote_size += 1;
1968 : }
1969 : else {
1970 48736300 : end_quote_size = 0;
1971 48736300 : if (c == '\\') {
1972 341804 : tok_nextc(tok); /* skip escaped char */
1973 : }
1974 : }
1975 : }
1976 :
1977 1691830 : *p_start = tok->start;
1978 1691830 : *p_end = tok->cur;
1979 1691830 : return STRING;
1980 : }
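         /* Illustrative note (not upstream commentary): quote_size and
            end_quote_size drive the scan above.  Seeing the opening quote twice
            more selects a triple-quoted literal (quote_size == 3); seeing it
            exactly once more yields the empty string.  The loop then runs until
            quote_size consecutive closing quotes have been counted, and a
            backslash consumes the following character, so an escaped quote
            never terminates the literal. */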
1981 :
1982 : /* Line continuation */
1983 14929000 : if (c == '\\') {
1984 4458 : if ((c = tok_continuation_line(tok)) == -1) {
1985 16 : return ERRORTOKEN;
1986 : }
1987 4442 : tok->cont_line = 1;
1988 4442 : goto again; /* Read next line */
1989 : }
1990 :
1991 : /* Check for two-character token */
1992 : {
1993 14924500 : int c2 = tok_nextc(tok);
1994 14924500 : int token = _PyToken_TwoChars(c, c2);
1995 14924500 : if (token != OP) {
1996 215764 : int c3 = tok_nextc(tok);
1997 215764 : int token3 = _PyToken_ThreeChars(c, c2, c3);
1998 215764 : if (token3 != OP) {
1999 765 : token = token3;
2000 : }
2001 : else {
2002 214999 : tok_backup(tok, c3);
2003 : }
2004 215764 : *p_start = tok->start;
2005 215764 : *p_end = tok->cur;
2006 215764 : return token;
2007 : }
2008 14708800 : tok_backup(tok, c2);
2009 : }
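     /* Illustrative note (not upstream commentary): operators are matched
        greedily.  _PyToken_TwoChars() recognizes pairs such as "**", "//" and
        ":=", and _PyToken_ThreeChars() then tries to extend them (e.g. "**=",
        "//=", "...").  Whenever the longer match fails, the extra character is
        pushed back with tok_backup() and re-read as the start of the next
        token. */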
2010 :
2011 : /* Keep track of parentheses nesting level */
2012 14708800 : switch (c) {
2013 3392690 : case '(':
2014 : case '[':
2015 : case '{':
2016 3392690 : if (tok->level >= MAXLEVEL) {
2017 1 : return syntaxerror(tok, "too many nested parentheses");
2018 : }
2019 3392690 : tok->parenstack[tok->level] = c;
2020 3392690 : tok->parenlinenostack[tok->level] = tok->lineno;
2021 3392690 : tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2022 3392690 : tok->level++;
2023 3392690 : break;
2024 3392240 : case ')':
2025 : case ']':
2026 : case '}':
2027 3392240 : if (!tok->level) {
2028 12 : return syntaxerror(tok, "unmatched '%c'", c);
2029 : }
2030 3392230 : tok->level--;
2031 3392230 : int opening = tok->parenstack[tok->level];
2032 3392230 : if (!((opening == '(' && c == ')') ||
2033 640080 : (opening == '[' && c == ']') ||
2034 47751 : (opening == '{' && c == '}')))
2035 : {
2036 5 : if (tok->parenlinenostack[tok->level] != tok->lineno) {
2037 0 : return syntaxerror(tok,
2038 : "closing parenthesis '%c' does not match "
2039 : "opening parenthesis '%c' on line %d",
2040 0 : c, opening, tok->parenlinenostack[tok->level]);
2041 : }
2042 : else {
2043 5 : return syntaxerror(tok,
2044 : "closing parenthesis '%c' does not match "
2045 : "opening parenthesis '%c'",
2046 : c, opening);
2047 : }
2048 : }
2049 3392220 : break;
2050 : }
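     /* Illustrative note (not upstream commentary): parenstack[] records the
        opening bracket for each nesting level, together with its line and
        column, so a mismatch such as "(]" can name both brackets and, when the
        opener sits on an earlier line, report that line number as well. */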
2051 :
2052 14708800 : if (!Py_UNICODE_ISPRINTABLE(c)) {
2053 : char hex[9];
2054 1 : (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2055 1 : return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2056 : }
2057 :
2058 : /* Punctuation character */
2059 14708800 : *p_start = tok->start;
2060 14708800 : *p_end = tok->cur;
2061 14708800 : return _PyToken_OneChar(c);
2062 : }
2063 :
2064 : int
2065 39583700 : _PyTokenizer_Get(struct tok_state *tok,
2066 : const char **p_start, const char **p_end)
2067 : {
2068 39583700 : int result = tok_get(tok, p_start, p_end);
2069 39583700 : if (tok->decoding_erred) {
2070 3 : result = ERRORTOKEN;
2071 3 : tok->done = E_DECODE;
2072 : }
2073 39583700 : return result;
2074 : }
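 /* Illustrative driver sketch (not upstream code): it assumes the internal
    declarations from tokenizer.h -- in particular the string constructor
    _PyTokenizer_FromString() and the token constants pulled in through
    token.h -- and stops at the first ENDMARKER or ERRORTOKEN.  Error
    reporting is deliberately minimal; the function name is hypothetical. */
 static void
 example_dump_tokens(const char *src)
 {
     struct tok_state *tok = _PyTokenizer_FromString(src, /* exec_input */ 0);
     if (tok == NULL) {
         return;                       /* allocation or decoding failure */
     }
     for (;;) {
         const char *p_start = NULL;
         const char *p_end = NULL;
         int type = _PyTokenizer_Get(tok, &p_start, &p_end);
         if (type == ERRORTOKEN || type == ENDMARKER) {
             break;                    /* tok->done holds E_OK or an error code */
         }
         if (p_start != NULL && p_end != NULL) {
             fprintf(stderr, "token %d: %.*s\n",
                     type, (int)(p_end - p_start), p_start);
         }
     }
     _PyTokenizer_Free(tok);
 }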
2075 :
2076 : #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
2077 : // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
2078 : // dup() emulation with open() is slow.
2079 : typedef union {
2080 : void *cookie;
2081 : int fd;
2082 : } borrowed;
2083 :
2084 : static ssize_t
2085 : borrow_read(void *cookie, char *buf, size_t size)
2086 : {
2087 : borrowed b = {.cookie = cookie};
2088 : return read(b.fd, (void *)buf, size);
2089 : }
2090 :
2091 : static FILE *
2092 : fdopen_borrow(int fd) {
2093 : // supports only reading. seek fails. close and write are no-ops.
2094 : cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
2095 : borrowed b = {.fd = fd};
2096 : return fopencookie(b.cookie, "r", io_cb);
2097 : }
2098 : #else
2099 : static FILE *
2100 476 : fdopen_borrow(int fd) {
2101 476 : fd = _Py_dup(fd);
2102 476 : if (fd < 0) {
2103 0 : return NULL;
2104 : }
2105 476 : return fdopen(fd, "r");
2106 : }
2107 : #endif
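 /* Illustrative usage sketch (not upstream code): with either implementation
    the caller keeps ownership of the descriptor.  The WASI/Emscripten variant
    wraps fd with fopencookie() and a no-op close, the default variant reads
    through a dup()'ed copy, so fclose() on the returned stream leaves the
    borrowed fd open.  The function name is hypothetical. */
 static void
 example_read_borrowed(int fd)
 {
     char buf[256];
     FILE *fp = fdopen_borrow(fd);
     if (fp == NULL) {
         return;
     }
     while (fgets(buf, sizeof(buf), fp) != NULL) {
         /* consume the data; seeking is unsupported on the cookie variant */
     }
     fclose(fp);       /* the caller's original fd remains open */
 }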
2108 :
2109 : /* Get the encoding of a Python file. Check for the coding cookie and check if
2110 : the file starts with a BOM.
2111 :
2112 : _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2113 : encoding in the first or second line of the file (in which case the encoding
2114 : should be assumed to be UTF-8).
2115 :
2116 : The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2117 : by the caller. */
2118 :
2119 : char *
2120 476 : _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2121 : {
2122 : struct tok_state *tok;
2123 : FILE *fp;
2124 476 : const char *p_start = NULL;
2125 476 : const char *p_end = NULL;
2126 476 : char *encoding = NULL;
2127 :
2128 476 : fp = fdopen_borrow(fd);
2129 476 : if (fp == NULL) {
2130 0 : return NULL;
2131 : }
2132 476 : tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2133 476 : if (tok == NULL) {
2134 0 : fclose(fp);
2135 0 : return NULL;
2136 : }
2137 476 : if (filename != NULL) {
2138 476 : Py_INCREF(filename);
2139 476 : tok->filename = filename;
2140 : }
2141 : else {
2142 0 : tok->filename = PyUnicode_FromString("<string>");
2143 0 : if (tok->filename == NULL) {
2144 0 : fclose(fp);
2145 0 : _PyTokenizer_Free(tok);
2146 0 : return encoding;
2147 : }
2148 : }
2149 1438 : while (tok->lineno < 2 && tok->done == E_OK) {
2150 962 : _PyTokenizer_Get(tok, &p_start, &p_end);
2151 : }
2152 476 : fclose(fp);
2153 476 : if (tok->encoding) {
2154 12 : encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2155 12 : if (encoding) {
2156 12 : strcpy(encoding, tok->encoding);
2157 : }
2158 : }
2159 476 : _PyTokenizer_Free(tok);
2160 476 : return encoding;
2161 : }
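 /* Illustrative usage sketch (not upstream code): the caller passes a borrowed
    descriptor, treats a NULL result as "assume UTF-8", and releases any
    non-NULL result with PyMem_Free(), matching the PyMem_Malloc() allocation
    above.  The function name is hypothetical. */
 static void
 example_report_encoding(int fd, PyObject *filename)
 {
     char *enc = _PyTokenizer_FindEncodingFilename(fd, filename);
     if (enc == NULL) {
         /* no coding cookie or BOM in the first two lines: default to UTF-8 */
         fprintf(stderr, "utf-8 (assumed)\n");
         return;
     }
     fprintf(stderr, "%s\n", enc);
     PyMem_Free(enc);
 }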
2162 :
2163 : #ifdef Py_DEBUG
2164 : void
2165 0 : tok_dump(int type, char *start, char *end)
2166 : {
2167 0 : fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2168 0 : if (type == NAME || type == NUMBER || type == STRING || type == OP)
2169 0 : fprintf(stderr, "(%.*s)", (int)(end - start), start);
2170 0 : }
2171 : #endif // Py_DEBUG