LCOV - code coverage report
Current view: top level - Parser - tokenizer.c (source / functions) Hit Total Coverage
Test: CPython lcov report Lines: 1055 1196 88.2 %
Date: 2022-07-07 18:19:46 Functions: 46 47 97.9 %

          Line data    Source code
       1             : 
       2             : /* Tokenizer implementation */
       3             : 
       4             : #define PY_SSIZE_T_CLEAN
       5             : #include "Python.h"
       6             : #include "pycore_call.h"          // _PyObject_CallNoArgs()
       7             : 
       8             : #include <ctype.h>
       9             : #include <assert.h>
      10             : 
      11             : #include "tokenizer.h"
      12             : #include "errcode.h"
      13             : 
      14             : #include "unicodeobject.h"
      15             : #include "bytesobject.h"
      16             : #include "fileobject.h"
      17             : #include "abstract.h"
      18             : 
      19             : /* Alternate tab spacing */
      20             : #define ALTTABSIZE 1
      21             : 
      22             : #define is_potential_identifier_start(c) (\
      23             :               (c >= 'a' && c <= 'z')\
      24             :                || (c >= 'A' && c <= 'Z')\
      25             :                || c == '_'\
      26             :                || (c >= 128))
      27             : 
      28             : #define is_potential_identifier_char(c) (\
      29             :               (c >= 'a' && c <= 'z')\
      30             :                || (c >= 'A' && c <= 'Z')\
      31             :                || (c >= '0' && c <= '9')\
      32             :                || c == '_'\
      33             :                || (c >= 128))
      34             : 
      35             : 
      36             : /* Don't ever change this -- it would break the portability of Python code */
      37             : #define TABSIZE 8
      38             : 
      39             : /* Forward */
      40             : static struct tok_state *tok_new(void);
      41             : static int tok_nextc(struct tok_state *tok);
      42             : static void tok_backup(struct tok_state *tok, int c);
      43             : static int syntaxerror(struct tok_state *tok, const char *format, ...);
      44             : 
      45             : /* Spaces in this constant are treated as "zero or more spaces or tabs" when
      46             :    tokenizing. */
      47             : static const char* type_comment_prefix = "# type: ";
      48             : 
      49             : /* Create and initialize a new tok_state structure */
      50             : 
      51             : static struct tok_state *
      52      224563 : tok_new(void)
      53             : {
      54      224563 :     struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
      55             :                                             sizeof(struct tok_state));
      56      224563 :     if (tok == NULL)
      57           0 :         return NULL;
      58      224563 :     tok->buf = tok->cur = tok->inp = NULL;
      59      224563 :     tok->fp_interactive = 0;
      60      224563 :     tok->interactive_src_start = NULL;
      61      224563 :     tok->interactive_src_end = NULL;
      62      224563 :     tok->start = NULL;
      63      224563 :     tok->end = NULL;
      64      224563 :     tok->done = E_OK;
      65      224563 :     tok->fp = NULL;
      66      224563 :     tok->input = NULL;
      67      224563 :     tok->tabsize = TABSIZE;
      68      224563 :     tok->indent = 0;
      69      224563 :     tok->indstack[0] = 0;
      70      224563 :     tok->atbol = 1;
      71      224563 :     tok->pendin = 0;
      72      224563 :     tok->prompt = tok->nextprompt = NULL;
      73      224563 :     tok->lineno = 0;
      74      224563 :     tok->level = 0;
      75      224563 :     tok->altindstack[0] = 0;
      76      224563 :     tok->decoding_state = STATE_INIT;
      77      224563 :     tok->decoding_erred = 0;
      78      224563 :     tok->enc = NULL;
      79      224563 :     tok->encoding = NULL;
      80      224563 :     tok->cont_line = 0;
      81      224563 :     tok->filename = NULL;
      82      224563 :     tok->decoding_readline = NULL;
      83      224563 :     tok->decoding_buffer = NULL;
      84      224563 :     tok->type_comments = 0;
      85      224563 :     tok->async_hacks = 0;
      86      224563 :     tok->async_def = 0;
      87      224563 :     tok->async_def_indent = 0;
      88      224563 :     tok->async_def_nl = 0;
      89      224563 :     tok->interactive_underflow = IUNDERFLOW_NORMAL;
      90      224563 :     tok->str = NULL;
      91             : #ifdef Py_DEBUG
      92      224563 :     tok->debug = _Py_GetConfig()->parser_debug;
      93             : #endif
      94      224563 :     return tok;
      95             : }
      96             : 
      97             : static char *
      98      116446 : new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
      99             : {
     100      116446 :     char* result = (char *)PyMem_Malloc(len + 1);
     101      116446 :     if (!result) {
     102           0 :         tok->done = E_NOMEM;
     103           0 :         return NULL;
     104             :     }
     105      116446 :     memcpy(result, s, len);
     106      116446 :     result[len] = '\0';
     107      116446 :     return result;
     108             : }
     109             : 
     110             : static char *
     111          29 : error_ret(struct tok_state *tok) /* XXX */
     112             : {
     113          29 :     tok->decoding_erred = 1;
     114          29 :     if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
     115           3 :         PyMem_Free(tok->buf);
     116          29 :     tok->buf = tok->cur = tok->inp = NULL;
     117          29 :     tok->start = NULL;
     118          29 :     tok->end = NULL;
     119          29 :     tok->done = E_DECODE;
     120          29 :     return NULL;                /* as if it were EOF */
     121             : }
     122             : 
     123             : 
     124             : static const char *
     125         666 : get_normal_name(const char *s)  /* for utf-8 and latin-1 */
     126             : {
     127             :     char buf[13];
     128             :     int i;
     129        4351 :     for (i = 0; i < 12; i++) {
     130        4351 :         int c = s[i];
     131        4351 :         if (c == '\0')
     132         666 :             break;
     133        3685 :         else if (c == '_')
     134           0 :             buf[i] = '-';
     135             :         else
     136        3685 :             buf[i] = tolower(c);
     137             :     }
     138         666 :     buf[i] = '\0';
     139         666 :     if (strcmp(buf, "utf-8") == 0 ||
     140         132 :         strncmp(buf, "utf-8-", 6) == 0)
     141         534 :         return "utf-8";
     142         132 :     else if (strcmp(buf, "latin-1") == 0 ||
     143         118 :              strcmp(buf, "iso-8859-1") == 0 ||
     144          73 :              strcmp(buf, "iso-latin-1") == 0 ||
     145          73 :              strncmp(buf, "latin-1-", 8) == 0 ||
     146          73 :              strncmp(buf, "iso-8859-1-", 11) == 0 ||
     147          73 :              strncmp(buf, "iso-latin-1-", 12) == 0)
     148          59 :         return "iso-8859-1";
     149             :     else
     150          73 :         return s;
     151             : }
     152             : 
     153             : /* Return the coding spec in S, or NULL if none is found.  */
     154             : 
     155             : static int
     156      112268 : get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
     157             : {
     158             :     Py_ssize_t i;
     159      112268 :     *spec = NULL;
     160             :     /* Coding spec must be in a comment, and that comment must be
     161             :      * the only statement on the source code line. */
     162      112268 :     for (i = 0; i < size - 6; i++) {
     163       33450 :         if (s[i] == '#')
     164        4536 :             break;
     165       28914 :         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
     166       28914 :             return 1;
     167             :     }
     168      252728 :     for (; i < size - 6; i++) { /* XXX inefficient search */
     169      170040 :         const char* t = s + i;
     170      170040 :         if (memcmp(t, "coding", 6) == 0) {
     171         674 :             const char* begin = NULL;
     172         674 :             t += 6;
     173         674 :             if (t[0] != ':' && t[0] != '=')
     174           8 :                 continue;
     175             :             do {
     176        1287 :                 t++;
     177        1287 :             } while (t[0] == ' ' || t[0] == '\t');
     178             : 
     179         666 :             begin = t;
     180        4351 :             while (Py_ISALNUM(t[0]) ||
     181        1342 :                    t[0] == '-' || t[0] == '_' || t[0] == '.')
     182        3685 :                 t++;
     183             : 
     184         666 :             if (begin < t) {
     185         666 :                 char* r = new_string(begin, t - begin, tok);
     186             :                 const char* q;
     187         666 :                 if (!r)
     188           0 :                     return 0;
     189         666 :                 q = get_normal_name(r);
     190         666 :                 if (r != q) {
     191         593 :                     PyMem_Free(r);
     192         593 :                     r = new_string(q, strlen(q), tok);
     193         593 :                     if (!r)
     194           0 :                         return 0;
     195             :                 }
     196         666 :                 *spec = r;
     197         666 :                 break;
     198             :             }
     199             :         }
     200             :     }
     201       83354 :     return 1;
     202             : }
     203             : 
     204             : /* Check whether the line contains a coding spec. If it does,
     205             :    invoke the set_readline function for the new encoding.
     206             :    This function receives the tok_state and the new encoding.
     207             :    Return 1 on success, 0 on failure.  */
     208             : 
     209             : static int
     210      112268 : check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
     211             :                   int set_readline(struct tok_state *, const char *))
     212             : {
     213             :     char *cs;
     214      112268 :     if (tok->cont_line) {
     215             :         /* It's a continuation line, so it can't be a coding spec. */
     216           0 :         tok->decoding_state = STATE_NORMAL;
     217           0 :         return 1;
     218             :     }
     219      112268 :     if (!get_coding_spec(line, &cs, size, tok)) {
     220           0 :         return 0;
     221             :     }
     222      112268 :     if (!cs) {
     223             :         Py_ssize_t i;
     224      111626 :         for (i = 0; i < size; i++) {
     225      110803 :             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
     226             :                 break;
     227      105943 :             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
     228             :                 /* Stop checking coding spec after a line containing
     229             :                  * anything except a comment. */
     230      105919 :                 tok->decoding_state = STATE_NORMAL;
     231      105919 :                 break;
     232             :             }
     233             :         }
     234      111602 :         return 1;
     235             :     }
     236         666 :     tok->decoding_state = STATE_NORMAL;
     237         666 :     if (tok->encoding == NULL) {
     238         639 :         assert(tok->decoding_readline == NULL);
     239         639 :         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
     240           0 :             error_ret(tok);
     241           0 :             PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
     242           0 :             PyMem_Free(cs);
     243           0 :             return 0;
     244             :         }
     245         639 :         tok->encoding = cs;
     246             :     } else {                /* then, compare cs with BOM */
     247          27 :         if (strcmp(tok->encoding, cs) != 0) {
     248          20 :             error_ret(tok);
     249          20 :             PyErr_Format(PyExc_SyntaxError,
     250             :                          "encoding problem: %s with BOM", cs);
     251          20 :             PyMem_Free(cs);
     252          20 :             return 0;
     253             :         }
     254           7 :         PyMem_Free(cs);
     255             :     }
     256         646 :     return 1;
     257             : }
     258             : 
     259             : /* See whether the file starts with a BOM. If it does,
     260             :    invoke the set_readline function with the new encoding.
     261             :    Return 1 on success, 0 on failure.  */
     262             : 
     263             : static int
     264      109412 : check_bom(int get_char(struct tok_state *),
     265             :           void unget_char(int, struct tok_state *),
     266             :           int set_readline(struct tok_state *, const char *),
     267             :           struct tok_state *tok)
     268             : {
     269             :     int ch1, ch2, ch3;
     270      109412 :     ch1 = get_char(tok);
     271      109412 :     tok->decoding_state = STATE_SEEK_CODING;
     272      109412 :     if (ch1 == EOF) {
     273           8 :         return 1;
     274      109404 :     } else if (ch1 == 0xEF) {
     275          39 :         ch2 = get_char(tok);
     276          39 :         if (ch2 != 0xBB) {
     277           1 :             unget_char(ch2, tok);
     278           1 :             unget_char(ch1, tok);
     279           1 :             return 1;
     280             :         }
     281          38 :         ch3 = get_char(tok);
     282          38 :         if (ch3 != 0xBF) {
     283           2 :             unget_char(ch3, tok);
     284           2 :             unget_char(ch2, tok);
     285           2 :             unget_char(ch1, tok);
     286           2 :             return 1;
     287             :         }
     288             :     } else {
     289      109365 :         unget_char(ch1, tok);
     290      109365 :         return 1;
     291             :     }
     292          36 :     if (tok->encoding != NULL)
     293           0 :         PyMem_Free(tok->encoding);
     294          36 :     tok->encoding = new_string("utf-8", 5, tok);
     295          36 :     if (!tok->encoding)
     296           0 :         return 0;
     297             :     /* No need to set_readline: input is already utf-8 */
     298          36 :     return 1;
     299             : }
     300             : 
     301             : static int
     302          87 : tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
     303          87 :     assert(tok->fp_interactive);
     304             : 
     305          87 :     if (!line) {
     306           0 :         return 0;
     307             :     }
     308             : 
     309          87 :     Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
     310          87 :     Py_ssize_t line_size = strlen(line);
     311          87 :     char last_char = line[line_size > 0 ? line_size - 1 : line_size];
     312          87 :     if (last_char != '\n') {
     313           9 :         line_size += 1;
     314             :     }
     315          87 :     char* new_str = tok->interactive_src_start;
     316             : 
     317          87 :     new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
     318          87 :     if (!new_str) {
     319           0 :         if (tok->interactive_src_start) {
     320           0 :             PyMem_Free(tok->interactive_src_start);
     321             :         }
     322           0 :         tok->interactive_src_start = NULL;
     323           0 :         tok->interactive_src_end = NULL;
     324           0 :         tok->done = E_NOMEM;
     325           0 :         return -1;
     326             :     }
     327          87 :     strcpy(new_str + current_size, line);
     328          87 :     if (last_char != '\n') {
     329             :         /* Last line does not end in \n, fake one */
     330           9 :         new_str[current_size + line_size - 1] = '\n';
     331           9 :         new_str[current_size + line_size] = '\0';
     332             :     }
     333          87 :     tok->interactive_src_start = new_str;
     334          87 :     tok->interactive_src_end = new_str + current_size + line_size;
     335          87 :     return 0;
     336             : }
     337             : 
     338             : 
     339             : /* Read a line of text from TOK into S, using the stream in TOK.
     340             :    Return NULL on failure, else S.
     341             : 
     342             :    On entry, tok->decoding_buffer will be one of:
     343             :      1) NULL: need to call tok->decoding_readline to get a new line
     344             :      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
     345             :        stored the result in tok->decoding_buffer
     346             :      3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
     347             :        (in the s buffer) to copy entire contents of the line read
     348             :        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
     349             :        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
     350             :        until the buffer ends with a '\n' (or until the end of the file is
     351             :        reached): see tok_nextc and its calls to tok_reserve_buf.
     352             : */
     353             : 
     354             : static int
     355       80119 : tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
     356             : {
     357       80119 :     Py_ssize_t cur = tok->cur - tok->buf;
     358       80119 :     Py_ssize_t oldsize = tok->inp - tok->buf;
     359       80119 :     Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
     360       80119 :     if (newsize > tok->end - tok->buf) {
     361        3334 :         char *newbuf = tok->buf;
     362        3334 :         Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
     363        3334 :         Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
     364        3334 :         Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
     365        3334 :         newbuf = (char *)PyMem_Realloc(newbuf, newsize);
     366        3334 :         if (newbuf == NULL) {
     367           0 :             tok->done = E_NOMEM;
     368           0 :             return 0;
     369             :         }
     370        3334 :         tok->buf = newbuf;
     371        3334 :         tok->cur = tok->buf + cur;
     372        3334 :         tok->inp = tok->buf + oldsize;
     373        3334 :         tok->end = tok->buf + newsize;
     374        3334 :         tok->start = start < 0 ? NULL : tok->buf + start;
     375        3334 :         tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
     376        3334 :         tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
     377             :     }
     378       80119 :     return 1;
     379             : }
     380             : 
     381             : static int
     382       23302 : tok_readline_recode(struct tok_state *tok) {
     383             :     PyObject *line;
     384             :     const  char *buf;
     385             :     Py_ssize_t buflen;
     386       23302 :     line = tok->decoding_buffer;
     387       23302 :     if (line == NULL) {
     388       23302 :         line = PyObject_CallNoArgs(tok->decoding_readline);
     389       23302 :         if (line == NULL) {
     390           0 :             error_ret(tok);
     391           0 :             goto error;
     392             :         }
     393             :     }
     394             :     else {
     395           0 :         tok->decoding_buffer = NULL;
     396             :     }
     397       23302 :     buf = PyUnicode_AsUTF8AndSize(line, &buflen);
     398       23302 :     if (buf == NULL) {
     399           0 :         error_ret(tok);
     400           0 :         goto error;
     401             :     }
     402       23302 :     if (!tok_reserve_buf(tok, buflen + 1)) {
     403           0 :         goto error;
     404             :     }
     405       23302 :     memcpy(tok->inp, buf, buflen);
     406       23302 :     tok->inp += buflen;
     407       23302 :     *tok->inp = '\0';
     408       23302 :     if (tok->fp_interactive &&
     409           0 :         tok_concatenate_interactive_new_line(tok, buf) == -1) {
     410           0 :         goto error;
     411             :     }
     412       23302 :     Py_DECREF(line);
     413       23302 :     return 1;
     414           0 : error:
     415           0 :     Py_XDECREF(line);
     416           0 :     return 0;
     417             : }
     418             : 
     419             : /* Set the readline function for TOK to a StreamReader's
     420             :    readline function. The StreamReader is named ENC.
     421             : 
     422             :    This function is called from check_bom and check_coding_spec.
     423             : 
     424             :    ENC is usually identical to the future value of tok->encoding,
     425             :    except for the (currently unsupported) case of UTF-16.
     426             : 
     427             :    Return 1 on success, 0 on failure. */
     428             : 
     429             : static int
     430          63 : fp_setreadl(struct tok_state *tok, const char* enc)
     431             : {
     432             :     PyObject *readline, *open, *stream;
     433             :     int fd;
     434             :     long pos;
     435             : 
     436          63 :     fd = fileno(tok->fp);
     437             :     /* Due to buffering the file offset for fd can be different from the file
     438             :      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     439             :      * its file position counts CRLF as one char and can't be directly mapped
     440             :      * to the file offset for fd.  Instead we step back one byte and read to
     441             :      * the end of line.*/
     442          63 :     pos = ftell(tok->fp);
     443         126 :     if (pos == -1 ||
     444          63 :         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
     445           0 :         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
     446           0 :         return 0;
     447             :     }
     448             : 
     449          63 :     open = _PyImport_GetModuleAttrString("io", "open");
     450          63 :     if (open == NULL) {
     451           0 :         return 0;
     452             :     }
     453          63 :     stream = PyObject_CallFunction(open, "isisOOO",
     454             :                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
     455          63 :     Py_DECREF(open);
     456          63 :     if (stream == NULL) {
     457           0 :         return 0;
     458             :     }
     459             : 
     460          63 :     readline = PyObject_GetAttr(stream, &_Py_ID(readline));
     461          63 :     Py_DECREF(stream);
     462          63 :     if (readline == NULL) {
     463           0 :         return 0;
     464             :     }
     465          63 :     Py_XSETREF(tok->decoding_readline, readline);
     466             : 
     467          63 :     if (pos > 0) {
     468          63 :         PyObject *bufobj = _PyObject_CallNoArgs(readline);
     469          63 :         if (bufobj == NULL) {
     470           0 :             return 0;
     471             :         }
     472          63 :         Py_DECREF(bufobj);
     473             :     }
     474             : 
     475          63 :     return 1;
     476             : }
     477             : 
     478             : /* Fetch the next byte from TOK. */
     479             : 
     480         804 : static int fp_getc(struct tok_state *tok) {
     481         804 :     return getc(tok->fp);
     482             : }
     483             : 
     484             : /* Unfetch the last byte back into TOK.  */
     485             : 
     486         790 : static void fp_ungetc(int c, struct tok_state *tok) {
     487         790 :     ungetc(c, tok->fp);
     488         790 : }
     489             : 
     490             : /* Check whether the characters at s start a valid
     491             :    UTF-8 sequence. Return the number of characters forming
     492             :    the sequence if yes, 0 if not.  */
     493     1943970 : static int valid_utf8(const unsigned char* s)
     494             : {
     495     1943970 :     int expected = 0;
     496             :     int length;
     497     1943970 :     if (*s < 0x80)
     498             :         /* single-byte code */
     499     1943960 :         return 1;
     500           6 :     if (*s < 0xc0)
     501             :         /* following byte */
     502           2 :         return 0;
     503           4 :     if (*s < 0xE0)
     504           3 :         expected = 1;
     505           1 :     else if (*s < 0xF0)
     506           0 :         expected = 2;
     507           1 :     else if (*s < 0xF8)
     508           0 :         expected = 3;
     509             :     else
     510           1 :         return 0;
     511           3 :     length = expected + 1;
     512           6 :     for (; expected; expected--)
     513           3 :         if (s[expected] < 0x80 || s[expected] >= 0xC0)
     514           0 :             return 0;
     515           3 :     return length;
     516             : }
     517             : 
     518             : static int
     519       56382 : ensure_utf8(char *line, struct tok_state *tok)
     520             : {
     521       56382 :     int badchar = 0;
     522             :     unsigned char *c;
     523             :     int length;
     524     2000350 :     for (c = (unsigned char *)line; *c; c += length) {
     525     1943970 :         if (!(length = valid_utf8(c))) {
     526           3 :             badchar = *c;
     527           3 :             break;
     528             :         }
     529             :     }
     530       56382 :     if (badchar) {
     531             :         /* Need to add 1 to the line number, since this line
     532             :        has not been counted, yet.  */
     533           3 :         PyErr_Format(PyExc_SyntaxError,
     534             :                      "Non-UTF-8 code starting with '\\x%.2x' "
     535             :                      "in file %U on line %i, "
     536             :                      "but no encoding declared; "
     537             :                      "see https://peps.python.org/pep-0263/ for details",
     538           3 :                      badchar, tok->filename, tok->lineno + 1);
     539           3 :         return 0;
     540             :     }
     541       56379 :     return 1;
     542             : }
     543             : 
     544             : /* Fetch a byte from TOK, using the string buffer. */
     545             : 
     546             : static int
     547      108685 : buf_getc(struct tok_state *tok) {
     548      108685 :     return Py_CHARMASK(*tok->str++);
     549             : }
     550             : 
     551             : /* Unfetch a byte from TOK, using the string buffer. */
     552             : 
     553             : static void
     554      108583 : buf_ungetc(int c, struct tok_state *tok) {
     555      108583 :     tok->str--;
     556      108583 :     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
     557      108583 : }
     558             : 
     559             : /* Set the readline function for TOK to ENC. For the string-based
     560             :    tokenizer, this means to just record the encoding. */
     561             : 
     562             : static int
     563          49 : buf_setreadl(struct tok_state *tok, const char* enc) {
     564          49 :     tok->enc = enc;
     565          49 :     return 1;
     566             : }
     567             : 
     568             : /* Return a UTF-8 encoding Python string object from the
     569             :    C byte string STR, which is encoded with ENC. */
     570             : 
     571             : static PyObject *
     572          94 : translate_into_utf8(const char* str, const char* enc) {
     573             :     PyObject *utf8;
     574          94 :     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
     575          94 :     if (buf == NULL)
     576           6 :         return NULL;
     577          88 :     utf8 = PyUnicode_AsUTF8String(buf);
     578          88 :     Py_DECREF(buf);
     579          88 :     return utf8;
     580             : }
     581             : 
     582             : 
     583             : static char *
     584      223785 : translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
     585      223785 :     int skip_next_lf = 0;
     586      223785 :     size_t needed_length = strlen(s) + 2, final_length;
     587             :     char *buf, *current;
     588      223785 :     char c = '\0';
     589      223785 :     buf = PyMem_Malloc(needed_length);
     590      223785 :     if (buf == NULL) {
     591           0 :         tok->done = E_NOMEM;
     592           0 :         return NULL;
     593             :     }
     594   222216000 :     for (current = buf; *s; s++, current++) {
     595   221992000 :         c = *s;
     596   221992000 :         if (skip_next_lf) {
     597       45980 :             skip_next_lf = 0;
     598       45980 :             if (c == '\n') {
     599       45963 :                 c = *++s;
     600       45963 :                 if (!c)
     601          16 :                     break;
     602             :             }
     603             :         }
     604   221992000 :         if (c == '\r') {
     605       45987 :             skip_next_lf = 1;
     606       45987 :             c = '\n';
     607             :         }
     608   221992000 :         *current = c;
     609             :     }
     610             :     /* If this is exec input, add a newline to the end of the string if
     611             :        there isn't one already. */
     612      223785 :     if (exec_input && c != '\n') {
     613      127954 :         *current = '\n';
     614      127954 :         current++;
     615             :     }
     616      223785 :     *current = '\0';
     617      223785 :     final_length = current - buf + 1;
     618      223785 :     if (final_length < needed_length && final_length) {
     619             :         /* should never fail */
     620       95847 :         char* result = PyMem_Realloc(buf, final_length);
     621       95847 :         if (result == NULL) {
     622           0 :             PyMem_Free(buf);
     623             :         }
     624       95847 :         buf = result;
     625             :     }
     626      223785 :     return buf;
     627             : }
     628             : 
     629             : /* Decode a byte string STR for use as the buffer of TOK.
     630             :    Look for encoding declarations inside STR, and record them
     631             :    inside TOK.  */
     632             : 
     633             : static char *
     634      108612 : decode_str(const char *input, int single, struct tok_state *tok)
     635             : {
     636      108612 :     PyObject* utf8 = NULL;
     637             :     char *str;
     638             :     const char *s;
     639      108612 :     const char *newl[2] = {NULL, NULL};
     640      108612 :     int lineno = 0;
     641      108612 :     tok->input = str = translate_newlines(input, single, tok);
     642      108612 :     if (str == NULL)
     643           0 :         return NULL;
     644      108612 :     tok->enc = NULL;
     645      108612 :     tok->str = str;
     646      108612 :     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
     647           0 :         return error_ret(tok);
     648      108612 :     str = tok->str;             /* string after BOM if any */
     649      108612 :     assert(str);
     650      108612 :     if (tok->enc != NULL) {
     651           0 :         utf8 = translate_into_utf8(str, tok->enc);
     652           0 :         if (utf8 == NULL)
     653           0 :             return error_ret(tok);
     654           0 :         str = PyBytes_AsString(utf8);
     655             :     }
     656     1793920 :     for (s = str;; s++) {
     657     1793920 :         if (*s == '\0') break;
     658     1695980 :         else if (*s == '\n') {
     659      119267 :             assert(lineno < 2);
     660      119267 :             newl[lineno] = s;
     661      119267 :             lineno++;
     662      119267 :             if (lineno == 2) break;
     663             :         }
     664             :     }
     665      108612 :     tok->enc = NULL;
     666             :     /* need to check line 1 and 2 separately since check_coding_spec
     667             :        assumes a single line as input */
     668      108612 :     if (newl[0]) {
     669      108599 :         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
     670          20 :             return NULL;
     671             :         }
     672      108579 :         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
     673        2553 :             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
     674             :                                    tok, buf_setreadl))
     675           0 :                 return NULL;
     676             :         }
     677             :     }
     678      108592 :     if (tok->enc != NULL) {
     679          49 :         assert(utf8 == NULL);
     680          49 :         utf8 = translate_into_utf8(str, tok->enc);
     681          49 :         if (utf8 == NULL)
     682           6 :             return error_ret(tok);
     683          43 :         str = PyBytes_AS_STRING(utf8);
     684             :     }
     685      108586 :     assert(tok->decoding_buffer == NULL);
     686      108586 :     tok->decoding_buffer = utf8; /* CAUTION */
     687      108586 :     return str;
     688             : }
     689             : 
     690             : /* Set up tokenizer for string */
     691             : 
     692             : struct tok_state *
     693      108612 : _PyTokenizer_FromString(const char *str, int exec_input)
     694             : {
     695      108612 :     struct tok_state *tok = tok_new();
     696             :     char *decoded;
     697             : 
     698      108612 :     if (tok == NULL)
     699           0 :         return NULL;
     700      108612 :     decoded = decode_str(str, exec_input, tok);
     701      108612 :     if (decoded == NULL) {
     702          26 :         _PyTokenizer_Free(tok);
     703          26 :         return NULL;
     704             :     }
     705             : 
     706      108586 :     tok->buf = tok->cur = tok->inp = decoded;
     707      108586 :     tok->end = decoded;
     708      108586 :     return tok;
     709             : }
     710             : 
     711             : /* Set up tokenizer for UTF-8 string */
     712             : 
     713             : struct tok_state *
     714      115121 : _PyTokenizer_FromUTF8(const char *str, int exec_input)
     715             : {
     716      115121 :     struct tok_state *tok = tok_new();
     717             :     char *translated;
     718      115121 :     if (tok == NULL)
     719           0 :         return NULL;
     720      115121 :     tok->input = translated = translate_newlines(str, exec_input, tok);
     721      115121 :     if (translated == NULL) {
     722           0 :         _PyTokenizer_Free(tok);
     723           0 :         return NULL;
     724             :     }
     725      115121 :     tok->decoding_state = STATE_NORMAL;
     726      115121 :     tok->enc = NULL;
     727      115121 :     tok->str = translated;
     728      115121 :     tok->encoding = new_string("utf-8", 5, tok);
     729      115121 :     if (!tok->encoding) {
     730           0 :         _PyTokenizer_Free(tok);
     731           0 :         return NULL;
     732             :     }
     733             : 
     734      115121 :     tok->buf = tok->cur = tok->inp = translated;
     735      115121 :     tok->end = translated;
     736      115121 :     return tok;
     737             : }
     738             : 
     739             : /* Set up tokenizer for file */
     740             : 
     741             : struct tok_state *
     742         830 : _PyTokenizer_FromFile(FILE *fp, const char* enc,
     743             :                       const char *ps1, const char *ps2)
     744             : {
     745         830 :     struct tok_state *tok = tok_new();
     746         830 :     if (tok == NULL)
     747           0 :         return NULL;
     748         830 :     if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
     749           0 :         _PyTokenizer_Free(tok);
     750           0 :         return NULL;
     751             :     }
     752         830 :     tok->cur = tok->inp = tok->buf;
     753         830 :     tok->end = tok->buf + BUFSIZ;
     754         830 :     tok->fp = fp;
     755         830 :     tok->prompt = ps1;
     756         830 :     tok->nextprompt = ps2;
     757         830 :     if (enc != NULL) {
     758             :         /* Must copy encoding declaration since it
     759             :            gets copied into the parse tree. */
     760          30 :         tok->encoding = new_string(enc, strlen(enc), tok);
     761          30 :         if (!tok->encoding) {
     762           0 :             _PyTokenizer_Free(tok);
     763           0 :             return NULL;
     764             :         }
     765          30 :         tok->decoding_state = STATE_NORMAL;
     766             :     }
     767         830 :     return tok;
     768             : }
     769             : 
     770             : /* Free a tok_state structure */
     771             : 
     772             : void
     773      224563 : _PyTokenizer_Free(struct tok_state *tok)
     774             : {
     775      224563 :     if (tok->encoding != NULL) {
     776      115826 :         PyMem_Free(tok->encoding);
     777             :     }
     778      224563 :     Py_XDECREF(tok->decoding_readline);
     779      224563 :     Py_XDECREF(tok->decoding_buffer);
     780      224563 :     Py_XDECREF(tok->filename);
     781      224563 :     if (tok->fp != NULL && tok->buf != NULL) {
     782         827 :         PyMem_Free(tok->buf);
     783             :     }
     784      224563 :     if (tok->input) {
     785      223733 :         PyMem_Free(tok->input);
     786             :     }
     787      224563 :     if (tok->interactive_src_start != NULL) {
     788          35 :         PyMem_Free(tok->interactive_src_start);
     789             :     }
     790      224563 :     PyMem_Free(tok);
     791      224563 : }
     792             : 
     793             : static int
     794       56797 : tok_readline_raw(struct tok_state *tok)
     795             : {
     796             :     do {
     797       56797 :         if (!tok_reserve_buf(tok, BUFSIZ)) {
     798           0 :             return 0;
     799             :         }
     800       56797 :         char *line = Py_UniversalNewlineFgets(tok->inp,
     801       56797 :                                               (int)(tok->end - tok->inp),
     802             :                                               tok->fp, NULL);
     803       56797 :         if (line == NULL) {
     804         325 :             return 1;
     805             :         }
     806       56507 :         if (tok->fp_interactive &&
     807          35 :             tok_concatenate_interactive_new_line(tok, line) == -1) {
     808           0 :             return 0;
     809             :         }
     810       56472 :         tok->inp = strchr(tok->inp, '\0');
     811       56472 :         if (tok->inp == tok->buf) {
     812           1 :             return 0;
     813             :         }
     814       56471 :     } while (tok->inp[-1] != '\n');
     815       56425 :     return 1;
     816             : }
     817             : 
     818             : static int
     819     8952620 : tok_underflow_string(struct tok_state *tok) {
     820     8952620 :     char *end = strchr(tok->inp, '\n');
     821     8952620 :     if (end != NULL) {
     822     8754170 :         end++;
     823             :     }
     824             :     else {
     825      198448 :         end = strchr(tok->inp, '\0');
     826      198448 :         if (end == tok->inp) {
     827      123990 :             tok->done = E_EOF;
     828      123990 :             return 0;
     829             :         }
     830             :     }
     831     8828630 :     if (tok->start == NULL) {
     832     8106030 :         tok->buf = tok->cur;
     833             :     }
     834     8828630 :     tok->line_start = tok->cur;
     835     8828630 :     tok->lineno++;
     836     8828630 :     tok->inp = end;
     837     8828630 :     return 1;
     838             : }
     839             : 
     840             : static int
     841          52 : tok_underflow_interactive(struct tok_state *tok) {
     842          52 :     if (tok->interactive_underflow == IUNDERFLOW_STOP) {
     843           0 :         tok->done = E_INTERACT_STOP;
     844           0 :         return 1;
     845             :     }
     846          52 :     char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
     847          52 :     if (newtok != NULL) {
     848          52 :         char *translated = translate_newlines(newtok, 0, tok);
     849          52 :         PyMem_Free(newtok);
     850          52 :         if (translated == NULL) {
     851           0 :             return 0;
     852             :         }
     853          52 :         newtok = translated;
     854             :     }
     855          52 :     if (tok->encoding && newtok && *newtok) {
     856             :         /* Recode to UTF-8 */
     857             :         Py_ssize_t buflen;
     858             :         const char* buf;
     859          45 :         PyObject *u = translate_into_utf8(newtok, tok->encoding);
     860          45 :         PyMem_Free(newtok);
     861          45 :         if (u == NULL) {
     862           0 :             tok->done = E_DECODE;
     863           0 :             return 0;
     864             :         }
     865          45 :         buflen = PyBytes_GET_SIZE(u);
     866          45 :         buf = PyBytes_AS_STRING(u);
     867          45 :         newtok = PyMem_Malloc(buflen+1);
     868          45 :         if (newtok == NULL) {
     869           0 :             Py_DECREF(u);
     870           0 :             tok->done = E_NOMEM;
     871           0 :             return 0;
     872             :         }
     873          45 :         strcpy(newtok, buf);
     874          45 :         Py_DECREF(u);
     875             :     }
     876         104 :     if (tok->fp_interactive &&
     877          52 :         tok_concatenate_interactive_new_line(tok, newtok) == -1) {
     878           0 :         PyMem_Free(newtok);
     879           0 :         return 0;
     880             :     }
     881          52 :     if (tok->nextprompt != NULL) {
     882          52 :         tok->prompt = tok->nextprompt;
     883             :     }
     884          52 :     if (newtok == NULL) {
     885           0 :         tok->done = E_INTR;
     886             :     }
     887          52 :     else if (*newtok == '\0') {
     888           7 :         PyMem_Free(newtok);
     889           7 :         tok->done = E_EOF;
     890             :     }
     891          45 :     else if (tok->start != NULL) {
     892          20 :         Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
     893          20 :         size_t size = strlen(newtok);
     894          20 :         tok->lineno++;
     895          20 :         if (!tok_reserve_buf(tok, size + 1)) {
     896           0 :             PyMem_Free(tok->buf);
     897           0 :             tok->buf = NULL;
     898           0 :             PyMem_Free(newtok);
     899           0 :             return 0;
     900             :         }
     901          20 :         memcpy(tok->cur, newtok, size + 1);
     902          20 :         PyMem_Free(newtok);
     903          20 :         tok->inp += size;
     904          20 :         tok->multi_line_start = tok->buf + cur_multi_line_start;
     905             :     }
     906             :     else {
     907          25 :         tok->lineno++;
     908          25 :         PyMem_Free(tok->buf);
     909          25 :         tok->buf = newtok;
     910          25 :         tok->cur = tok->buf;
     911          25 :         tok->line_start = tok->buf;
     912          25 :         tok->inp = strchr(tok->buf, '\0');
     913          25 :         tok->end = tok->inp + 1;
     914             :     }
     915          52 :     if (tok->done != E_OK) {
     916           7 :         if (tok->prompt != NULL) {
     917           7 :             PySys_WriteStderr("\n");
     918             :         }
     919           7 :         return 0;
     920             :     }
     921          45 :     return 1;
     922             : }
     923             : 
     924             : static int
     925       80053 : tok_underflow_file(struct tok_state *tok) {
     926       80053 :     if (tok->start == NULL) {
     927       69569 :         tok->cur = tok->inp = tok->buf;
     928             :     }
     929       80053 :     if (tok->decoding_state == STATE_INIT) {
     930             :         /* We have not yet determined the encoding.
     931             :            If an encoding is found, use the file-pointer
     932             :            reader functions from now on. */
     933         800 :         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
     934           0 :             error_ret(tok);
     935           0 :             return 0;
     936             :         }
     937         800 :         assert(tok->decoding_state != STATE_INIT);
     938             :     }
     939             :     /* Read until '\n' or EOF */
     940       80053 :     if (tok->decoding_readline != NULL) {
     941             :         /* We already have a codec associated with this input. */
     942       23302 :         if (!tok_readline_recode(tok)) {
     943           0 :             return 0;
     944             :         }
     945             :     }
     946             :     else {
     947             :         /* We want a 'raw' read. */
     948       56751 :         if (!tok_readline_raw(tok)) {
     949           1 :             return 0;
     950             :         }
     951             :     }
     952       80052 :     if (tok->inp == tok->cur) {
     953         335 :         tok->done = E_EOF;
     954         335 :         return 0;
     955             :     }
     956       79717 :     if (tok->inp[-1] != '\n') {
     957             :         /* Last line does not end in \n, fake one */
     958          53 :         *tok->inp++ = '\n';
     959          53 :         *tok->inp = '\0';
     960             :     }
     961             : 
     962       79717 :     tok->lineno++;
     963       79717 :     if (tok->decoding_state != STATE_NORMAL) {
     964        1350 :         if (tok->lineno > 2) {
     965         234 :             tok->decoding_state = STATE_NORMAL;
     966             :         }
     967        1116 :         else if (!check_coding_spec(tok->cur, strlen(tok->cur),
     968             :                                     tok, fp_setreadl))
     969             :         {
     970           0 :             return 0;
     971             :         }
     972             :     }
     973             :     /* The default encoding is UTF-8, so make sure we don't have any
     974             :        non-UTF-8 sequences in it. */
     975       79717 :     if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
     976           3 :         error_ret(tok);
     977           3 :         return 0;
     978             :     }
     979       79714 :     assert(tok->done == E_OK);
     980       79714 :     return tok->done == E_OK;
     981             : }
     982             : 
     983             : #if defined(Py_DEBUG)
     984             : static void
     985          21 : print_escape(FILE *f, const char *s, Py_ssize_t size)
     986             : {
     987          21 :     if (s == NULL) {
     988           0 :         fputs("NULL", f);
     989           0 :         return;
     990             :     }
     991          21 :     putc('"', f);
     992         810 :     while (size-- > 0) {
     993         789 :         unsigned char c = *s++;
     994         789 :         switch (c) {
     995          18 :             case '\n': fputs("\\n", f); break;
     996           0 :             case '\r': fputs("\\r", f); break;
     997           0 :             case '\t': fputs("\\t", f); break;
     998           0 :             case '\f': fputs("\\f", f); break;
     999           0 :             case '\'': fputs("\\'", f); break;
    1000           0 :             case '"': fputs("\\\"", f); break;
    1001         771 :             default:
    1002         771 :                 if (0x20 <= c && c <= 0x7f)
    1003         771 :                     putc(c, f);
    1004             :                 else
    1005           0 :                     fprintf(f, "\\x%02x", c);
    1006             :         }
    1007             :     }
    1008          21 :     putc('"', f);
    1009             : }
    1010             : #endif
    1011             : 
    1012             : /* Get next char, updating state; error code goes into tok->done */
    1013             : 
    1014             : static int
    1015   318271000 : tok_nextc(struct tok_state *tok)
    1016             : {
    1017             :     int rc;
    1018             :     for (;;) {
    1019   318271000 :         if (tok->cur != tok->inp) {
    1020   308996000 :             return Py_CHARMASK(*tok->cur++); /* Fast path */
    1021             :         }
    1022     9274950 :         if (tok->done != E_OK) {
    1023      242228 :            return EOF;
    1024             :         }
    1025     9032720 :         if (tok->fp == NULL) {
    1026     8952620 :             rc = tok_underflow_string(tok);
    1027             :         }
    1028       80105 :         else if (tok->prompt != NULL) {
    1029          52 :             rc = tok_underflow_interactive(tok);
    1030             :         }
    1031             :         else {
    1032       80053 :             rc = tok_underflow_file(tok);
    1033             :         }
    1034             : #if defined(Py_DEBUG)
    1035     9032720 :         if (tok->debug) {
    1036          21 :             fprintf(stderr, "line[%d] = ", tok->lineno);
    1037          21 :             print_escape(stderr, tok->cur, tok->inp - tok->cur);
    1038          21 :             fprintf(stderr, "  tok->done = %d\n", tok->done);
    1039             :         }
    1040             : #endif
    1041     9032720 :         if (!rc) {
    1042      124336 :             tok->cur = tok->inp;
    1043      124336 :             return EOF;
    1044             :         }
    1045     8908380 :         tok->line_start = tok->cur;
    1046             :     }
    1047             :     Py_UNREACHABLE();
    1048             : }
    1049             : 
    1050             : /* Back-up one character */
    1051             : 
    1052             : static void
    1053    84640500 : tok_backup(struct tok_state *tok, int c)
    1054             : {
    1055    84640500 :     if (c != EOF) {
    1056    84398300 :         if (--tok->cur < tok->buf) {
    1057           0 :             Py_FatalError("tokenizer beginning of buffer");
    1058             :         }
    1059    84398300 :         if ((int)(unsigned char)*tok->cur != c) {
    1060           0 :             Py_FatalError("tok_backup: wrong character");
    1061             :         }
    1062             :     }
    1063    84640500 : }
    1064             : 
    1065             : static int
    1066         232 : _syntaxerror_range(struct tok_state *tok, const char *format,
    1067             :                    int col_offset, int end_col_offset,
    1068             :                    va_list vargs)
    1069             : {
    1070             :     PyObject *errmsg, *errtext, *args;
    1071         232 :     errmsg = PyUnicode_FromFormatV(format, vargs);
    1072         232 :     if (!errmsg) {
    1073           0 :         goto error;
    1074             :     }
    1075             : 
    1076         232 :     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
    1077             :                                    "replace");
    1078         232 :     if (!errtext) {
    1079           0 :         goto error;
    1080             :     }
    1081             : 
    1082         232 :     if (col_offset == -1) {
    1083         219 :         col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    1084             :     }
    1085         232 :     if (end_col_offset == -1) {
    1086         219 :         end_col_offset = col_offset;
    1087             :     }
    1088             : 
    1089         232 :     Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    1090         232 :     if (line_len != tok->cur - tok->line_start) {
    1091         156 :         Py_DECREF(errtext);
    1092         156 :         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
    1093             :                                        "replace");
    1094             :     }
    1095         232 :     if (!errtext) {
    1096           0 :         goto error;
    1097             :     }
    1098             : 
    1099         232 :     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
    1100             :                          col_offset, errtext, tok->lineno, end_col_offset);
    1101         232 :     if (args) {
    1102         232 :         PyErr_SetObject(PyExc_SyntaxError, args);
    1103         232 :         Py_DECREF(args);
    1104             :     }
    1105             : 
    1106           0 : error:
    1107         232 :     Py_XDECREF(errmsg);
    1108         232 :     tok->done = E_ERROR;
    1109         232 :     return ERRORTOKEN;
    1110             : }
    1111             : 
    1112             : static int
    1113         219 : syntaxerror(struct tok_state *tok, const char *format, ...)
    1114             : {
    1115             :     va_list vargs;
    1116         219 :     va_start(vargs, format);
    1117         219 :     int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    1118         219 :     va_end(vargs);
    1119         219 :     return ret;
    1120             : }
    1121             : 
    1122             : static int
    1123          13 : syntaxerror_known_range(struct tok_state *tok,
    1124             :                         int col_offset, int end_col_offset,
    1125             :                         const char *format, ...)
    1126             : {
    1127             :     va_list vargs;
    1128          13 :     va_start(vargs, format);
    1129          13 :     int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    1130          13 :     va_end(vargs);
    1131          13 :     return ret;
    1132             : }
    1133             : 
    1134             : 
    1135             : 
    1136             : static int
    1137           2 : indenterror(struct tok_state *tok)
    1138             : {
    1139           2 :     tok->done = E_TABSPACE;
    1140           2 :     tok->cur = tok->inp;
    1141           2 :     return ERRORTOKEN;
    1142             : }
    1143             : 
    1144             : static int
    1145         130 : parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
    1146             : {
    1147             :     PyObject *errmsg;
    1148             :     va_list vargs;
    1149         130 :     va_start(vargs, format);
    1150         130 :     errmsg = PyUnicode_FromFormatV(format, vargs);
    1151         130 :     va_end(vargs);
    1152         130 :     if (!errmsg) {
    1153           0 :         goto error;
    1154             :     }
    1155             : 
    1156         130 :     if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
    1157             :                                  tok->lineno, NULL, NULL) < 0) {
    1158          64 :         if (PyErr_ExceptionMatches(category)) {
    1159             :             /* Replace the DeprecationWarning exception with a SyntaxError
    1160             :                to get a more accurate error report */
    1161          64 :             PyErr_Clear();
    1162          64 :             syntaxerror(tok, "%U", errmsg);
    1163             :         }
    1164          64 :         goto error;
    1165             :     }
    1166          66 :     Py_DECREF(errmsg);
    1167          66 :     return 0;
    1168             : 
    1169          64 : error:
    1170          64 :     Py_XDECREF(errmsg);
    1171          64 :     tok->done = E_ERROR;
    1172          64 :     return -1;
    1173             : }
    1174             : 
    1175             : static int
    1176          91 : lookahead(struct tok_state *tok, const char *test)
    1177             : {
    1178          91 :     const char *s = test;
    1179          91 :     int res = 0;
    1180         158 :     while (1) {
    1181         249 :         int c = tok_nextc(tok);
    1182         249 :         if (*s == 0) {
    1183          82 :             res = !is_potential_identifier_char(c);
    1184             :         }
    1185         167 :         else if (c == *s) {
    1186         158 :             s++;
    1187         158 :             continue;
    1188             :         }
    1189             : 
    1190          91 :         tok_backup(tok, c);
    1191         249 :         while (s != test) {
    1192         158 :             tok_backup(tok, *--s);
    1193             :         }
    1194          91 :         return res;
    1195             :     }
    1196             : }
    1197             : 
    1198             : static int
    1199     2913270 : verify_end_of_number(struct tok_state *tok, int c, const char *kind)
    1200             : {
    1201             :     /* Emit a deprecation warning only if the numeric literal is immediately
    1202             :      * followed by one of keywords which can occur after a numeric literal
    1203             :      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
    1204             :      * It allows to gradually deprecate existing valid code without adding
    1205             :      * warning before error in most cases of invalid numeric literal (which
    1206             :      * would be confusing and break existing tests).
    1207             :      * Raise a syntax error with slightly better message than plain
    1208             :      * "invalid syntax" if the numeric literal is immediately followed by
    1209             :      * other keyword or identifier.
    1210             :      */
    1211     2913270 :     int r = 0;
    1212     2913270 :     if (c == 'a') {
    1213          14 :         r = lookahead(tok, "nd");
    1214             :     }
    1215     2913250 :     else if (c == 'e') {
    1216          24 :         r = lookahead(tok, "lse");
    1217             :     }
    1218     2913230 :     else if (c == 'f') {
    1219          14 :         r = lookahead(tok, "or");
    1220             :     }
    1221     2913220 :     else if (c == 'i') {
    1222          48 :         int c2 = tok_nextc(tok);
    1223          48 :         if (c2 == 'f' || c2 == 'n' || c2 == 's') {
    1224          48 :             r = 1;
    1225             :         }
    1226          48 :         tok_backup(tok, c2);
    1227             :     }
    1228     2913170 :     else if (c == 'o') {
    1229          22 :         r = lookahead(tok, "r");
    1230             :     }
    1231     2913140 :     else if (c == 'n') {
    1232          17 :         r = lookahead(tok, "ot");
    1233             :     }
    1234     2913270 :     if (r) {
    1235         130 :         tok_backup(tok, c);
    1236         130 :         if (parser_warn(tok, PyExc_SyntaxWarning,
    1237             :                 "invalid %s literal", kind))
    1238             :         {
    1239          64 :             return 0;
    1240             :         }
    1241          66 :         tok_nextc(tok);
    1242             :     }
    1243             :     else /* In future releases, only error will remain. */
    1244     2913140 :     if (is_potential_identifier_char(c)) {
    1245          24 :         tok_backup(tok, c);
    1246          24 :         syntaxerror(tok, "invalid %s literal", kind);
    1247          24 :         return 0;
    1248             :     }
    1249     2913180 :     return 1;
    1250             : }
    1251             : 
    1252             : /* Verify that the identifier follows PEP 3131.
    1253             :    All identifier strings are guaranteed to be "ready" unicode objects.
                     :
                     :    The candidate is the byte range tok->start .. tok->cur, which is
                     :    decoded as UTF-8 and handed to _PyUnicode_ScanIdentifier().
                     :
                     :    Returns 1 when the scan accepts the whole decoded string and 0
                     :    otherwise.  On the 0 paths:
                     :      - tok->decoding_erred already set: nothing else is touched;
                     :      - the UTF-8 decode fails: tok->done becomes E_DECODE for a
                     :        UnicodeDecodeError and E_ERROR for any other exception;
                     :      - the scan, or the offset computation below, fails: tok->done
                     :        becomes E_ERROR;
                     :      - a character is rejected: tok->cur ends up just past that
                     :        character and syntaxerror() is called with a message that
                     :        names it.
    1254             :  */
    1255             : static int
    1256         214 : verify_identifier(struct tok_state *tok)
    1257             : {
    1258             :     PyObject *s;
    1259         214 :     if (tok->decoding_erred)
    1260           0 :         return 0;
    1261         214 :     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    1262         214 :     if (s == NULL) {
    1263           4 :         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
    1264           4 :             tok->done = E_DECODE;
    1265             :         }
    1266             :         else {
    1267           0 :             tok->done = E_ERROR;
    1268             :         }
    1269           4 :         return 0;
    1270             :     }
    1271         210 :     Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    1272         210 :     if (invalid < 0) {  /* a negative result is treated as a failed scan */
    1273           0 :         Py_DECREF(s);
    1274           0 :         tok->done = E_ERROR;
    1275           0 :         return 0;
    1276             :     }
    1277         210 :     assert(PyUnicode_GET_LENGTH(s) > 0);
    1278         210 :     if (invalid < PyUnicode_GET_LENGTH(s)) {  /* s[invalid] was rejected */
    1279          10 :         Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
    1280          10 :         if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
    1281             :             /* Determine the offset in UTF-8 encoded input: the rejected
                     :                character is not the last one, so take the prefix that
                     :                ends with it, re-encode that prefix as UTF-8 and use its
                     :                byte length as the new distance from tok->start to
                     :                tok->cur.  (When the rejected character is the last one,
                     :                tok->cur already points just past it.) */
    1282           0 :             Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
    1283           0 :             if (s != NULL) {
    1284           0 :                 Py_SETREF(s, PyUnicode_AsUTF8String(s));
    1285             :             }
    1286           0 :             if (s == NULL) {  /* either of the two conversions failed */
    1287           0 :                 tok->done = E_ERROR;
    1288           0 :                 return 0;
    1289             :             }
    1290           0 :             tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
    1291             :         }
    1292          10 :         Py_DECREF(s);  /* s is the decoded str, or the bytes prefix built above */
    1293             :         // PyUnicode_FromFormatV() does not support %X
    1294             :         char hex[9];  /* room for 8 hex digits plus the terminating NUL */
    1295          10 :         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
    1296          10 :         if (Py_UNICODE_ISPRINTABLE(ch)) {
    1297           7 :             syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
    1298             :         }
    1299             :         else {
    1300           3 :             syntaxerror(tok, "invalid non-printable character U+%s", hex);
    1301             :         }
    1302          10 :         return 0;
    1303             :     }
    1304         200 :     Py_DECREF(s);
    1305         200 :     return 1;
    1306             : }
    1307             : /* Consume the rest of a run of decimal digits, in which a single
                     :    underscore may appear between two digits.
                     :
                     :    Reading starts with the character after the one the caller last
                     :    read: the inner loop calls tok_nextc() before it tests anything.
                     :
                     :    Returns the first character that ends the run.  That character has
                     :    been read and is not pushed back here.  If an underscore is not
                     :    directly followed by a digit, the offending character is pushed
                     :    back, syntaxerror() is called with "invalid decimal literal" and 0
                     :    is returned; the callers visible in tok_get() turn that 0 into
                     :    ERRORTOKEN.
                     :  */
    1308             : static int
    1309     4658080 : tok_decimal_tail(struct tok_state *tok)
    1310             : {
    1311             :     int c;
    1312             : 
    1313        1871 :     while (1) {
    1314             :         do {
    1315     4658080 :             c = tok_nextc(tok);
    1316     4658080 :         } while (isdigit(c));
    1317     2045350 :         if (c != '_') {  /* not a separator: the digit run is over */
    1318     2043450 :             break;
    1319             :         }
    1320        1898 :         c = tok_nextc(tok);
    1321        1898 :         if (!isdigit(c)) {  /* an underscore must be followed by a digit */
    1322          27 :             tok_backup(tok, c);
    1323          27 :             syntaxerror(tok, "invalid decimal literal");
    1324          27 :             return 0;
    1325             :         }
    1326             :     }
    1327     2043450 :     return c;
    1328             : }
    1329             : 
    1330             : /* Get next token, after space stripping etc. */
    1331             : /* Finish a backslash line continuation.  The caller visible in
                     :    tok_get() invokes this right after it has read the backslash.
                     :
                     :    The next character must be a newline; if it is not, tok->done is
                     :    set to E_LINECONT and -1 is returned.  The first character of the
                     :    following line is then read: at EOF, tok->done is set to E_EOF,
                     :    tok->cur is moved to tok->inp and -1 is returned; otherwise the
                     :    character is pushed back and also returned, so the caller gets a
                     :    peek at it without consuming it.
                     :
                     :    NOTE(review): the one-line heading directly above reads like a
                     :    description of tok_get(), which is defined after this helper --
                     :    confirm, and consider moving it there.
                     :  */
    1332             : static inline int
    1333        4489 : tok_continuation_line(struct tok_state *tok) {
    1334        4489 :     int c = tok_nextc(tok);
    1335        4489 :     if (c != '\n') {  /* something other than a newline follows the backslash */
    1336           7 :         tok->done = E_LINECONT;
    1337           7 :         return -1;
    1338             :     }
    1339        4482 :     c = tok_nextc(tok);
    1340        4482 :     if (c == EOF) {  /* the input ends right after the continuation */
    1341          11 :         tok->done = E_EOF;
    1342          11 :         tok->cur = tok->inp;
    1343          11 :         return -1;
    1344             :     } else {
    1345        4471 :         tok_backup(tok, c);
    1346             :     }
    1347        4471 :     return c;
    1348             : }
    1349             : 
    1350             : static int
    1351    39583700 : tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
    1352             : {
    1353             :     int c;
    1354             :     int blankline, nonascii;
    1355             : 
    1356    39583700 :     *p_start = *p_end = NULL;
    1357    44376100 :   nextline:
    1358    44376100 :     tok->start = NULL;
    1359    44376100 :     blankline = 0;
    1360             : 
    1361             :     /* Get indentation level */
    1362    44376100 :     if (tok->atbol) {
    1363     8225190 :         int col = 0;
    1364     8225190 :         int altcol = 0;
    1365     8225190 :         tok->atbol = 0;
    1366     8225190 :         int cont_line_col = 0;
    1367             :         for (;;) {
    1368    48308800 :             c = tok_nextc(tok);
    1369    48308800 :             if (c == ' ') {
    1370    40082400 :                 col++, altcol++;
    1371             :             }
    1372     8226470 :             else if (c == '\t') {
    1373         915 :                 col = (col / tok->tabsize + 1) * tok->tabsize;
    1374         915 :                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
    1375             :             }
    1376     8225550 :             else if (c == '\014')  {/* Control-L (formfeed) */
    1377         334 :                 col = altcol = 0; /* For Emacs users */
    1378             :             }
    1379     8225220 :             else if (c == '\\') {
    1380             :                 // Indentation cannot be split over multiple physical lines
    1381             :                 // using backslashes. This means that if we found a backslash
    1382             :                 // preceded by whitespace, **the first one we find** determines
    1383             :                 // the level of indentation of whatever comes next.
    1384          31 :                 cont_line_col = cont_line_col ? cont_line_col : col;
    1385          31 :                 if ((c = tok_continuation_line(tok)) == -1) {
    1386           2 :                     return ERRORTOKEN;
    1387             :                 }
    1388             :             }
    1389             :             else {
    1390     8225190 :                 break;
    1391             :             }
    1392             :         }
    1393     8225190 :         tok_backup(tok, c);
    1394     8225190 :         if (c == '#' || c == '\n') {
    1395             :             /* Lines with only whitespace and/or comments
    1396             :                shouldn't affect the indentation and are
    1397             :                not passed to the parser as NEWLINE tokens,
    1398             :                except *totally* empty lines in interactive
    1399             :                mode, which signal the end of a command group. */
    1400     3628260 :             if (col == 0 && c == '\n' && tok->prompt != NULL) {
    1401           2 :                 blankline = 0; /* Let it through */
    1402             :             }
    1403     3628260 :             else if (tok->prompt != NULL && tok->lineno == 1) {
    1404             :                 /* In interactive mode, if the first line contains
    1405             :                    only spaces and/or a comment, let it through. */
    1406           0 :                 blankline = 0;
    1407           0 :                 col = altcol = 0;
    1408             :             }
    1409             :             else {
    1410     3628260 :                 blankline = 1; /* Ignore completely */
    1411             :             }
    1412             :             /* We can't jump back right here since we still
    1413             :                may need to skip to the end of a comment */
    1414             :         }
    1415     8225190 :         if (!blankline && tok->level == 0) {
    1416     3432800 :             col = cont_line_col ? cont_line_col : col;
    1417     3432800 :             altcol = cont_line_col ? cont_line_col : altcol;
    1418     3432800 :             if (col == tok->indstack[tok->indent]) {
    1419             :                 /* No change */
    1420     1645700 :                 if (altcol != tok->altindstack[tok->indent]) {
    1421           2 :                     return indenterror(tok);
    1422             :                 }
    1423             :             }
    1424     1787110 :             else if (col > tok->indstack[tok->indent]) {
    1425             :                 /* Indent -- always one */
    1426      997324 :                 if (tok->indent+1 >= MAXINDENT) {
    1427           0 :                     tok->done = E_TOODEEP;
    1428           0 :                     tok->cur = tok->inp;
    1429           0 :                     return ERRORTOKEN;
    1430             :                 }
    1431      997324 :                 if (altcol <= tok->altindstack[tok->indent]) {
    1432           0 :                     return indenterror(tok);
    1433             :                 }
    1434      997324 :                 tok->pendin++;
    1435      997324 :                 tok->indstack[++tok->indent] = col;
    1436      997324 :                 tok->altindstack[tok->indent] = altcol;
    1437             :             }
    1438             :             else /* col < tok->indstack[tok->indent] */ {
    1439             :                 /* Dedent -- any number, must be consistent */
    1440     1787070 :                 while (tok->indent > 0 &&
    1441     1652600 :                     col < tok->indstack[tok->indent]) {
    1442      997281 :                     tok->pendin--;
    1443      997281 :                     tok->indent--;
    1444             :                 }
    1445      789785 :                 if (col != tok->indstack[tok->indent]) {
    1446           6 :                     tok->done = E_DEDENT;
    1447           6 :                     tok->cur = tok->inp;
    1448           6 :                     return ERRORTOKEN;
    1449             :                 }
    1450      789779 :                 if (altcol != tok->altindstack[tok->indent]) {
    1451           0 :                     return indenterror(tok);
    1452             :                 }
    1453             :             }
    1454             :         }
    1455             :     }
    1456             : 
    1457    44376000 :     tok->start = tok->cur;
    1458             : 
    1459             :     /* Return pending indents/dedents */
    1460    44376000 :     if (tok->pendin != 0) {
    1461     1994610 :         if (tok->pendin < 0) {
    1462      997282 :             tok->pendin++;
    1463      997282 :             return DEDENT;
    1464             :         }
    1465             :         else {
    1466      997324 :             tok->pendin--;
    1467      997324 :             return INDENT;
    1468             :         }
    1469             :     }
    1470             : 
    1471             :     /* Peek ahead at the next character */
    1472    42381400 :     c = tok_nextc(tok);
    1473    42381400 :     tok_backup(tok, c);
    1474             :     /* Check if we are closing an async function */
    1475    42381400 :     if (tok->async_def
    1476         141 :         && !blankline
    1477             :         /* Due to some implementation artifacts of type comments,
    1478             :          * a TYPE_COMMENT at the start of a function won't set an
    1479             :          * indentation level and it will produce a NEWLINE after it.
    1480             :          * To avoid spuriously ending an async function due to this,
    1481             :          * wait until we have some non-newline char in front of us. */
    1482         135 :         && c != '\n'
    1483         114 :         && tok->level == 0
    1484             :         /* There was a NEWLINE after ASYNC DEF,
    1485             :            so we're past the signature. */
    1486          75 :         && tok->async_def_nl
    1487             :         /* Current indentation level is less than where
    1488             :            the async function was defined */
    1489          36 :         && tok->async_def_indent >= tok->indent)
    1490             :     {
    1491           9 :         tok->async_def = 0;
    1492           9 :         tok->async_def_indent = 0;
    1493           9 :         tok->async_def_nl = 0;
    1494             :     }
    1495             : 
    1496    42381400 :  again:
    1497    42385900 :     tok->start = NULL;
    1498             :     /* Skip spaces */
    1499             :     do {
    1500    52004300 :         c = tok_nextc(tok);
    1501    52004300 :     } while (c == ' ' || c == '\t' || c == '\014');
    1502             : 
    1503             :     /* Set start of current token */
    1504    42385900 :     tok->start = tok->cur - 1;
    1505             : 
    1506             :     /* Skip comment, unless it's a type comment */
    1507    42385900 :     if (c == '#') {
    1508             :         const char *prefix, *p, *type_start;
    1509             : 
    1510    22033900 :         while (c != EOF && c != '\n') {
    1511    21228300 :             c = tok_nextc(tok);
    1512             :         }
    1513             : 
    1514      805673 :         if (tok->type_comments) {
    1515         611 :             p = tok->start;
    1516         611 :             prefix = type_comment_prefix;
    1517        5499 :             while (*prefix && p < tok->cur) {
    1518        4888 :                 if (*prefix == ' ') {
    1519        2443 :                     while (*p == ' ' || *p == '\t') {
    1520        1221 :                         p++;
    1521             :                     }
    1522        3666 :                 } else if (*prefix == *p) {
    1523        3666 :                     p++;
    1524             :                 } else {
    1525           0 :                     break;
    1526             :                 }
    1527             : 
    1528        4888 :                 prefix++;
    1529             :             }
    1530             : 
    1531             :             /* This is a type comment if we matched all of type_comment_prefix. */
    1532         611 :             if (!*prefix) {
    1533         611 :                 int is_type_ignore = 1;
    1534         611 :                 const char *ignore_end = p + 6;
    1535         611 :                 tok_backup(tok, c);  /* don't eat the newline or EOF */
    1536             : 
    1537         611 :                 type_start = p;
    1538             : 
    1539             :                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
    1540             :                  * or anything ASCII and non-alphanumeric. */
    1541         611 :                 is_type_ignore = (
    1542         165 :                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
    1543         836 :                     && !(tok->cur > ignore_end
    1544          60 :                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
    1545             : 
    1546         611 :                 if (is_type_ignore) {
    1547          74 :                     *p_start = ignore_end;
    1548          74 :                     *p_end = tok->cur;
    1549             : 
    1550             :                     /* If this type ignore is the only thing on the line, consume the newline also. */
    1551          74 :                     if (blankline) {
    1552           0 :                         tok_nextc(tok);
    1553           0 :                         tok->atbol = 1;
    1554             :                     }
    1555          74 :                     return TYPE_IGNORE;
    1556             :                 } else {
    1557         537 :                     *p_start = type_start;  /* after type_comment_prefix */
    1558         537 :                     *p_end = tok->cur;
    1559         537 :                     return TYPE_COMMENT;
    1560             :                 }
    1561             :             }
    1562             :         }
    1563             :     }
    1564             : 
    1565    42385300 :     if (tok->done == E_INTERACT_STOP) {
    1566           0 :         return ENDMARKER;
    1567             :     }
    1568             : 
    1569             :     /* Check for EOF and errors now */
    1570    42385300 :     if (c == EOF) {
    1571      124369 :         if (tok->level) {
    1572         197 :             return ERRORTOKEN;
    1573             :         }
    1574      124172 :         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    1575             :     }
    1576             : 
    1577             :     /* Identifier (most frequent token!) */
    1578    42260900 :     nonascii = 0;
    1579    42260900 :     if (is_potential_identifier_start(c)) {
    1580             :         /* Process the various legal combinations of b"", r"", u"", and f"". */
    1581    12536100 :         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
    1582             :         while (1) {
    1583    14714200 :             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
    1584      764889 :                 saw_b = 1;
    1585             :             /* Since this is a backwards compatibility support literal we don't
    1586             :                want to support it in arbitrary order like byte literals. */
    1587    13949300 :             else if (!(saw_b || saw_u || saw_r || saw_f)
    1588    11771400 :                      && (c == 'u'|| c == 'U')) {
    1589      130866 :                 saw_u = 1;
    1590             :             }
    1591             :             /* ur"" and ru"" are not supported */
    1592    13818500 :             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
    1593      774232 :                 saw_r = 1;
    1594             :             }
    1595    13044200 :             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
    1596      599208 :                 saw_f = 1;
    1597             :             }
    1598             :             else {
    1599             :                 break;
    1600             :             }
    1601     2269200 :             c = tok_nextc(tok);
    1602     2269200 :             if (c == '"' || c == '\'') {
    1603       91048 :                 goto letter_quote;
    1604             :             }
    1605             :         }
    1606    79156100 :         while (is_potential_identifier_char(c)) {
    1607    66711000 :             if (c >= 128) {
    1608        1999 :                 nonascii = 1;
    1609             :             }
    1610    66711000 :             c = tok_nextc(tok);
    1611             :         }
    1612    12445000 :         tok_backup(tok, c);
    1613    12445000 :         if (nonascii && !verify_identifier(tok)) {
    1614          14 :             return ERRORTOKEN;
    1615             :         }
    1616             : 
    1617    12445000 :         *p_start = tok->start;
    1618    12445000 :         *p_end = tok->cur;
    1619             : 
    1620             :         /* async/await parsing block. */
    1621    12445000 :         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
    1622             :             /* May be an 'async' or 'await' token.  For Python 3.7 or
    1623             :                later we recognize them unconditionally.  For Python
    1624             :                3.5 or 3.6 we recognize 'async' in front of 'def', and
    1625             :                either one inside of 'async def'.  (Technically we
    1626             :                shouldn't recognize these at all for 3.4 or earlier,
    1627             :                but there's no *valid* Python 3.4 code that would be
    1628             :                rejected, and async functions will be rejected in a
    1629             :                later phase.) */
    1630       22523 :             if (!tok->async_hacks || tok->async_def) {
    1631             :                 /* Always recognize the keywords. */
    1632       22508 :                 if (memcmp(tok->start, "async", 5) == 0) {
    1633        4306 :                     return ASYNC;
    1634             :                 }
    1635       18202 :                 if (memcmp(tok->start, "await", 5) == 0) {
    1636        2426 :                     return AWAIT;
    1637             :                 }
    1638             :             }
    1639          15 :             else if (memcmp(tok->start, "async", 5) == 0) {
    1640             :                 /* The current token is 'async'.
    1641             :                    Look ahead one token to see if that is 'def'. */
    1642             : 
    1643             :                 struct tok_state ahead_tok;
    1644          12 :                 const char *ahead_tok_start = NULL;
    1645          12 :                 const char *ahead_tok_end = NULL;
    1646             :                 int ahead_tok_kind;
    1647             : 
    1648          12 :                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
    1649          12 :                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
    1650             :                                          &ahead_tok_end);
    1651             : 
    1652          12 :                 if (ahead_tok_kind == NAME
    1653           9 :                     && ahead_tok.cur - ahead_tok.start == 3
    1654           9 :                     && memcmp(ahead_tok.start, "def", 3) == 0)
    1655             :                 {
    1656             :                     /* The next token is going to be 'def', so instead of
    1657             :                        returning a plain NAME token, return ASYNC. */
    1658           9 :                     tok->async_def_indent = tok->indent;
    1659           9 :                     tok->async_def = 1;
    1660           9 :                     return ASYNC;
    1661             :                 }
    1662             :             }
    1663             :         }
    1664             : 
    1665    12438300 :         return NAME;
    1666             :     }
    1667             : 
    1668             :     /* Newline */
    1669    29724800 :     if (c == '\n') {
    1670     8100170 :         tok->atbol = 1;
    1671     8100170 :         if (blankline || tok->level > 0) {
    1672     4792350 :             goto nextline;
    1673             :         }
    1674     3307820 :         *p_start = tok->start;
    1675     3307820 :         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
    1676     3307820 :         tok->cont_line = 0;
    1677     3307820 :         if (tok->async_def) {
    1678             :             /* We're somewhere inside an 'async def' function, and
    1679             :                we've encountered a NEWLINE after its signature. */
    1680          21 :             tok->async_def_nl = 1;
    1681             :         }
    1682     3307820 :         return NEWLINE;
    1683             :     }
    1684             : 
    1685             :     /* Period or number starting with period? */
    1686    21624700 :     if (c == '.') {
    1687     2181670 :         c = tok_nextc(tok);
    1688     2181670 :         if (isdigit(c)) {
    1689         168 :             goto fraction;
    1690     2181500 :         } else if (c == '.') {
    1691        5886 :             c = tok_nextc(tok);
    1692        5886 :             if (c == '.') {
    1693        5233 :                 *p_start = tok->start;
    1694        5233 :                 *p_end = tok->cur;
    1695        5233 :                 return ELLIPSIS;
    1696             :             }
    1697             :             else {
    1698         653 :                 tok_backup(tok, c);
    1699             :             }
    1700         653 :             tok_backup(tok, '.');
    1701             :         }
    1702             :         else {
    1703     2175620 :             tok_backup(tok, c);
    1704             :         }
    1705     2176270 :         *p_start = tok->start;
    1706     2176270 :         *p_end = tok->cur;
    1707     2176270 :         return DOT;
    1708             :     }
    1709             : 
    1710             :     /* Number */
    1711    19443000 :     if (isdigit(c)) {
    1712     2913200 :         if (c == '0') {
    1713             :             /* Hex, octal or binary -- maybe. */
    1714      898262 :             c = tok_nextc(tok);
    1715      898262 :             if (c == 'x' || c == 'X') {
    1716             :                 /* Hex */
    1717       96464 :                 c = tok_nextc(tok);
    1718             :                 do {
    1719       96570 :                     if (c == '_') {
    1720         110 :                         c = tok_nextc(tok);
    1721             :                     }
    1722       96570 :                     if (!isxdigit(c)) {
    1723          16 :                         tok_backup(tok, c);
    1724          16 :                         return syntaxerror(tok, "invalid hexadecimal literal");
    1725             :                     }
    1726             :                     do {
    1727      518134 :                         c = tok_nextc(tok);
    1728      518134 :                     } while (isxdigit(c));
    1729       96554 :                 } while (c == '_');
    1730       96448 :                 if (!verify_end_of_number(tok, c, "hexadecimal")) {
    1731          12 :                     return ERRORTOKEN;
    1732             :                 }
    1733             :             }
    1734      801798 :             else if (c == 'o' || c == 'O') {
    1735             :                 /* Octal */
    1736        2044 :                 c = tok_nextc(tok);
    1737             :                 do {
    1738        2054 :                     if (c == '_') {
    1739          12 :                         c = tok_nextc(tok);
    1740             :                     }
    1741        2054 :                     if (c < '0' || c >= '8') {
    1742          12 :                         if (isdigit(c)) {
    1743           5 :                             return syntaxerror(tok,
    1744             :                                     "invalid digit '%c' in octal literal", c);
    1745             :                         }
    1746             :                         else {
    1747           7 :                             tok_backup(tok, c);
    1748           7 :                             return syntaxerror(tok, "invalid octal literal");
    1749             :                         }
    1750             :                     }
    1751             :                     do {
    1752        8450 :                         c = tok_nextc(tok);
    1753        8450 :                     } while ('0' <= c && c < '8');
    1754        2042 :                 } while (c == '_');
    1755        2032 :                 if (isdigit(c)) {
    1756           3 :                     return syntaxerror(tok,
    1757             :                             "invalid digit '%c' in octal literal", c);
    1758             :                 }
    1759        2029 :                 if (!verify_end_of_number(tok, c, "octal")) {
    1760          11 :                     return ERRORTOKEN;
    1761             :                 }
    1762             :             }
    1763      799754 :             else if (c == 'b' || c == 'B') {
    1764             :                 /* Binary */
    1765         387 :                 c = tok_nextc(tok);
    1766             :                 do {
    1767         413 :                     if (c == '_') {
    1768          28 :                         c = tok_nextc(tok);
    1769             :                     }
    1770         413 :                     if (c != '0' && c != '1') {
    1771          12 :                         if (isdigit(c)) {
    1772           5 :                             return syntaxerror(tok,
    1773             :                                     "invalid digit '%c' in binary literal", c);
    1774             :                         }
    1775             :                         else {
    1776           7 :                             tok_backup(tok, c);
    1777           7 :                             return syntaxerror(tok, "invalid binary literal");
    1778             :                         }
    1779             :                     }
    1780             :                     do {
    1781        4927 :                         c = tok_nextc(tok);
    1782        4927 :                     } while (c == '0' || c == '1');
    1783         401 :                 } while (c == '_');
    1784         375 :                 if (isdigit(c)) {
    1785           2 :                     return syntaxerror(tok,
    1786             :                             "invalid digit '%c' in binary literal", c);
    1787             :                 }
    1788         373 :                 if (!verify_end_of_number(tok, c, "binary")) {
    1789          11 :                     return ERRORTOKEN;
    1790             :                 }
    1791             :             }
    1792             :             else {
    1793      799367 :                 int nonzero = 0;
    1794             :                 /* maybe old-style octal; c is first char of it */
    1795             :                 /* in any case, allow '0' as a literal */
    1796             :                 while (1) {
    1797      799574 :                     if (c == '_') {
    1798          13 :                         c = tok_nextc(tok);
    1799          13 :                         if (!isdigit(c)) {
    1800           4 :                             tok_backup(tok, c);
    1801           4 :                             return syntaxerror(tok, "invalid decimal literal");
    1802             :                         }
    1803             :                     }
    1804      799570 :                     if (c != '0') {
    1805      799363 :                         break;
    1806             :                     }
    1807         207 :                     c = tok_nextc(tok);
    1808             :                 }
    1809      799363 :                 char* zeros_end = tok->cur;
    1810      799363 :                 if (isdigit(c)) {
    1811          26 :                     nonzero = 1;
    1812          26 :                     c = tok_decimal_tail(tok);
    1813          26 :                     if (c == 0) {
    1814           0 :                         return ERRORTOKEN;
    1815             :                     }
    1816             :                 }
    1817      799363 :                 if (c == '.') {
    1818        8839 :                     c = tok_nextc(tok);
    1819        8839 :                     goto fraction;
    1820             :                 }
    1821      790524 :                 else if (c == 'e' || c == 'E') {
    1822          13 :                     goto exponent;
    1823             :                 }
    1824      790511 :                 else if (c == 'j' || c == 'J') {
    1825         509 :                     goto imaginary;
    1826             :                 }
    1827      790002 :                 else if (nonzero) {
    1828             :                     /* Old-style octal: now disallowed. */
    1829          13 :                     tok_backup(tok, c);
    1830          13 :                     return syntaxerror_known_range(
    1831          13 :                             tok, (int)(tok->start + 1 - tok->line_start),
    1832          13 :                             (int)(zeros_end - tok->line_start),
    1833             :                             "leading zeros in decimal integer "
    1834             :                             "literals are not permitted; "
    1835             :                             "use an 0o prefix for octal integers");
    1836             :                 }
    1837      789989 :                 if (!verify_end_of_number(tok, c, "decimal")) {
    1838           7 :                     return ERRORTOKEN;
    1839             :                 }
    1840             :             }
    1841             :         }
    1842             :         else {
    1843             :             /* Decimal */
    1844     2014930 :             c = tok_decimal_tail(tok);
    1845     2014930 :             if (c == 0) {
    1846          11 :                 return ERRORTOKEN;
    1847             :             }
    1848             :             {
    1849             :                 /* Accept floating point numbers. */
    1850     2014920 :                 if (c == '.') {
    1851       16316 :                     c = tok_nextc(tok);
    1852       25323 :         fraction:
    1853             :                     /* Fraction */
    1854       25323 :                     if (isdigit(c)) {
    1855       24105 :                         c = tok_decimal_tail(tok);
    1856       24105 :                         if (c == 0) {
    1857          10 :                             return ERRORTOKEN;
    1858             :                         }
    1859             :                     }
    1860             :                 }
    1861     2023920 :                 if (c == 'e' || c == 'E') {
    1862             :                     int e;
    1863        4425 :                   exponent:
    1864        4438 :                     e = c;
    1865             :                     /* Exponent part */
    1866        4438 :                     c = tok_nextc(tok);
    1867        4438 :                     if (c == '+' || c == '-') {
    1868        3131 :                         c = tok_nextc(tok);
    1869        3131 :                         if (!isdigit(c)) {
    1870           8 :                             tok_backup(tok, c);
    1871           8 :                             return syntaxerror(tok, "invalid decimal literal");
    1872             :                         }
    1873        1307 :                     } else if (!isdigit(c)) {
    1874          15 :                         tok_backup(tok, c);
    1875          15 :                         if (!verify_end_of_number(tok, e, "decimal")) {
    1876          10 :                             return ERRORTOKEN;
    1877             :                         }
    1878           5 :                         tok_backup(tok, e);
    1879           5 :                         *p_start = tok->start;
    1880           5 :                         *p_end = tok->cur;
    1881           5 :                         return NUMBER;
    1882             :                     }
    1883        4415 :                     c = tok_decimal_tail(tok);
    1884        4415 :                     if (c == 0) {
    1885           6 :                         return ERRORTOKEN;
    1886             :                     }
    1887             :                 }
    1888     2023900 :                 if (c == 'j' || c == 'J') {
    1889             :                     /* Imaginary part */
    1890        1639 :         imaginary:
    1891        2148 :                     c = tok_nextc(tok);
    1892        2148 :                     if (!verify_end_of_number(tok, c, "imaginary")) {
    1893          10 :                         return ERRORTOKEN;
    1894             :                     }
    1895             :                 }
    1896     2022260 :                 else if (!verify_end_of_number(tok, c, "decimal")) {
    1897          27 :                     return ERRORTOKEN;
    1898             :                 }
    1899             :             }
    1900             :         }
    1901     2913170 :         tok_backup(tok, c);
    1902     2913170 :         *p_start = tok->start;
    1903     2913170 :         *p_end = tok->cur;
    1904     2913170 :         return NUMBER;
    1905             :     }
    1906             : 
    1907    16529800 :   letter_quote:
    1908             :     /* String */
    1909    16620800 :     if (c == '\'' || c == '"') {
    1910     1691840 :         int quote = c;
    1911     1691840 :         int quote_size = 1;             /* 1 or 3 */
    1912     1691840 :         int end_quote_size = 0;
    1913             : 
    1914             :         /* Nodes of type STRING, especially multi line strings
    1915             :            must be handled differently in order to get both
    1916             :            the starting line number and the column offset right.
    1917             :            (cf. issue 16806) */
    1918     1691840 :         tok->first_lineno = tok->lineno;
    1919     1691840 :         tok->multi_line_start = tok->line_start;
    1920             : 
    1921             :         /* Find the quote size and start of string */
    1922     1691840 :         c = tok_nextc(tok);
    1923     1691840 :         if (c == quote) {
    1924      229617 :             c = tok_nextc(tok);
    1925      229617 :             if (c == quote) {
    1926      122483 :                 quote_size = 3;
    1927             :             }
    1928             :             else {
    1929      107134 :                 end_quote_size = 1;     /* empty string found */
    1930             :             }
    1931             :         }
    1932     1691840 :         if (c != quote) {
    1933     1569360 :             tok_backup(tok, c);
    1934             :         }
    1935             : 
    1936             :         /* Get rest of string */
    1937    52327000 :         while (end_quote_size != quote_size) {
    1938    50635200 :             c = tok_nextc(tok);
    1939    50635200 :             if (c == EOF || (quote_size == 1 && c == '\n')) {
    1940          18 :                 assert(tok->multi_line_start != NULL);
    1941             :                 // shift the tok_state's location into
    1942             :                 // the start of string, and report the error
    1943             :                 // from the initial quote character
    1944          18 :                 tok->cur = (char *)tok->start;
    1945          18 :                 tok->cur++;
    1946          18 :                 tok->line_start = tok->multi_line_start;
    1947          18 :                 int start = tok->lineno;
    1948          18 :                 tok->lineno = tok->first_lineno;
    1949          18 :                 if (quote_size == 3) {
    1950           6 :                     syntaxerror(tok, "unterminated triple-quoted string literal"
    1951             :                                      " (detected at line %d)", start);
    1952           6 :                     if (c != '\n') {
    1953           6 :                         tok->done = E_EOFS;
    1954             :                     }
    1955           6 :                     return ERRORTOKEN;
    1956             :                 }
    1957             :                 else {
    1958          12 :                     syntaxerror(tok, "unterminated string literal (detected at"
    1959             :                                      " line %d)", start);
    1960          12 :                     if (c != '\n') {
    1961           6 :                         tok->done = E_EOLS;
    1962             :                     }
    1963          12 :                     return ERRORTOKEN;
    1964             :                 }
    1965             :             }
    1966    50635200 :             if (c == quote) {
    1967     1898940 :                 end_quote_size += 1;
    1968             :             }
    1969             :             else {
    1970    48736300 :                 end_quote_size = 0;
    1971    48736300 :                 if (c == '\\') {
    1972      341804 :                     tok_nextc(tok);  /* skip escaped char */
    1973             :                 }
    1974             :             }
    1975             :         }
    1976             : 
    1977     1691830 :         *p_start = tok->start;
    1978     1691830 :         *p_end = tok->cur;
    1979     1691830 :         return STRING;
    1980             :     }
    1981             : 
    1982             :     /* Line continuation */
    1983    14929000 :     if (c == '\\') {
    1984        4458 :         if ((c = tok_continuation_line(tok)) == -1) {
    1985          16 :             return ERRORTOKEN;
    1986             :         }
    1987        4442 :         tok->cont_line = 1;
    1988        4442 :         goto again; /* Read next line */
    1989             :     }
    1990             : 
    1991             :     /* Check for two-character token */
    1992             :     {
    1993    14924500 :         int c2 = tok_nextc(tok);
    1994    14924500 :         int token = _PyToken_TwoChars(c, c2);
    1995    14924500 :         if (token != OP) {
    1996      215764 :             int c3 = tok_nextc(tok);
    1997      215764 :             int token3 = _PyToken_ThreeChars(c, c2, c3);
    1998      215764 :             if (token3 != OP) {
    1999         765 :                 token = token3;
    2000             :             }
    2001             :             else {
    2002      214999 :                 tok_backup(tok, c3);
    2003             :             }
    2004      215764 :             *p_start = tok->start;
    2005      215764 :             *p_end = tok->cur;
    2006      215764 :             return token;
    2007             :         }
    2008    14708800 :         tok_backup(tok, c2);
    2009             :     }
    2010             : 
    2011             :     /* Keep track of parentheses nesting level */
    2012    14708800 :     switch (c) {
    2013     3392690 :     case '(':
    2014             :     case '[':
    2015             :     case '{':
    2016     3392690 :         if (tok->level >= MAXLEVEL) {
    2017           1 :             return syntaxerror(tok, "too many nested parentheses");
    2018             :         }
    2019     3392690 :         tok->parenstack[tok->level] = c;
    2020     3392690 :         tok->parenlinenostack[tok->level] = tok->lineno;
    2021     3392690 :         tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
    2022     3392690 :         tok->level++;
    2023     3392690 :         break;
    2024     3392240 :     case ')':
    2025             :     case ']':
    2026             :     case '}':
    2027     3392240 :         if (!tok->level) {
    2028          12 :             return syntaxerror(tok, "unmatched '%c'", c);
    2029             :         }
    2030     3392230 :         tok->level--;
    2031     3392230 :         int opening = tok->parenstack[tok->level];
    2032     3392230 :         if (!((opening == '(' && c == ')') ||
    2033      640080 :               (opening == '[' && c == ']') ||
    2034       47751 :               (opening == '{' && c == '}')))
    2035             :         {
    2036           5 :             if (tok->parenlinenostack[tok->level] != tok->lineno) {
    2037           0 :                 return syntaxerror(tok,
    2038             :                         "closing parenthesis '%c' does not match "
    2039             :                         "opening parenthesis '%c' on line %d",
    2040           0 :                         c, opening, tok->parenlinenostack[tok->level]);
    2041             :             }
    2042             :             else {
    2043           5 :                 return syntaxerror(tok,
    2044             :                         "closing parenthesis '%c' does not match "
    2045             :                         "opening parenthesis '%c'",
    2046             :                         c, opening);
    2047             :             }
    2048             :         }
    2049     3392220 :         break;
    2050             :     }
    2051             : 
    2052    14708800 :     if (!Py_UNICODE_ISPRINTABLE(c)) {
    2053             :         char hex[9];
    2054           1 :         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
    2055           1 :         return syntaxerror(tok, "invalid non-printable character U+%s", hex);
    2056             :     }
    2057             : 
    2058             :     /* Punctuation character */
    2059    14708800 :     *p_start = tok->start;
    2060    14708800 :     *p_end = tok->cur;
    2061    14708800 :     return _PyToken_OneChar(c);
    2062             : }
    2063             : 
    2064             : int
    2065    39583700 : _PyTokenizer_Get(struct tok_state *tok,
    2066             :                  const char **p_start, const char **p_end)
    2067             : {
    2068    39583700 :     int result = tok_get(tok, p_start, p_end);
    2069    39583700 :     if (tok->decoding_erred) {
    2070           3 :         result = ERRORTOKEN;
    2071           3 :         tok->done = E_DECODE;
    2072             :     }
    2073    39583700 :     return result;
    2074             : }
    2075             : 
    2076             : #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
    2077             : // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
    2078             : // dup() emulation with open() is slow.
    2079             : typedef union {
    2080             :     void *cookie;
    2081             :     int fd;
    2082             : } borrowed;
    2083             : 
    2084             : static ssize_t
    2085             : borrow_read(void *cookie, char *buf, size_t size)
    2086             : {
    2087             :     borrowed b = {.cookie = cookie};
    2088             :     return read(b.fd, (void *)buf, size);
    2089             : }
    2090             : 
    2091             : static FILE *
    2092             : fdopen_borrow(int fd) {
    2093             :     // supports only reading. seek fails. close and write are no-ops.
    2094             :     cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    2095             :     borrowed b = {.fd = fd};
    2096             :     return fopencookie(b.cookie, "r", io_cb);
    2097             : }
    2098             : #else
    2099             : static FILE *
    2100         476 : fdopen_borrow(int fd) {
    2101         476 :     fd = _Py_dup(fd);
    2102         476 :     if (fd < 0) {
    2103           0 :         return NULL;
    2104             :     }
    2105         476 :     return fdopen(fd, "r");
    2106             : }
    2107             : #endif
    2108             : 
    2109             : /* Get the encoding of a Python file. Check for the coding cookie and check if
    2110             :    the file starts with a BOM.
    2111             : 
    2112             :    _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
    2113             :    encoding in the first or second line of the file (in which case the encoding
    2114             :    should be assumed to be UTF-8).
    2115             : 
    2116             :    The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
    2117             :    by the caller. */
    2118             : 
    2119             : char *
    2120         476 : _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
    2121             : {
    2122             :     struct tok_state *tok;
    2123             :     FILE *fp;
    2124         476 :     const char *p_start = NULL;
    2125         476 :     const char *p_end = NULL;
    2126         476 :     char *encoding = NULL;
    2127             : 
    2128         476 :     fp = fdopen_borrow(fd);
    2129         476 :     if (fp == NULL) {
    2130           0 :         return NULL;
    2131             :     }
    2132         476 :     tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    2133         476 :     if (tok == NULL) {
    2134           0 :         fclose(fp);
    2135           0 :         return NULL;
    2136             :     }
    2137         476 :     if (filename != NULL) {
    2138         476 :         Py_INCREF(filename);
    2139         476 :         tok->filename = filename;
    2140             :     }
    2141             :     else {
    2142           0 :         tok->filename = PyUnicode_FromString("<string>");
    2143           0 :         if (tok->filename == NULL) {
    2144           0 :             fclose(fp);
    2145           0 :             _PyTokenizer_Free(tok);
    2146           0 :             return encoding;
    2147             :         }
    2148             :     }
    2149        1438 :     while (tok->lineno < 2 && tok->done == E_OK) {
    2150         962 :         _PyTokenizer_Get(tok, &p_start, &p_end);
    2151             :     }
    2152         476 :     fclose(fp);
    2153         476 :     if (tok->encoding) {
    2154          12 :         encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
    2155          12 :         if (encoding) {
    2156          12 :             strcpy(encoding, tok->encoding);
    2157             :         }
    2158             :     }
    2159         476 :     _PyTokenizer_Free(tok);
    2160         476 :     return encoding;
    2161             : }
    2162             : 
    2163             : #ifdef Py_DEBUG
    2164             : void
    2165           0 : tok_dump(int type, char *start, char *end)
    2166             : {
    2167           0 :     fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    2168           0 :     if (type == NAME || type == NUMBER || type == STRING || type == OP)
    2169           0 :         fprintf(stderr, "(%.*s)", (int)(end - start), start);
    2170           0 : }
    2171             : #endif  // Py_DEBUG

Generated by: LCOV version 1.14