Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Parser/tokenizer.c
Line
Count
Source (jump to first uncovered line)
1
2
/* Tokenizer implementation */
3
4
#define PY_SSIZE_T_CLEAN
5
#include "Python.h"
6
#include "pycore_call.h"          // _PyObject_CallNoArgs()
7
8
#include <ctype.h>
9
#include <assert.h>
10
11
#include "tokenizer.h"
12
#include "errcode.h"
13
14
#include "unicodeobject.h"
15
#include "bytesobject.h"
16
#include "fileobject.h"
17
#include "abstract.h"
18
19
/* Alternate tab spacing */
20
#define ALTTABSIZE 1
21
22
/* Non-zero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  C is evaluated several times, so it must have no side
   effects.  Every use is parenthesized so that an expression argument
   (e.g. `x & 0xff`) expands with the intended precedence. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27
28
/* Non-zero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.  C is evaluated several times, so it
   must have no side effects.  Every use is parenthesized so that an
   expression argument expands with the intended precedence. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34
35
36
/* Don't ever change this -- it would break the portability of Python code */
37
#define TABSIZE 8
38
39
/* Forward */
40
static struct tok_state *tok_new(void);
41
static int tok_nextc(struct tok_state *tok);
42
static void tok_backup(struct tok_state *tok, int c);
43
static int syntaxerror(struct tok_state *tok, const char *format, ...);
44
45
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
46
   tokenizing. */
47
static const char* type_comment_prefix = "# type: ";
48
49
/* Create and initialize a new tok_state structure */
50
51
static struct tok_state *
52
tok_new(void)
53
{
54
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55
                                            sizeof(struct tok_state));
56
    if (tok == NULL)
  Branch (56:9): [True: 0, False: 132k]
57
        return NULL;
58
    tok->buf = tok->cur = tok->inp = NULL;
59
    tok->fp_interactive = 0;
60
    tok->interactive_src_start = NULL;
61
    tok->interactive_src_end = NULL;
62
    tok->start = NULL;
63
    tok->end = NULL;
64
    tok->done = E_OK;
65
    tok->fp = NULL;
66
    tok->input = NULL;
67
    tok->tabsize = TABSIZE;
68
    tok->indent = 0;
69
    tok->indstack[0] = 0;
70
    tok->atbol = 1;
71
    tok->pendin = 0;
72
    tok->prompt = tok->nextprompt = NULL;
73
    tok->lineno = 0;
74
    tok->level = 0;
75
    tok->altindstack[0] = 0;
76
    tok->decoding_state = STATE_INIT;
77
    tok->decoding_erred = 0;
78
    tok->enc = NULL;
79
    tok->encoding = NULL;
80
    tok->cont_line = 0;
81
    tok->filename = NULL;
82
    tok->decoding_readline = NULL;
83
    tok->decoding_buffer = NULL;
84
    tok->type_comments = 0;
85
    tok->async_hacks = 0;
86
    tok->async_def = 0;
87
    tok->async_def_indent = 0;
88
    tok->async_def_nl = 0;
89
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
90
    tok->str = NULL;
91
#ifdef Py_DEBUG
92
    tok->debug = _Py_GetConfig()->parser_debug;
93
#endif
94
    return tok;
95
}
96
97
static char *
98
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
99
{
100
    char* result = (char *)PyMem_Malloc(len + 1);
101
    if (!result) {
  Branch (101:9): [True: 0, False: 57.1k]
102
        tok->done = E_NOMEM;
103
        return NULL;
104
    }
105
    memcpy(result, s, len);
106
    result[len] = '\0';
107
    return result;
108
}
109
110
static char *
111
error_ret(struct tok_state *tok) /* XXX */
112
{
113
    tok->decoding_erred = 1;
114
    if (tok->fp != NULL && 
tok->buf != NULL0
) /* see _PyTokenizer_Free */
  Branch (114:9): [True: 0, False: 26]
  Branch (114:28): [True: 0, False: 0]
115
        PyMem_Free(tok->buf);
116
    tok->buf = tok->cur = tok->inp = NULL;
117
    tok->start = NULL;
118
    tok->end = NULL;
119
    tok->done = E_DECODE;
120
    return NULL;                /* as if it were EOF */
121
}
122
123
124
/* Map the spellings of utf-8 and latin-1 onto a canonical name.
   At most the first 12 bytes of S are examined, lower-cased, with '_'
   replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when S
   matches one of the known spellings (optionally followed by "-..."),
   otherwise returns S itself, so callers can detect "no change" by
   pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char (or EOF); a plain char may be negative. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
152
153
/* Return the coding spec in S, or NULL if none is found.  */
154
155
static int
156
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
157
{
158
    Py_ssize_t i;
159
    *spec = NULL;
160
    /* Coding spec must be in a comment, and that comment must be
161
     * the only statement on the source code line. */
162
    for (i = 0; i < size - 6; 
i++0
) {
  Branch (162:17): [True: 5.04k, False: 70.9k]
163
        if (s[i] == '#')
  Branch (163:13): [True: 336, False: 4.70k]
164
            break;
165
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
  Branch (165:13): [True: 4.70k, False: 0]
  Branch (165:28): [True: 4.70k, False: 0]
  Branch (165:44): [True: 4.70k, False: 0]
166
            return 1;
167
    }
168
    
for (; 71.2k
i < size - 6;
i++10.1k
) { /* XXX inefficient search */
  Branch (168:12): [True: 10.2k, False: 71.1k]
169
        const char* t = s + i;
170
        if (memcmp(t, "coding", 6) == 0) {
  Branch (170:13): [True: 77, False: 10.1k]
171
            const char* begin = NULL;
172
            t += 6;
173
            if (t[0] != ':' && 
t[0] != '='22
)
  Branch (173:17): [True: 22, False: 55]
  Branch (173:32): [True: 0, False: 22]
174
                continue;
175
            
do 77
{
176
                t++;
177
            } while (t[0] == ' ' || 
t[0] == '\t'77
);
  Branch (177:22): [True: 46, False: 77]
  Branch (177:37): [True: 0, False: 77]
178
179
            begin = t;
180
            while (Py_ISALNUM(t[0]) ||
181
                   
t[0] == '-'133
||
t[0] == '_'77
||
t[0] == '.'77
)
  Branch (181:20): [True: 56, False: 77]
  Branch (181:35): [True: 0, False: 77]
  Branch (181:50): [True: 0, False: 77]
182
                t++;
183
184
            if (begin < t) {
  Branch (184:17): [True: 77, False: 0]
185
                char* r = new_string(begin, t - begin, tok);
186
                const char* q;
187
                if (!r)
  Branch (187:21): [True: 0, False: 77]
188
                    return 0;
189
                q = get_normal_name(r);
190
                if (r != q) {
  Branch (190:21): [True: 30, False: 47]
191
                    PyMem_Free(r);
192
                    r = new_string(q, strlen(q), tok);
193
                    if (!r)
  Branch (193:25): [True: 0, False: 30]
194
                        return 0;
195
                }
196
                *spec = r;
197
                break;
198
            }
199
        }
200
    }
201
    return 1;
202
}
203
204
/* Check whether the line contains a coding spec. If it does,
205
   invoke the set_readline function for the new encoding.
206
   This function receives the tok_state and the new encoding.
207
   Return 1 on success, 0 on failure.  */
208
209
static int
210
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
211
                  int set_readline(struct tok_state *, const char *))
212
{
213
    char *cs;
214
    if (tok->cont_line) {
  Branch (214:9): [True: 0, False: 75.9k]
215
        /* It's a continuation line, so it can't be a coding spec. */
216
        tok->decoding_state = STATE_NORMAL;
217
        return 1;
218
    }
219
    if (!get_coding_spec(line, &cs, size, tok)) {
  Branch (219:9): [True: 0, False: 75.9k]
220
        return 0;
221
    }
222
    if (!cs) {
  Branch (222:9): [True: 75.9k, False: 77]
223
        Py_ssize_t i;
224
        for (i = 0; i < size; 
i++0
) {
  Branch (224:21): [True: 75.5k, False: 344]
225
            if (line[i] == '#' || 
line[i] == '\n'75.2k
||
line[i] == '\r'75.1k
)
  Branch (225:17): [True: 290, False: 75.2k]
  Branch (225:35): [True: 78, False: 75.1k]
  Branch (225:54): [True: 0, False: 75.1k]
226
                break;
227
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
  Branch (227:17): [True: 75.1k, False: 0]
  Branch (227:35): [True: 75.1k, False: 0]
  Branch (227:54): [True: 75.1k, False: 0]
228
                /* Stop checking coding spec after a line containing
229
                 * anything except a comment. */
230
                tok->decoding_state = STATE_NORMAL;
231
                break;
232
            }
233
        }
234
        return 1;
235
    }
236
    tok->decoding_state = STATE_NORMAL;
237
    if (tok->encoding == NULL) {
  Branch (237:9): [True: 51, False: 26]
238
        assert(tok->decoding_readline == NULL);
239
        if (strcmp(cs, "utf-8") != 0 && 
!set_readline(tok, cs)43
) {
  Branch (239:13): [True: 43, False: 8]
  Branch (239:41): [True: 0, False: 43]
240
            error_ret(tok);
241
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
242
            PyMem_Free(cs);
243
            return 0;
244
        }
245
        tok->encoding = cs;
246
    } else {                /* then, compare cs with BOM */
247
        if (strcmp(tok->encoding, cs) != 0) {
  Branch (247:13): [True: 20, False: 6]
248
            error_ret(tok);
249
            PyErr_Format(PyExc_SyntaxError,
250
                         "encoding problem: %s with BOM", cs);
251
            PyMem_Free(cs);
252
            return 0;
253
        }
254
        PyMem_Free(cs);
255
    }
256
    return 1;
257
}
258
259
/* See whether the file starts with a BOM. If it does,
260
   invoke the set_readline function with the new encoding.
261
   Return 1 on success, 0 on failure.  */
262
263
static int
264
check_bom(int get_char(struct tok_state *),
265
          void unget_char(int, struct tok_state *),
266
          int set_readline(struct tok_state *, const char *),
267
          struct tok_state *tok)
268
{
269
    int ch1, ch2, ch3;
270
    ch1 = get_char(tok);
271
    tok->decoding_state = STATE_SEEK_CODING;
272
    if (ch1 == EOF) {
  Branch (272:9): [True: 0, False: 75.6k]
273
        return 1;
274
    } else if (ch1 == 0xEF) {
  Branch (274:16): [True: 37, False: 75.6k]
275
        ch2 = get_char(tok);
276
        if (ch2 != 0xBB) {
  Branch (276:13): [True: 1, False: 36]
277
            unget_char(ch2, tok);
278
            unget_char(ch1, tok);
279
            return 1;
280
        }
281
        ch3 = get_char(tok);
282
        if (ch3 != 0xBF) {
  Branch (282:13): [True: 2, False: 34]
283
            unget_char(ch3, tok);
284
            unget_char(ch2, tok);
285
            unget_char(ch1, tok);
286
            return 1;
287
        }
288
    } else {
289
        unget_char(ch1, tok);
290
        return 1;
291
    }
292
    if (tok->encoding != NULL)
  Branch (292:9): [True: 0, False: 34]
293
        PyMem_Free(tok->encoding);
294
    tok->encoding = new_string("utf-8", 5, tok);
295
    if (!tok->encoding)
  Branch (295:9): [True: 0, False: 34]
296
        return 0;
297
    /* No need to set_readline: input is already utf-8 */
298
    return 1;
299
}
300
301
static int
302
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
303
    assert(tok->fp_interactive);
304
305
    if (!line) {
  Branch (305:9): [True: 0, False: 0]
306
        return 0;
307
    }
308
309
    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
310
    Py_ssize_t line_size = strlen(line);
311
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
  Branch (311:27): [True: 0, False: 0]
312
    if (last_char != '\n') {
  Branch (312:9): [True: 0, False: 0]
313
        line_size += 1;
314
    }
315
    char* new_str = tok->interactive_src_start;
316
317
    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
318
    if (!new_str) {
  Branch (318:9): [True: 0, False: 0]
319
        if (tok->interactive_src_start) {
  Branch (319:13): [True: 0, False: 0]
320
            PyMem_Free(tok->interactive_src_start);
321
        }
322
        tok->interactive_src_start = NULL;
323
        tok->interactive_src_end = NULL;
324
        tok->done = E_NOMEM;
325
        return -1;
326
    }
327
    strcpy(new_str + current_size, line);
328
    if (last_char != '\n') {
  Branch (328:9): [True: 0, False: 0]
329
        /* Last line does not end in \n, fake one */
330
        new_str[current_size + line_size - 1] = '\n';
331
        new_str[current_size + line_size] = '\0';
332
    }
333
    tok->interactive_src_start = new_str;
334
    tok->interactive_src_end = new_str + current_size + line_size;
335
    return 0;
336
}
337
338
339
/* Read a line of text from TOK into S, using the stream in TOK.
340
   Return NULL on failure, else S.
341
342
   On entry, tok->decoding_buffer will be one of:
343
     1) NULL: need to call tok->decoding_readline to get a new line
344
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
345
       stored the result in tok->decoding_buffer
346
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
347
       (in the s buffer) to copy entire contents of the line read
348
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
349
       In this case, tok_readline_recode is called in a loop (with an expanded buffer)
350
       until the buffer ends with a '\n' (or until the end of the file is
351
       reached): see tok_nextc and its calls to tok_reserve_buf.
352
*/
353
354
static int
355
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
356
{
357
    Py_ssize_t cur = tok->cur - tok->buf;
358
    Py_ssize_t oldsize = tok->inp - tok->buf;
359
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
360
    if (newsize > tok->end - tok->buf) {
  Branch (360:9): [True: 9, False: 801]
361
        char *newbuf = tok->buf;
362
        Py_ssize_t start = tok->start == NULL ? 
-11
:
tok->start - tok->buf8
;
  Branch (362:28): [True: 1, False: 8]
363
        Py_ssize_t line_start = tok->start == NULL ? 
-11
:
tok->line_start - tok->buf8
;
  Branch (363:33): [True: 1, False: 8]
364
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
365
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
366
        if (newbuf == NULL) {
  Branch (366:13): [True: 0, False: 9]
367
            tok->done = E_NOMEM;
368
            return 0;
369
        }
370
        tok->buf = newbuf;
371
        tok->cur = tok->buf + cur;
372
        tok->inp = tok->buf + oldsize;
373
        tok->end = tok->buf + newsize;
374
        tok->start = start < 0 ? NULL : 
tok->buf + start8
;
  Branch (374:22): [True: 1, False: 8]
375
        tok->line_start = line_start < 0 ? NULL : 
tok->buf + line_start8
;
  Branch (375:27): [True: 1, False: 8]
376
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
  Branch (376:33): [True: 0, False: 9]
377
    }
378
    return 1;
379
}
380
381
static int
382
tok_readline_recode(struct tok_state *tok) {
383
    PyObject *line;
384
    const  char *buf;
385
    Py_ssize_t buflen;
386
    line = tok->decoding_buffer;
387
    if (line == NULL) {
  Branch (387:9): [True: 0, False: 0]
388
        line = PyObject_CallNoArgs(tok->decoding_readline);
389
        if (line == NULL) {
  Branch (389:13): [True: 0, False: 0]
390
            error_ret(tok);
391
            goto error;
392
        }
393
    }
394
    else {
395
        tok->decoding_buffer = NULL;
396
    }
397
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
398
    if (buf == NULL) {
  Branch (398:9): [True: 0, False: 0]
399
        error_ret(tok);
400
        goto error;
401
    }
402
    if (!tok_reserve_buf(tok, buflen + 1)) {
  Branch (402:9): [True: 0, False: 0]
403
        goto error;
404
    }
405
    memcpy(tok->inp, buf, buflen);
406
    tok->inp += buflen;
407
    *tok->inp = '\0';
408
    if (tok->fp_interactive &&
  Branch (408:9): [True: 0, False: 0]
409
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
  Branch (409:9): [True: 0, False: 0]
410
        goto error;
411
    }
412
    Py_DECREF(line);
413
    return 1;
414
error:
415
    Py_XDECREF(line);
416
    return 0;
417
}
418
419
/* Set the readline function for TOK to a StreamReader's
420
   readline function. The StreamReader is named ENC.
421
422
   This function is called from check_bom and check_coding_spec.
423
424
   ENC is usually identical to the future value of tok->encoding,
425
   except for the (currently unsupported) case of UTF-16.
426
427
   Return 1 on success, 0 on failure. */
428
429
static int
430
fp_setreadl(struct tok_state *tok, const char* enc)
431
{
432
    PyObject *readline, *open, *stream;
433
    int fd;
434
    long pos;
435
436
    fd = fileno(tok->fp);
437
    /* Due to buffering the file offset for fd can be different from the file
438
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
439
     * its file position counts CRLF as one char and can't be directly mapped
440
     * to the file offset for fd.  Instead we step back one byte and read to
441
     * the end of line.*/
442
    pos = ftell(tok->fp);
443
    if (pos == -1 ||
  Branch (443:9): [True: 0, False: 0]
444
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
  Branch (444:9): [True: 0, False: 0]
  Branch (444:27): [True: 0, False: 0]
445
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
446
        return 0;
447
    }
448
449
    open = _PyImport_GetModuleAttrString("io", "open");
450
    if (open == NULL) {
  Branch (450:9): [True: 0, False: 0]
451
        return 0;
452
    }
453
    stream = PyObject_CallFunction(open, "isisOOO",
454
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
455
    Py_DECREF(open);
456
    if (stream == NULL) {
  Branch (456:9): [True: 0, False: 0]
457
        return 0;
458
    }
459
460
    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
461
    Py_DECREF(stream);
462
    if (readline == NULL) {
  Branch (462:9): [True: 0, False: 0]
463
        return 0;
464
    }
465
    Py_XSETREF(tok->decoding_readline, readline);
466
467
    if (pos > 0) {
  Branch (467:9): [True: 0, False: 0]
468
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
469
        if (bufobj == NULL) {
  Branch (469:13): [True: 0, False: 0]
470
            return 0;
471
        }
472
        Py_DECREF(bufobj);
473
    }
474
475
    return 1;
476
}
477
478
/* Fetch the next byte from TOK. */
479
480
static int fp_getc(struct tok_state *tok) {
481
    return getc(tok->fp);
482
}
483
484
/* Unfetch the last byte back into TOK.  */
485
486
static void fp_ungetc(int c, struct tok_state *tok) {
487
    ungetc(c, tok->fp);
488
}
489
490
/* Check whether the characters at s start a valid
491
   UTF-8 sequence. Return the number of characters forming
492
   the sequence if yes, 0 if not.  */
493
static int valid_utf8(const unsigned char* s)
{
    int expected;   /* number of continuation bytes after the lead byte */
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xc0) {
        /* following byte */
        return 0;
    }
    if (*s < 0xE0) {
        expected = 1;
    }
    else if (*s < 0xF0) {
        expected = 2;
    }
    else if (*s < 0xF8) {
        expected = 3;
    }
    else {
        return 0;
    }
    /* Check the continuation bytes in forward order.  A NUL terminator
       is not a continuation byte, so a truncated sequence is rejected at
       the terminator and nothing beyond it is ever read. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
517
518
static int
519
ensure_utf8(char *line, struct tok_state *tok)
520
{
521
    int badchar = 0;
522
    unsigned char *c;
523
    int length;
524
    for (c = (unsigned char *)line; *c; 
c += length18.2k
) {
  Branch (524:37): [True: 18.2k, False: 806]
525
        if (!(length = valid_utf8(c))) {
  Branch (525:13): [True: 0, False: 18.2k]
526
            badchar = *c;
527
            break;
528
        }
529
    }
530
    if (badchar) {
  Branch (530:9): [True: 0, False: 806]
531
        /* Need to add 1 to the line number, since this line
532
       has not been counted, yet.  */
533
        PyErr_Format(PyExc_SyntaxError,
534
                     "Non-UTF-8 code starting with '\\x%.2x' "
535
                     "in file %U on line %i, "
536
                     "but no encoding declared; "
537
                     "see https://peps.python.org/pep-0263/ for details",
538
                     badchar, tok->filename, tok->lineno + 1);
539
        return 0;
540
    }
541
    return 1;
542
}
543
544
/* Fetch a byte from TOK, using the string buffer. */
545
546
static int
547
buf_getc(struct tok_state *tok) {
548
    return Py_CHARMASK(*tok->str++);
549
}
550
551
/* Unfetch a byte from TOK, using the string buffer. */
552
553
static void
554
buf_ungetc(int c, struct tok_state *tok) {
555
    tok->str--;
556
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
557
}
558
559
/* Set the readline function for TOK to ENC. For the string-based
560
   tokenizer, this means to just record the encoding. */
561
562
static int
563
buf_setreadl(struct tok_state *tok, const char* enc) {
564
    tok->enc = enc;
565
    return 1;
566
}
567
568
/* Return a UTF-8 encoding Python string object from the
569
   C byte string STR, which is encoded with ENC. */
570
571
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    /* Decode with ENC, then re-encode the resulting text as UTF-8.
       Returns NULL if either step fails. */
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    PyObject *encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
581
582
583
static char *
584
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
585
    int skip_next_lf = 0;
586
    size_t needed_length = strlen(s) + 2, final_length;
587
    char *buf, *current;
588
    char c = '\0';
589
    buf = PyMem_Malloc(needed_length);
590
    if (buf == NULL) {
  Branch (590:9): [True: 0, False: 132k]
591
        tok->done = E_NOMEM;
592
        return NULL;
593
    }
594
    
for (current = buf; 132k
*s;
s++, current++27.2M
) {
  Branch (594:25): [True: 27.2M, False: 132k]
595
        c = *s;
596
        if (skip_next_lf) {
  Branch (596:13): [True: 40, False: 27.2M]
597
            skip_next_lf = 0;
598
            if (c == '\n') {
  Branch (598:17): [True: 23, False: 17]
599
                c = *++s;
600
                if (!c)
  Branch (600:21): [True: 7, False: 16]
601
                    break;
602
            }
603
        }
604
        if (c == '\r') {
  Branch (604:13): [True: 47, False: 27.2M]
605
            skip_next_lf = 1;
606
            c = '\n';
607
        }
608
        *current = c;
609
    }
610
    /* If this is exec input, add a newline to the end of the string if
611
       there isn't one already. */
612
    if (exec_input && 
c != '\n'90.7k
) {
  Branch (612:9): [True: 90.7k, False: 41.6k]
  Branch (612:23): [True: 86.4k, False: 4.29k]
613
        *current = '\n';
614
        current++;
615
    }
616
    *current = '\0';
617
    final_length = current - buf + 1;
618
    if (final_length < needed_length && 
final_length45.9k
) {
  Branch (618:9): [True: 45.9k, False: 86.4k]
  Branch (618:41): [True: 45.9k, False: 0]
619
        /* should never fail */
620
        char* result = PyMem_Realloc(buf, final_length);
621
        if (result == NULL) {
  Branch (621:13): [True: 0, False: 45.9k]
622
            PyMem_Free(buf);
623
        }
624
        buf = result;
625
    }
626
    return buf;
627
}
628
629
/* Decode a byte string STR for use as the buffer of TOK.
630
   Look for encoding declarations inside STR, and record them
631
   inside TOK.  */
632
633
static char *
634
decode_str(const char *input, int single, struct tok_state *tok)
635
{
636
    PyObject* utf8 = NULL;
637
    char *str;
638
    const char *s;
639
    const char *newl[2] = {NULL, NULL};
640
    int lineno = 0;
641
    tok->input = str = translate_newlines(input, single, tok);
642
    if (str == NULL)
  Branch (642:9): [True: 0, False: 75.4k]
643
        return NULL;
644
    tok->enc = NULL;
645
    tok->str = str;
646
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
  Branch (646:9): [True: 0, False: 75.4k]
647
        return error_ret(tok);
648
    str = tok->str;             /* string after BOM if any */
649
    assert(str);
650
    if (tok->enc != NULL) {
  Branch (650:9): [True: 0, False: 75.4k]
651
        utf8 = translate_into_utf8(str, tok->enc);
652
        if (utf8 == NULL)
  Branch (652:13): [True: 0, False: 0]
653
            return error_ret(tok);
654
        str = PyBytes_AsString(utf8);
655
    }
656
    
for (s = str;; 75.4k
s++846k
) {
657
        if (*s == '\0') 
break74.2k
;
  Branch (657:13): [True: 74.2k, False: 847k]
658
        else if (*s == '\n') {
  Branch (658:18): [True: 76.5k, False: 770k]
659
            assert(lineno < 2);
660
            newl[lineno] = s;
661
            lineno++;
662
            if (lineno == 2) 
break1.18k
;
  Branch (662:17): [True: 1.18k, False: 75.4k]
663
        }
664
    }
665
    tok->enc = NULL;
666
    /* need to check line 1 and 2 separately since check_coding_spec
667
       assumes a single line as input */
668
    if (newl[0]) {
  Branch (668:9): [True: 75.4k, False: 13]
669
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
  Branch (669:13): [True: 20, False: 75.3k]
670
            return NULL;
671
        }
672
        if (tok->enc == NULL && 
tok->decoding_state != STATE_NORMAL75.3k
&&
newl[1]481
) {
  Branch (672:13): [True: 75.3k, False: 36]
  Branch (672:33): [True: 481, False: 74.8k]
  Branch (672:72): [True: 224, False: 257]
673
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
  Branch (673:17): [True: 0, False: 224]
674
                                   tok, buf_setreadl))
675
                return NULL;
676
        }
677
    }
678
    if (tok->enc != NULL) {
  Branch (678:9): [True: 43, False: 75.3k]
679
        assert(utf8 == NULL);
680
        utf8 = translate_into_utf8(str, tok->enc);
681
        if (utf8 == NULL)
  Branch (681:13): [True: 6, False: 37]
682
            return error_ret(tok);
683
        str = PyBytes_AS_STRING(utf8);
684
    }
685
    assert(tok->decoding_buffer == NULL);
686
    tok->decoding_buffer = utf8; /* CAUTION */
687
    return str;
688
}
689
690
/* Set up tokenizer for string */
691
692
struct tok_state *
693
_PyTokenizer_FromString(const char *str, int exec_input)
694
{
695
    struct tok_state *tok = tok_new();
696
    char *decoded;
697
698
    if (tok == NULL)
  Branch (698:9): [True: 0, False: 75.4k]
699
        return NULL;
700
    decoded = decode_str(str, exec_input, tok);
701
    if (decoded == NULL) {
  Branch (701:9): [True: 26, False: 75.4k]
702
        _PyTokenizer_Free(tok);
703
        return NULL;
704
    }
705
706
    tok->buf = tok->cur = tok->inp = decoded;
707
    tok->end = decoded;
708
    return tok;
709
}
710
711
/* Set up tokenizer for UTF-8 string */
712
713
struct tok_state *
714
_PyTokenizer_FromUTF8(const char *str, int exec_input)
715
{
716
    struct tok_state *tok = tok_new();
717
    char *translated;
718
    if (tok == NULL)
  Branch (718:9): [True: 0, False: 56.9k]
719
        return NULL;
720
    tok->input = translated = translate_newlines(str, exec_input, tok);
721
    if (translated == NULL) {
  Branch (721:9): [True: 0, False: 56.9k]
722
        _PyTokenizer_Free(tok);
723
        return NULL;
724
    }
725
    tok->decoding_state = STATE_NORMAL;
726
    tok->enc = NULL;
727
    tok->str = translated;
728
    tok->encoding = new_string("utf-8", 5, tok);
729
    if (!tok->encoding) {
  Branch (729:9): [True: 0, False: 56.9k]
730
        _PyTokenizer_Free(tok);
731
        return NULL;
732
    }
733
734
    tok->buf = tok->cur = tok->inp = translated;
735
    tok->end = translated;
736
    return tok;
737
}
738
739
/* Set up tokenizer for file */
740
741
struct tok_state *
742
_PyTokenizer_FromFile(FILE *fp, const char* enc,
743
                      const char *ps1, const char *ps2)
744
{
745
    struct tok_state *tok = tok_new();
746
    if (tok == NULL)
  Branch (746:9): [True: 0, False: 257]
747
        return NULL;
748
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
  Branch (748:9): [True: 0, False: 257]
749
        _PyTokenizer_Free(tok);
750
        return NULL;
751
    }
752
    tok->cur = tok->inp = tok->buf;
753
    tok->end = tok->buf + BUFSIZ;
754
    tok->fp = fp;
755
    tok->prompt = ps1;
756
    tok->nextprompt = ps2;
757
    if (enc != NULL) {
  Branch (757:9): [True: 0, False: 257]
758
        /* Must copy encoding declaration since it
759
           gets copied into the parse tree. */
760
        tok->encoding = new_string(enc, strlen(enc), tok);
761
        if (!tok->encoding) {
  Branch (761:13): [True: 0, False: 0]
762
            _PyTokenizer_Free(tok);
763
            return NULL;
764
        }
765
        tok->decoding_state = STATE_NORMAL;
766
    }
767
    return tok;
768
}
769
770
/* Free a tok_state structure */
771
772
void
773
_PyTokenizer_Free(struct tok_state *tok)
774
{
775
    if (tok->encoding != NULL) {
  Branch (775:9): [True: 57.0k, False: 75.5k]
776
        PyMem_Free(tok->encoding);
777
    }
778
    Py_XDECREF(tok->decoding_readline);
779
    Py_XDECREF(tok->decoding_buffer);
780
    Py_XDECREF(tok->filename);
781
    if (tok->fp != NULL && 
tok->buf != NULL257
) {
  Branch (781:9): [True: 257, False: 132k]
  Branch (781:28): [True: 257, False: 0]
782
        PyMem_Free(tok->buf);
783
    }
784
    if (tok->input) {
  Branch (784:9): [True: 132k, False: 257]
785
        PyMem_Free(tok->input);
786
    }
787
    if (tok->interactive_src_start != NULL) {
  Branch (787:9): [True: 0, False: 132k]
788
        PyMem_Free(tok->interactive_src_start);
789
    }
790
    PyMem_Free(tok);
791
}
792
793
static int
794
tok_readline_raw(struct tok_state *tok)
795
{
796
    do {
797
        if (!tok_reserve_buf(tok, BUFSIZ)) {
  Branch (797:13): [True: 0, False: 810]
798
            return 0;
799
        }
800
        char *line = Py_UniversalNewlineFgets(tok->inp,
801
                                              (int)(tok->end - tok->inp),
802
                                              tok->fp, NULL);
803
        if (line == NULL) {
  Branch (803:13): [True: 4, False: 806]
804
            return 1;
805
        }
806
        if (tok->fp_interactive &&
  Branch (806:13): [True: 0, False: 806]
807
            
tok_concatenate_interactive_new_line(tok, line) == -10
) {
  Branch (807:13): [True: 0, False: 0]
808
            return 0;
809
        }
810
        tok->inp = strchr(tok->inp, '\0');
811
        if (tok->inp == tok->buf) {
  Branch (811:13): [True: 0, False: 806]
812
            return 0;
813
        }
814
    } while (tok->inp[-1] != '\n');
  Branch (814:14): [True: 1, False: 805]
815
    return 1;
816
}
817
818
static int
819
tok_underflow_string(struct tok_state *tok) {
820
    char *end = strchr(tok->inp, '\n');
821
    if (end != NULL) {
  Branch (821:9): [True: 3.37M, False: 92.9k]
822
        end++;
823
    }
824
    else {
825
        end = strchr(tok->inp, '\0');
826
        if (end == tok->inp) {
  Branch (826:13): [True: 55.7k, False: 37.1k]
827
            tok->done = E_EOF;
828
            return 0;
829
        }
830
    }
831
    if (tok->start == NULL) {
  Branch (831:9): [True: 3.34M, False: 64.1k]
832
        tok->buf = tok->cur;
833
    }
834
    tok->line_start = tok->cur;
835
    tok->lineno++;
836
    tok->inp = end;
837
    return 1;
838
}
839
840
static int
841
tok_underflow_interactive(struct tok_state *tok) {
842
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
  Branch (842:9): [True: 0, False: 0]
843
        tok->done = E_INTERACT_STOP;
844
        return 1;
845
    }
846
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
  Branch (846:34): [True: 0, False: 0]
847
    if (newtok != NULL) {
  Branch (847:9): [True: 0, False: 0]
848
        char *translated = translate_newlines(newtok, 0, tok);
849
        PyMem_Free(newtok);
850
        if (translated == NULL) {
  Branch (850:13): [True: 0, False: 0]
851
            return 0;
852
        }
853
        newtok = translated;
854
    }
855
    if (tok->encoding && newtok && *newtok) {
  Branch (855:9): [True: 0, False: 0]
  Branch (855:26): [True: 0, False: 0]
  Branch (855:36): [True: 0, False: 0]
856
        /* Recode to UTF-8 */
857
        Py_ssize_t buflen;
858
        const char* buf;
859
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
860
        PyMem_Free(newtok);
861
        if (u == NULL) {
  Branch (861:13): [True: 0, False: 0]
862
            tok->done = E_DECODE;
863
            return 0;
864
        }
865
        buflen = PyBytes_GET_SIZE(u);
866
        buf = PyBytes_AS_STRING(u);
867
        newtok = PyMem_Malloc(buflen+1);
868
        if (newtok == NULL) {
  Branch (868:13): [True: 0, False: 0]
869
            Py_DECREF(u);
870
            tok->done = E_NOMEM;
871
            return 0;
872
        }
873
        strcpy(newtok, buf);
874
        Py_DECREF(u);
875
    }
876
    if (tok->fp_interactive &&
  Branch (876:9): [True: 0, False: 0]
877
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
  Branch (877:9): [True: 0, False: 0]
878
        PyMem_Free(newtok);
879
        return 0;
880
    }
881
    if (tok->nextprompt != NULL) {
  Branch (881:9): [True: 0, False: 0]
882
        tok->prompt = tok->nextprompt;
883
    }
884
    if (newtok == NULL) {
  Branch (884:9): [True: 0, False: 0]
885
        tok->done = E_INTR;
886
    }
887
    else if (*newtok == '\0') {
  Branch (887:14): [True: 0, False: 0]
888
        PyMem_Free(newtok);
889
        tok->done = E_EOF;
890
    }
891
    else if (tok->start != NULL) {
  Branch (891:14): [True: 0, False: 0]
892
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
893
        size_t size = strlen(newtok);
894
        tok->lineno++;
895
        if (!tok_reserve_buf(tok, size + 1)) {
  Branch (895:13): [True: 0, False: 0]
896
            PyMem_Free(tok->buf);
897
            tok->buf = NULL;
898
            PyMem_Free(newtok);
899
            return 0;
900
        }
901
        memcpy(tok->cur, newtok, size + 1);
902
        PyMem_Free(newtok);
903
        tok->inp += size;
904
        tok->multi_line_start = tok->buf + cur_multi_line_start;
905
    }
906
    else {
907
        tok->lineno++;
908
        PyMem_Free(tok->buf);
909
        tok->buf = newtok;
910
        tok->cur = tok->buf;
911
        tok->line_start = tok->buf;
912
        tok->inp = strchr(tok->buf, '\0');
913
        tok->end = tok->inp + 1;
914
    }
915
    if (tok->done != E_OK) {
  Branch (915:9): [True: 0, False: 0]
916
        if (tok->prompt != NULL) {
  Branch (916:13): [True: 0, False: 0]
917
            PySys_WriteStderr("\n");
918
        }
919
        return 0;
920
    }
921
    return 1;
922
}
923
924
static int
925
tok_underflow_file(struct tok_state *tok) {
926
    if (tok->start == NULL) {
  Branch (926:9): [True: 801, False: 8]
927
        tok->cur = tok->inp = tok->buf;
928
    }
929
    if (tok->decoding_state == STATE_INIT) {
  Branch (929:9): [True: 257, False: 552]
930
        /* We have not yet determined the encoding.
931
           If an encoding is found, use the file-pointer
932
           reader functions from now on. */
933
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
  Branch (933:13): [True: 0, False: 257]
934
            error_ret(tok);
935
            return 0;
936
        }
937
        assert(tok->decoding_state != STATE_INIT);
938
    }
939
    /* Read until '\n' or EOF */
940
    if (tok->decoding_readline != NULL) {
  Branch (940:9): [True: 0, False: 809]
941
        /* We already have a codec associated with this input. */
942
        if (!tok_readline_recode(tok)) {
  Branch (942:13): [True: 0, False: 0]
943
            return 0;
944
        }
945
    }
946
    else {
947
        /* We want a 'raw' read. */
948
        if (!tok_readline_raw(tok)) {
  Branch (948:13): [True: 0, False: 809]
949
            return 0;
950
        }
951
    }
952
    if (tok->inp == tok->cur) {
  Branch (952:9): [True: 3, False: 806]
953
        tok->done = E_EOF;
954
        return 0;
955
    }
956
    if (tok->inp[-1] != '\n') {
  Branch (956:9): [True: 1, False: 805]
957
        /* Last line does not end in \n, fake one */
958
        *tok->inp++ = '\n';
959
        *tok->inp = '\0';
960
    }
961
962
    tok->lineno++;
963
    if (tok->decoding_state != STATE_NORMAL) {
  Branch (963:9): [True: 425, False: 381]
964
        if (tok->lineno > 2) {
  Branch (964:13): [True: 84, False: 341]
965
            tok->decoding_state = STATE_NORMAL;
966
        }
967
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
  Branch (967:18): [True: 0, False: 341]
968
                                    tok, fp_setreadl))
969
        {
970
            return 0;
971
        }
972
    }
973
    /* The default encoding is UTF-8, so make sure we don't have any
974
       non-UTF-8 sequences in it. */
975
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
  Branch (975:9): [True: 806, False: 0]
  Branch (975:27): [True: 0, False: 806]
976
        error_ret(tok);
977
        return 0;
978
    }
979
    assert(tok->done == E_OK);
980
    return tok->done == E_OK;
981
}
982
983
#if defined(Py_DEBUG)
/* Debugging aid: write the `size` bytes starting at `s` to `f` as a
   double-quoted string, or the word NULL when `s` is NULL.

   Common control characters, both quote characters and the backslash
   are written as backslash escapes; every other byte outside the
   printable ASCII range 0x20..0x7e (DEL included) is written as \xNN,
   so the output is unambiguous. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            /* Without this, a literal backslash followed by 'n' would
               print exactly like an escaped newline. */
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, so it is excluded. */
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
1011
1012
/* Get next char, updating state; error code goes into tok->done */
1013
1014
static int
1015
tok_nextc(struct tok_state *tok)
1016
{
1017
    int rc;
1018
    for (;;) {
1019
        if (tok->cur != tok->inp) {
  Branch (1019:13): [True: 47.0M, False: 3.57M]
1020
            return Py_CHARMASK(*tok->cur++); /* Fast path */
1021
        }
1022
        if (tok->done != E_OK) {
  Branch (1022:13): [True: 107k, False: 3.46M]
1023
           return EOF;
1024
        }
1025
        if (tok->fp == NULL) {
  Branch (1025:13): [True: 3.46M, False: 809]
1026
            rc = tok_underflow_string(tok);
1027
        }
1028
        else if (tok->prompt != NULL) {
  Branch (1028:18): [True: 0, False: 809]
1029
            rc = tok_underflow_interactive(tok);
1030
        }
1031
        else {
1032
            rc = tok_underflow_file(tok);
1033
        }
1034
#if defined(Py_DEBUG)
1035
        if (tok->debug) {
1036
            fprintf(stderr, "line[%d] = ", tok->lineno);
1037
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
1038
            fprintf(stderr, "  tok->done = %d\n", tok->done);
1039
        }
1040
#endif
1041
        if (!rc) {
  Branch (1041:13): [True: 55.7k, False: 3.41M]
1042
            tok->cur = tok->inp;
1043
            return EOF;
1044
        }
1045
        tok->line_start = tok->cur;
1046
    }
1047
    
Py_UNREACHABLE0
();
1048
}
1049
1050
/* Back-up one character */
1051
1052
static void
1053
tok_backup(struct tok_state *tok, int c)
1054
{
1055
    if (c != EOF) {
  Branch (1055:9): [True: 19.9M, False: 107k]
1056
        if (--tok->cur < tok->buf) {
  Branch (1056:13): [True: 0, False: 19.9M]
1057
            Py_FatalError("tokenizer beginning of buffer");
1058
        }
1059
        if ((int)(unsigned char)*tok->cur != c) {
  Branch (1059:13): [True: 0, False: 19.9M]
1060
            Py_FatalError("tok_backup: wrong character");
1061
        }
1062
    }
1063
}
1064
1065
static int
1066
_syntaxerror_range(struct tok_state *tok, const char *format,
1067
                   int col_offset, int end_col_offset,
1068
                   va_list vargs)
1069
{
1070
    PyObject *errmsg, *errtext, *args;
1071
    errmsg = PyUnicode_FromFormatV(format, vargs);
1072
    if (!errmsg) {
  Branch (1072:9): [True: 0, False: 228]
1073
        goto error;
1074
    }
1075
1076
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1077
                                   "replace");
1078
    if (!errtext) {
  Branch (1078:9): [True: 0, False: 228]
1079
        goto error;
1080
    }
1081
1082
    if (col_offset == -1) {
  Branch (1082:9): [True: 215, False: 13]
1083
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1084
    }
1085
    if (end_col_offset == -1) {
  Branch (1085:9): [True: 215, False: 13]
1086
        end_col_offset = col_offset;
1087
    }
1088
1089
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1090
    if (line_len != tok->cur - tok->line_start) {
  Branch (1090:9): [True: 153, False: 75]
1091
        Py_DECREF(errtext);
1092
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1093
                                       "replace");
1094
    }
1095
    if (!errtext) {
  Branch (1095:9): [True: 0, False: 228]
1096
        goto error;
1097
    }
1098
1099
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1100
                         col_offset, errtext, tok->lineno, end_col_offset);
1101
    if (args) {
  Branch (1101:9): [True: 228, False: 0]
1102
        PyErr_SetObject(PyExc_SyntaxError, args);
1103
        Py_DECREF(args);
1104
    }
1105
1106
error:
1107
    Py_XDECREF(errmsg);
1108
    tok->done = E_ERROR;
1109
    return ERRORTOKEN;
1110
}
1111
1112
/* Report a syntax error at the current position of the current line.
   printf-style front end to _syntaxerror_range() with both column
   offsets left at -1; returns that function's result. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    int token;
    va_list ap;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);
    return token;
}
1121
1122
/* Report a syntax error covering an explicit column range.
   printf-style front end to _syntaxerror_range(); returns that
   function's result. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    int token;
    va_list ap;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);
    return token;
}
1133
1134
1135
1136
static int
1137
indenterror(struct tok_state *tok)
1138
{
1139
    tok->done = E_TABSPACE;
1140
    tok->cur = tok->inp;
1141
    return ERRORTOKEN;
1142
}
1143
1144
static int
1145
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1146
{
1147
    PyObject *errmsg;
1148
    va_list vargs;
1149
    va_start(vargs, format);
1150
    errmsg = PyUnicode_FromFormatV(format, vargs);
1151
    va_end(vargs);
1152
    if (!errmsg) {
  Branch (1152:9): [True: 0, False: 130]
1153
        goto error;
1154
    }
1155
1156
    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
  Branch (1156:9): [True: 64, False: 66]
1157
                                 tok->lineno, NULL, NULL) < 0) {
1158
        if (PyErr_ExceptionMatches(category)) {
  Branch (1158:13): [True: 64, False: 0]
1159
            /* Replace the DeprecationWarning exception with a SyntaxError
1160
               to get a more accurate error report */
1161
            PyErr_Clear();
1162
            syntaxerror(tok, "%U", errmsg);
1163
        }
1164
        goto error;
1165
    }
1166
    Py_DECREF(errmsg);
1167
    return 0;
1168
1169
error:
1170
    Py_XDECREF(errmsg);
1171
    tok->done = E_ERROR;
1172
    return -1;
1173
}
1174
1175
/* Check whether the upcoming input is exactly `test` followed by a
   character that cannot continue an identifier.

   Returns 1 in that case and 0 otherwise.  Every character read is
   pushed back, so the input position is unchanged on return. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it keeps matching `test`. */
    while (*p != 0 && c == *p) {
        p++;
        c = tok_nextc(tok);
    }

    int matched = 0;
    if (*p == 0) {
        /* All of `test` matched; `c` is the character right after it. */
        matched = !is_potential_identifier_char(c);
    }

    /* Undo the reads in reverse order. */
    tok_backup(tok, c);
    while (p != test) {
        tok_backup(tok, *--p);
    }
    return matched;
}
1197
1198
static int
1199
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1200
{
1201
    /* Emit a deprecation warning only if the numeric literal is immediately
1202
     * followed by one of keywords which can occur after a numeric literal
1203
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1204
     * It allows to gradually deprecate existing valid code without adding
1205
     * warning before error in most cases of invalid numeric literal (which
1206
     * would be confusing and break existing tests).
1207
     * Raise a syntax error with slightly better message than plain
1208
     * "invalid syntax" if the numeric literal is immediately followed by
1209
     * other keyword or identifier.
1210
     */
1211
    int r = 0;
1212
    if (c == 'a') {
  Branch (1212:9): [True: 14, False: 646k]
1213
        r = lookahead(tok, "nd");
1214
    }
1215
    else if (c == 'e') {
  Branch (1215:14): [True: 24, False: 646k]
1216
        r = lookahead(tok, "lse");
1217
    }
1218
    else if (c == 'f') {
  Branch (1218:14): [True: 14, False: 646k]
1219
        r = lookahead(tok, "or");
1220
    }
1221
    else if (c == 'i') {
  Branch (1221:14): [True: 48, False: 646k]
1222
        int c2 = tok_nextc(tok);
1223
        if (c2 == 'f' || 
c2 == 'n'32
||
c2 == 's'16
) {
  Branch (1223:13): [True: 16, False: 32]
  Branch (1223:26): [True: 16, False: 16]
  Branch (1223:39): [True: 16, False: 0]
1224
            r = 1;
1225
        }
1226
        tok_backup(tok, c2);
1227
    }
1228
    else if (c == 'o') {
  Branch (1228:14): [True: 22, False: 646k]
1229
        r = lookahead(tok, "r");
1230
    }
1231
    else if (c == 'n') {
  Branch (1231:14): [True: 17, False: 646k]
1232
        r = lookahead(tok, "ot");
1233
    }
1234
    if (r) {
  Branch (1234:9): [True: 130, False: 646k]
1235
        tok_backup(tok, c);
1236
        if (parser_warn(tok, PyExc_SyntaxWarning,
  Branch (1236:13): [True: 64, False: 66]
1237
                "invalid %s literal", kind))
1238
        {
1239
            return 0;
1240
        }
1241
        tok_nextc(tok);
1242
    }
1243
    else /* In future releases, only error will remain. */
1244
    if (is_potential_identifier_char(c)) {
1245
        tok_backup(tok, c);
1246
        syntaxerror(tok, "invalid %s literal", kind);
1247
        return 0;
1248
    }
1249
    return 1;
1250
}
1251
1252
/* Verify that the identifier follows PEP 3131.
1253
   All identifier strings are guaranteed to be "ready" unicode objects.
1254
 */
1255
static int
1256
verify_identifier(struct tok_state *tok)
1257
{
1258
    PyObject *s;
1259
    if (tok->decoding_erred)
  Branch (1259:9): [True: 0, False: 48]
1260
        return 0;
1261
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1262
    if (s == NULL) {
  Branch (1262:9): [True: 4, False: 44]
1263
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
  Branch (1263:13): [True: 4, False: 0]
1264
            tok->done = E_DECODE;
1265
        }
1266
        else {
1267
            tok->done = E_ERROR;
1268
        }
1269
        return 0;
1270
    }
1271
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1272
    if (invalid < 0) {
  Branch (1272:9): [True: 0, False: 44]
1273
        Py_DECREF(s);
1274
        tok->done = E_ERROR;
1275
        return 0;
1276
    }
1277
    assert(PyUnicode_GET_LENGTH(s) > 0);
1278
    if (invalid < PyUnicode_GET_LENGTH(s)) {
  Branch (1278:9): [True: 8, False: 36]
1279
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1280
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
  Branch (1280:13): [True: 0, False: 8]
1281
            /* Determine the offset in UTF-8 encoded input */
1282
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1283
            if (s != NULL) {
  Branch (1283:17): [True: 0, False: 0]
1284
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
1285
            }
1286
            if (s == NULL) {
  Branch (1286:17): [True: 0, False: 0]
1287
                tok->done = E_ERROR;
1288
                return 0;
1289
            }
1290
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1291
        }
1292
        Py_DECREF(s);
1293
        // PyUnicode_FromFormatV() does not support %X
1294
        char hex[9];
1295
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1296
        if (Py_UNICODE_ISPRINTABLE(ch)) {
1297
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1298
        }
1299
        else {
1300
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
1301
        }
1302
        return 0;
1303
    }
1304
    Py_DECREF(s);
1305
    return 1;
1306
}
1307
1308
/* Consume the rest of a run of decimal digits, allowing single
   underscores between digits.

   Returns the first character after the run (already consumed by
   tok_nextc()).  When an underscore is not followed by a digit, a
   syntax error is raised and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c = tok_nextc(tok);

    for (;;) {
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1329
1330
/* Get next token, after space stripping etc. */
1331
1332
static inline int
1333
tok_continuation_line(struct tok_state *tok) {
1334
    int c = tok_nextc(tok);
1335
    if (c != '\n') {
  Branch (1335:9): [True: 7, False: 486]
1336
        tok->done = E_LINECONT;
1337
        return -1;
1338
    }
1339
    c = tok_nextc(tok);
1340
    if (c == EOF) {
  Branch (1340:9): [True: 9, False: 477]
1341
        tok->done = E_EOF;
1342
        tok->cur = tok->inp;
1343
        return -1;
1344
    } else {
1345
        tok_backup(tok, c);
1346
    }
1347
    return c;
1348
}
1349
1350
static int
1351
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1352
{
1353
    int c;
1354
    int blankline, nonascii;
1355
1356
    *p_start = *p_end = NULL;
1357
  nextline:
1358
    tok->start = NULL;
1359
    blankline = 0;
1360
1361
    /* Get indentation level */
1362
    if (tok->atbol) {
  Branch (1362:9): [True: 3.36M, False: 6.82M]
1363
        int col = 0;
1364
        int altcol = 0;
1365
        tok->atbol = 0;
1366
        int cont_line_col = 0;
1367
        for (;;) {
1368
            c = tok_nextc(tok);
1369
            if (c == ' ') {
  Branch (1369:17): [True: 4.90M, False: 3.36M]
1370
                col++, altcol++;
1371
            }
1372
            else if (c == '\t') {
  Branch (1372:22): [True: 907, False: 3.36M]
1373
                col = (col / tok->tabsize + 1) * tok->tabsize;
1374
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1375
            }
1376
            else if (c == '\014')  {/* Control-L (formfeed) */
  Branch (1376:22): [True: 7, False: 3.36M]
1377
                col = altcol = 0; /* For Emacs users */
1378
            }
1379
            else if (c == '\\') {
  Branch (1379:22): [True: 30, False: 3.36M]
1380
                // Indentation cannot be split over multiple physical lines
1381
                // using backslashes. This means that if we found a backslash
1382
                // preceded by whitespace, **the first one we find** determines
1383
                // the level of indentation of whatever comes next.
1384
                cont_line_col = cont_line_col ? 
cont_line_col6
:
col24
;
  Branch (1384:33): [True: 6, False: 24]
1385
                if ((c = tok_continuation_line(tok)) == -1) {
  Branch (1385:21): [True: 1, False: 29]
1386
                    return ERRORTOKEN;
1387
                }
1388
            }
1389
            else {
1390
                break;
1391
            }
1392
        }
1393
        tok_backup(tok, c);
1394
        if (c == '#' || 
c == '\n'3.33M
) {
  Branch (1394:13): [True: 33.8k, False: 3.33M]
  Branch (1394:25): [True: 2.62M, False: 707k]
1395
            /* Lines with only whitespace and/or comments
1396
               shouldn't affect the indentation and are
1397
               not passed to the parser as NEWLINE tokens,
1398
               except *totally* empty lines in interactive
1399
               mode, which signal the end of a command group. */
1400
            if (col == 0 && 
c == '\n'61.5k
&&
tok->prompt != NULL49.4k
) {
  Branch (1400:17): [True: 61.5k, False: 2.59M]
  Branch (1400:29): [True: 49.4k, False: 12.1k]
  Branch (1400:42): [True: 0, False: 49.4k]
1401
                blankline = 0; /* Let it through */
1402
            }
1403
            else if (tok->prompt != NULL && 
tok->lineno == 10
) {
  Branch (1403:22): [True: 0, False: 2.65M]
  Branch (1403:45): [True: 0, False: 0]
1404
                /* In interactive mode, if the first line contains
1405
                   only spaces and/or a comment, let it through. */
1406
                blankline = 0;
1407
                col = altcol = 0;
1408
            }
1409
            else {
1410
                blankline = 1; /* Ignore completely */
1411
            }
1412
            /* We can't jump back right here since we still
1413
               may need to skip to the end of a comment */
1414
        }
1415
        if (!blankline && 
tok->level == 0707k
) {
  Branch (1415:13): [True: 707k, False: 2.65M]
  Branch (1415:27): [True: 593k, False: 114k]
1416
            col = cont_line_col ? 
cont_line_col10
:
col593k
;
  Branch (1416:19): [True: 10, False: 593k]
1417
            altcol = cont_line_col ? 
cont_line_col10
:
altcol593k
;
  Branch (1417:22): [True: 10, False: 593k]
1418
            if (col == tok->indstack[tok->indent]) {
  Branch (1418:17): [True: 445k, False: 147k]
1419
                /* No change */
1420
                if (altcol != tok->altindstack[tok->indent]) {
  Branch (1420:21): [True: 2, False: 445k]
1421
                    return indenterror(tok);
1422
                }
1423
            }
1424
            else if (col > tok->indstack[tok->indent]) {
  Branch (1424:22): [True: 81.6k, False: 65.7k]
1425
                /* Indent -- always one */
1426
                if (tok->indent+1 >= MAXINDENT) {
  Branch (1426:21): [True: 0, False: 81.6k]
1427
                    tok->done = E_TOODEEP;
1428
                    tok->cur = tok->inp;
1429
                    return ERRORTOKEN;
1430
                }
1431
                if (altcol <= tok->altindstack[tok->indent]) {
  Branch (1431:21): [True: 0, False: 81.6k]
1432
                    return indenterror(tok);
1433
                }
1434
                tok->pendin++;
1435
                tok->indstack[++tok->indent] = col;
1436
                tok->altindstack[tok->indent] = altcol;
1437
            }
1438
            else /* col < tok->indstack[tok->indent] */ {
1439
                /* Dedent -- any number, must be consistent */
1440
                while (tok->indent > 0 &&
  Branch (1440:24): [True: 132k, False: 14.9k]
1441
                    
col < tok->indstack[tok->indent]132k
) {
  Branch (1441:21): [True: 81.6k, False: 50.7k]
1442
                    tok->pendin--;
1443
                    tok->indent--;
1444
                }
1445
                if (col != tok->indstack[tok->indent]) {
  Branch (1445:21): [True: 6, False: 65.7k]
1446
                    tok->done = E_DEDENT;
1447
                    tok->cur = tok->inp;
1448
                    return ERRORTOKEN;
1449
                }
1450
                if (altcol != tok->altindstack[tok->indent]) {
  Branch (1450:21): [True: 0, False: 65.7k]
1451
                    return indenterror(tok);
1452
                }
1453
            }
1454
        }
1455
    }
1456
1457
    tok->start = tok->cur;
1458
1459
    /* Return pending indents/dedents */
1460
    if (tok->pendin != 0) {
  Branch (1460:9): [True: 163k, False: 10.0M]
1461
        if (tok->pendin < 0) {
  Branch (1461:13): [True: 81.6k, False: 81.6k]
1462
            tok->pendin++;
1463
            return DEDENT;
1464
        }
1465
        else {
1466
            tok->pendin--;
1467
            return INDENT;
1468
        }
1469
    }
1470
1471
    /* Peek ahead at the next character */
1472
    c = tok_nextc(tok);
1473
    tok_backup(tok, c);
1474
    /* Check if we are closing an async function */
1475
    if (tok->async_def
  Branch (1475:9): [True: 141, False: 10.0M]
1476
        && 
!blankline141
  Branch (1476:12): [True: 135, False: 6]
1477
        /* Due to some implementation artifacts of type comments,
1478
         * a TYPE_COMMENT at the start of a function won't set an
1479
         * indentation level and it will produce a NEWLINE after it.
1480
         * To avoid spuriously ending an async function due to this,
1481
         * wait until we have some non-newline char in front of us. */
1482
        && 
c != '\n'135
  Branch (1482:12): [True: 114, False: 21]
1483
        && 
tok->level == 0114
  Branch (1483:12): [True: 75, False: 39]
1484
        /* There was a NEWLINE after ASYNC DEF,
1485
           so we're past the signature. */
1486
        && 
tok->async_def_nl75
  Branch (1486:12): [True: 36, False: 39]
1487
        /* Current indentation level is less than where
1488
           the async function was defined */
1489
        && 
tok->async_def_indent >= tok->indent36
)
  Branch (1489:12): [True: 9, False: 27]
1490
    {
1491
        tok->async_def = 0;
1492
        tok->async_def_indent = 0;
1493
        tok->async_def_nl = 0;
1494
    }
1495
1496
 again:
1497
    tok->start = NULL;
1498
    /* Skip spaces */
1499
    do {
1500
        c = tok_nextc(tok);
1501
    } while (c == ' ' || 
c == '\t'10.0M
||
c == '\014'10.0M
);
  Branch (1501:14): [True: 1.18M, False: 10.0M]
  Branch (1501:26): [True: 4, False: 10.0M]
  Branch (1501:39): [True: 0, False: 10.0M]
1502
1503
    /* Set start of current token */
1504
    tok->start = tok->cur - 1;
1505
1506
    /* Skip comment, unless it's a type comment */
1507
    if (c == '#') {
  Branch (1507:9): [True: 38.2k, False: 9.99M]
1508
        const char *prefix, *p, *type_start;
1509
1510
        while (c != EOF && 
c != '\n'1.73M
) {
  Branch (1510:16): [True: 1.73M, False: 1]
  Branch (1510:28): [True: 1.69M, False: 38.2k]
1511
            c = tok_nextc(tok);
1512
        }
1513
1514
        if (tok->type_comments) {
  Branch (1514:13): [True: 611, False: 37.6k]
1515
            p = tok->start;
1516
            prefix = type_comment_prefix;
1517
            while (*prefix && 
p < tok->cur4.88k
) {
  Branch (1517:20): [True: 4.88k, False: 611]
  Branch (1517:31): [True: 4.88k, False: 0]
1518
                if (*prefix == ' ') {
  Branch (1518:21): [True: 1.22k, False: 3.66k]
1519
                    while (*p == ' ' || 
*p == '\t'1.22k
) {
  Branch (1519:28): [True: 1.22k, False: 1.22k]
  Branch (1519:41): [True: 0, False: 1.22k]
1520
                        p++;
1521
                    }
1522
                } else if (*prefix == *p) {
  Branch (1522:28): [True: 3.66k, False: 0]
1523
                    p++;
1524
                } else {
1525
                    break;
1526
                }
1527
1528
                prefix++;
1529
            }
1530
1531
            /* This is a type comment if we matched all of type_comment_prefix. */
1532
            if (!*prefix) {
  Branch (1532:17): [True: 611, False: 0]
1533
                int is_type_ignore = 1;
1534
                const char *ignore_end = p + 6;
1535
                tok_backup(tok, c);  /* don't eat the newline or EOF */
1536
1537
                type_start = p;
1538
1539
                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1540
                 * or anything ASCII and non-alphanumeric. */
1541
                is_type_ignore = (
1542
                    tok->cur >= ignore_end && 
memcmp(p, "ignore", 6) == 0165
  Branch (1542:21): [True: 165, False: 446]
  Branch (1542:47): [True: 92, False: 73]
1543
                    && 
!(92
tok->cur > ignore_end92
  Branch (1543:26): [True: 60, False: 32]
1544
                         && 
(60
(unsigned char)ignore_end[0] >= 12860
||
Py_ISALNUM51
(ignore_end[0]))));
  Branch (1544:30): [True: 9, False: 51]
1545
1546
                if (is_type_ignore) {
  Branch (1546:21): [True: 74, False: 537]
1547
                    *p_start = ignore_end;
1548
                    *p_end = tok->cur;
1549
1550
                    /* If this type ignore is the only thing on the line, consume the newline also. */
1551
                    if (blankline) {
  Branch (1551:25): [True: 0, False: 74]
1552
                        tok_nextc(tok);
1553
                        tok->atbol = 1;
1554
                    }
1555
                    return TYPE_IGNORE;
1556
                } else {
1557
                    *p_start = type_start;  /* after type_comment_prefix */
1558
                    *p_end = tok->cur;
1559
                    return TYPE_COMMENT;
1560
                }
1561
            }
1562
        }
1563
    }
1564
1565
    if (tok->done == E_INTERACT_STOP) {
  Branch (1565:9): [True: 0, False: 10.0M]
1566
        return ENDMARKER;
1567
    }
1568
1569
    /* Check for EOF and errors now */
1570
    if (c == EOF) {
  Branch (1570:9): [True: 55.7k, False: 9.97M]
1571
        if (tok->level) {
  Branch (1571:13): [True: 185, False: 55.5k]
1572
            return ERRORTOKEN;
1573
        }
1574
        return tok->done == E_EOF ? ENDMARKER : 
ERRORTOKEN0
;
  Branch (1574:16): [True: 55.5k, False: 0]
1575
    }
1576
1577
    /* Identifier (most frequent token!) */
1578
    nonascii = 0;
1579
    if (is_potential_identifier_start(c)) {
1580
        /* Process the various legal combinations of b"", r"", u"", and f"". */
1581
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1582
        while (1) {
  Branch (1582:16): [Folded - Ignored]
1583
            if (!(saw_b || 
saw_u2.33M
||
saw_f2.32M
) &&
(2.27M
c == 'b'2.27M
||
c == 'B'1.74M
))
  Branch (1583:19): [True: 524k, False: 2.33M]
  Branch (1583:28): [True: 6.35k, False: 2.32M]
  Branch (1583:37): [True: 52.4k, False: 2.27M]
  Branch (1583:48): [True: 523k, False: 1.74M]
  Branch (1583:60): [True: 3.92k, False: 1.74M]
1584
                saw_b = 1;
1585
            /* Since this is a backwards compatibility support literal we don't
1586
               want to support it in arbitrary order like byte literals. */
1587
            else if (!(saw_b || 
saw_u1.80M
||
saw_r1.79M
||
saw_f1.74M
)
  Branch (1587:24): [True: 524k, False: 1.80M]
  Branch (1587:33): [True: 6.35k, False: 1.79M]
  Branch (1587:42): [True: 53.3k, False: 1.74M]
  Branch (1587:51): [True: 47.2k, False: 1.69M]
1588
                     && 
(1.69M
c == 'u'1.69M
||
c == 'U'1.69M
)) {
  Branch (1588:26): [True: 5.14k, False: 1.69M]
  Branch (1588:37): [True: 1.25k, False: 1.69M]
1589
                saw_u = 1;
1590
            }
1591
            /* ur"" and ru"" are not supported */
1592
            else if (!(saw_r || 
saw_u2.26M
) &&
(2.26M
c == 'r'2.26M
||
c == 'R'2.20M
)) {
  Branch (1592:24): [True: 54.7k, False: 2.26M]
  Branch (1592:33): [True: 6.35k, False: 2.26M]
  Branch (1592:44): [True: 54.0k, False: 2.20M]
  Branch (1592:56): [True: 1.55k, False: 2.20M]
1593
                saw_r = 1;
1594
            }
1595
            else if (!(saw_f || 
saw_b2.21M
||
saw_u1.69M
) &&
(1.68M
c == 'f'1.68M
||
c == 'F'1.64M
)) {
  Branch (1595:24): [True: 47.3k, False: 2.21M]
  Branch (1595:33): [True: 523k, False: 1.69M]
  Branch (1595:42): [True: 6.35k, False: 1.68M]
  Branch (1595:53): [True: 46.8k, False: 1.64M]
  Branch (1595:65): [True: 4.91k, False: 1.63M]
1596
                saw_f = 1;
1597
            }
1598
            else {
1599
                break;
1600
            }
1601
            c = tok_nextc(tok);
1602
            if (c == '"' || 
c == '\''639k
) {
  Branch (1602:17): [True: 2.49k, False: 639k]
  Branch (1602:29): [True: 7.24k, False: 631k]
1603
                goto letter_quote;
1604
            }
1605
        }
1606
        
while (2.21M
is_potential_identifier_char(c)) {
1607
            if (c >= 128) {
  Branch (1607:17): [True: 98, False: 5.69M]
1608
                nonascii = 1;
1609
            }
1610
            c = tok_nextc(tok);
1611
        }
1612
        tok_backup(tok, c);
1613
        if (nonascii && 
!verify_identifier(tok)48
) {
  Branch (1613:13): [True: 48, False: 2.21M]
  Branch (1613:25): [True: 12, False: 36]
1614
            return ERRORTOKEN;
1615
        }
1616
1617
        *p_start = tok->start;
1618
        *p_end = tok->cur;
1619
1620
        /* async/await parsing block. */
1621
        if (tok->cur - tok->start == 5 && 
tok->start[0] == 'a'75.4k
) {
  Branch (1621:13): [True: 75.4k, False: 2.13M]
  Branch (1621:43): [True: 2.20k, False: 73.2k]
1622
            /* May be an 'async' or 'await' token.  For Python 3.7 or
1623
               later we recognize them unconditionally.  For Python
1624
               3.5 or 3.6 we recognize 'async' in front of 'def', and
1625
               either one inside of 'async def'.  (Technically we
1626
               shouldn't recognize these at all for 3.4 or earlier,
1627
               but there's no *valid* Python 3.4 code that would be
1628
               rejected, and async functions will be rejected in a
1629
               later phase.) */
1630
            if (!tok->async_hacks || 
tok->async_def24
) {
  Branch (1630:17): [True: 2.18k, False: 24]
  Branch (1630:38): [True: 9, False: 15]
1631
                /* Always recognize the keywords. */
1632
                if (memcmp(tok->start, "async", 5) == 0) {
  Branch (1632:21): [True: 957, False: 1.23k]
1633
                    return ASYNC;
1634
                }
1635
                if (memcmp(tok->start, "await", 5) == 0) {
  Branch (1635:21): [True: 219, False: 1.01k]
1636
                    return AWAIT;
1637
                }
1638
            }
1639
            else if (memcmp(tok->start, "async", 5) == 0) {
  Branch (1639:22): [True: 12, False: 3]
1640
                /* The current token is 'async'.
1641
                   Look ahead one token to see if that is 'def'. */
1642
1643
                struct tok_state ahead_tok;
1644
                const char *ahead_tok_start = NULL;
1645
                const char *ahead_tok_end = NULL;
1646
                int ahead_tok_kind;
1647
1648
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1649
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1650
                                         &ahead_tok_end);
1651
1652
                if (ahead_tok_kind == NAME
  Branch (1652:21): [True: 9, False: 3]
1653
                    && 
ahead_tok.cur - ahead_tok.start == 39
  Branch (1653:24): [True: 9, False: 0]
1654
                    && 
memcmp(ahead_tok.start, "def", 3) == 09
)
  Branch (1654:24): [True: 9, False: 0]
1655
                {
1656
                    /* The next token is going to be 'def', so instead of
1657
                       returning a plain NAME token, return ASYNC. */
1658
                    tok->async_def_indent = tok->indent;
1659
                    tok->async_def = 1;
1660
                    return ASYNC;
1661
                }
1662
            }
1663
        }
1664
1665
        return NAME;
1666
    }
1667
1668
    /* Newline */
1669
    if (c == '\n') {
  Branch (1669:9): [True: 3.31M, False: 4.44M]
1670
        tok->atbol = 1;
1671
        if (blankline || 
tok->level > 0651k
) {
  Branch (1671:13): [True: 2.65M, False: 651k]
  Branch (1671:26): [True: 114k, False: 536k]
1672
            goto nextline;
1673
        }
1674
        *p_start = tok->start;
1675
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1676
        tok->cont_line = 0;
1677
        if (tok->async_def) {
  Branch (1677:13): [True: 21, False: 536k]
1678
            /* We're somewhere inside an 'async def' function, and
1679
               we've encountered a NEWLINE after its signature. */
1680
            tok->async_def_nl = 1;
1681
        }
1682
        return NEWLINE;
1683
    }
1684
1685
    /* Period or number starting with period? */
1686
    if (c == '.') {
  Branch (1686:9): [True: 423k, False: 4.01M]
1687
        c = tok_nextc(tok);
1688
        if (isdigit(c)) {
1689
            goto fraction;
1690
        } else if (c == '.') {
  Branch (1690:20): [True: 1.40k, False: 422k]
1691
            c = tok_nextc(tok);
1692
            if (c == '.') {
  Branch (1692:17): [True: 1.39k, False: 15]
1693
                *p_start = tok->start;
1694
                *p_end = tok->cur;
1695
                return ELLIPSIS;
1696
            }
1697
            else {
1698
                tok_backup(tok, c);
1699
            }
1700
            tok_backup(tok, '.');
1701
        }
1702
        else {
1703
            tok_backup(tok, c);
1704
        }
1705
        *p_start = tok->start;
1706
        *p_end = tok->cur;
1707
        return DOT;
1708
    }
1709
1710
    /* Number */
1711
    if (isdigit(c)) {
1712
        if (c == '0') {
  Branch (1712:13): [True: 320k, False: 325k]
1713
            /* Hex, octal or binary -- maybe. */
1714
            c = tok_nextc(tok);
1715
            if (c == 'x' || 
c == 'X'318k
) {
  Branch (1715:17): [True: 2.30k, False: 318k]
  Branch (1715:29): [True: 2, False: 318k]
1716
                /* Hex */
1717
                c = tok_nextc(tok);
1718
                do {
1719
                    if (c == '_') {
  Branch (1719:25): [True: 17, False: 2.30k]
1720
                        c = tok_nextc(tok);
1721
                    }
1722
                    if (!isxdigit(c)) {
  Branch (1722:25): [True: 16, False: 2.30k]
1723
                        tok_backup(tok, c);
1724
                        return syntaxerror(tok, "invalid hexadecimal literal");
1725
                    }
1726
                    
do 2.30k
{
1727
                        c = tok_nextc(tok);
1728
                    } while (isxdigit(c));
1729
                } while (c == '_');
  Branch (1729:26): [True: 13, False: 2.29k]
1730
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
  Branch (1730:21): [True: 12, False: 2.28k]
1731
                    return ERRORTOKEN;
1732
                }
1733
            }
1734
            else if (c == 'o' || 
c == 'O'318k
) {
  Branch (1734:22): [True: 142, False: 318k]
  Branch (1734:34): [True: 3, False: 318k]
1735
                /* Octal */
1736
                c = tok_nextc(tok);
1737
                do {
1738
                    if (c == '_') {
  Branch (1738:25): [True: 12, False: 143]
1739
                        c = tok_nextc(tok);
1740
                    }
1741
                    if (c < '0' || 
c >= '8'150
) {
  Branch (1741:25): [True: 5, False: 150]
  Branch (1741:36): [True: 7, False: 143]
1742
                        if (isdigit(c)) {
1743
                            return syntaxerror(tok,
1744
                                    "invalid digit '%c' in octal literal", c);
1745
                        }
1746
                        else {
1747
                            tok_backup(tok, c);
1748
                            return syntaxerror(tok, "invalid octal literal");
1749
                        }
1750
                    }
1751
                    
do 143
{
1752
                        c = tok_nextc(tok);
1753
                    } while ('0' <= c && 
c < '8'527
);
  Branch (1753:30): [True: 527, False: 95]
  Branch (1753:42): [True: 479, False: 48]
1754
                } while (c == '_');
  Branch (1754:26): [True: 10, False: 133]
1755
                if (isdigit(c)) {
1756
                    return syntaxerror(tok,
1757
                            "invalid digit '%c' in octal literal", c);
1758
                }
1759
                if (!verify_end_of_number(tok, c, "octal")) {
  Branch (1759:21): [True: 11, False: 119]
1760
                    return ERRORTOKEN;
1761
                }
1762
            }
1763
            else if (c == 'b' || 
c == 'B'318k
) {
  Branch (1763:22): [True: 54, False: 318k]
  Branch (1763:34): [True: 3, False: 318k]
1764
                /* Binary */
1765
                c = tok_nextc(tok);
1766
                do {
1767
                    if (c == '_') {
  Branch (1767:25): [True: 10, False: 55]
1768
                        c = tok_nextc(tok);
1769
                    }
1770
                    if (c != '0' && 
c != '1'59
) {
  Branch (1770:25): [True: 59, False: 6]
  Branch (1770:37): [True: 12, False: 47]
1771
                        if (isdigit(c)) {
1772
                            return syntaxerror(tok,
1773
                                    "invalid digit '%c' in binary literal", c);
1774
                        }
1775
                        else {
1776
                            tok_backup(tok, c);
1777
                            return syntaxerror(tok, "invalid binary literal");
1778
                        }
1779
                    }
1780
                    
do 53
{
1781
                        c = tok_nextc(tok);
1782
                    } while (c == '0' || 
c == '1'384
);
  Branch (1782:30): [True: 235, False: 384]
  Branch (1782:42): [True: 331, False: 53]
1783
                } while (c == '_');
  Branch (1783:26): [True: 8, False: 45]
1784
                if (isdigit(c)) {
1785
                    return syntaxerror(tok,
1786
                            "invalid digit '%c' in binary literal", c);
1787
                }
1788
                if (!verify_end_of_number(tok, c, "binary")) {
  Branch (1788:21): [True: 11, False: 32]
1789
                    return ERRORTOKEN;
1790
                }
1791
            }
1792
            else {
1793
                int nonzero = 0;
1794
                /* maybe old-style octal; c is first char of it */
1795
                /* in any case, allow '0' as a literal */
1796
                while (1) {
  Branch (1796:24): [Folded - Ignored]
1797
                    if (c == '_') {
  Branch (1797:25): [True: 13, False: 318k]
1798
                        c = tok_nextc(tok);
1799
                        if (!isdigit(c)) {
  Branch (1799:29): [True: 4, False: 9]
1800
                            tok_backup(tok, c);
1801
                            return syntaxerror(tok, "invalid decimal literal");
1802
                        }
1803
                    }
1804
                    if (c != '0') {
  Branch (1804:25): [True: 318k, False: 139]
1805
                        break;
1806
                    }
1807
                    c = tok_nextc(tok);
1808
                }
1809
                char* zeros_end = tok->cur;
1810
                if (isdigit(c)) {
1811
                    nonzero = 1;
1812
                    c = tok_decimal_tail(tok);
1813
                    if (c == 0) {
  Branch (1813:25): [True: 0, False: 26]
1814
                        return ERRORTOKEN;
1815
                    }
1816
                }
1817
                if (c == '.') {
  Branch (1817:21): [True: 445, False: 317k]
1818
                    c = tok_nextc(tok);
1819
                    goto fraction;
1820
                }
1821
                else if (c == 'e' || 
c == 'E'317k
) {
  Branch (1821:26): [True: 11, False: 317k]
  Branch (1821:38): [True: 2, False: 317k]
1822
                    goto exponent;
1823
                }
1824
                else if (c == 'j' || 
c == 'J'317k
) {
  Branch (1824:26): [True: 128, False: 317k]
  Branch (1824:38): [True: 0, False: 317k]
1825
                    goto imaginary;
1826
                }
1827
                else if (nonzero) {
  Branch (1827:26): [True: 13, False: 317k]
1828
                    /* Old-style octal: now disallowed. */
1829
                    tok_backup(tok, c);
1830
                    return syntaxerror_known_range(
1831
                            tok, (int)(tok->start + 1 - tok->line_start),
1832
                            (int)(zeros_end - tok->line_start),
1833
                            "leading zeros in decimal integer "
1834
                            "literals are not permitted; "
1835
                            "use an 0o prefix for octal integers");
1836
                }
1837
                if (!verify_end_of_number(tok, c, "decimal")) {
  Branch (1837:21): [True: 7, False: 317k]
1838
                    return ERRORTOKEN;
1839
                }
1840
            }
1841
        }
1842
        else {
1843
            /* Decimal */
1844
            c = tok_decimal_tail(tok);
1845
            if (c == 0) {
  Branch (1845:17): [True: 11, False: 325k]
1846
                return ERRORTOKEN;
1847
            }
1848
            {
1849
                /* Accept floating point numbers. */
1850
                if (c == '.') {
  Branch (1850:21): [True: 2.05k, False: 323k]
1851
                    c = tok_nextc(tok);
1852
        fraction:
1853
                    /* Fraction */
1854
                    if (isdigit(c)) {
1855
                        c = tok_decimal_tail(tok);
1856
                        if (c == 0) {
  Branch (1856:29): [True: 10, False: 2.42k]
1857
                            return ERRORTOKEN;
1858
                        }
1859
                    }
1860
                }
1861
                if (c == 'e' || 
c == 'E'324k
) {
  Branch (1861:21): [True: 1.52k, False: 324k]
  Branch (1861:33): [True: 1.02k, False: 323k]
1862
                    int e;
1863
                  exponent:
1864
                    e = c;
1865
                    /* Exponent part */
1866
                    c = tok_nextc(tok);
1867
                    if (c == '+' || 
c == '-'1.95k
) {
  Branch (1867:25): [True: 604, False: 1.95k]
  Branch (1867:37): [True: 1.23k, False: 714]
1868
                        c = tok_nextc(tok);
1869
                        if (!isdigit(c)) {
  Branch (1869:29): [True: 8, False: 1.83k]
1870
                            tok_backup(tok, c);
1871
                            return syntaxerror(tok, "invalid decimal literal");
1872
                        }
1873
                    } else 
if (714
!isdigit714
(c)) {
  Branch (1873:32): [True: 15, False: 699]
1874
                        tok_backup(tok, c);
1875
                        if (!verify_end_of_number(tok, e, "decimal")) {
  Branch (1875:29): [True: 10, False: 5]
1876
                            return ERRORTOKEN;
1877
                        }
1878
                        tok_backup(tok, e);
1879
                        *p_start = tok->start;
1880
                        *p_end = tok->cur;
1881
                        return NUMBER;
1882
                    }
1883
                    c = tok_decimal_tail(tok);
1884
                    if (c == 0) {
  Branch (1884:25): [True: 6, False: 2.52k]
1885
                        return ERRORTOKEN;
1886
                    }
1887
                }
1888
                if (c == 'j' || 
c == 'J'325k
) {
  Branch (1888:21): [True: 490, False: 325k]
  Branch (1888:33): [True: 0, False: 325k]
1889
                    /* Imaginary part */
1890
        imaginary:
1891
                    c = tok_nextc(tok);
1892
                    if (!verify_end_of_number(tok, c, "imaginary")) {
  Branch (1892:25): [True: 10, False: 608]
1893
                        return ERRORTOKEN;
1894
                    }
1895
                }
1896
                else if (!verify_end_of_number(tok, c, "decimal")) {
  Branch (1896:26): [True: 27, False: 325k]
1897
                    return ERRORTOKEN;
1898
                }
1899
            }
1900
        }
1901
        tok_backup(tok, c);
1902
        *p_start = tok->start;
1903
        *p_end = tok->cur;
1904
        return NUMBER;
1905
    }
1906
1907
  letter_quote:
1908
    /* String */
1909
    if (c == '\'' || 
c == '"'3.27M
) {
  Branch (1909:9): [True: 107k, False: 3.27M]
  Branch (1909:22): [True: 98.8k, False: 3.17M]
1910
        int quote = c;
1911
        int quote_size = 1;             /* 1 or 3 */
1912
        int end_quote_size = 0;
1913
1914
        /* Nodes of type STRING, especially multi line strings
1915
           must be handled differently in order to get both
1916
           the starting line number and the column offset right.
1917
           (cf. issue 16806) */
1918
        tok->first_lineno = tok->lineno;
1919
        tok->multi_line_start = tok->line_start;
1920
1921
        /* Find the quote size and start of string */
1922
        c = tok_nextc(tok);
1923
        if (c == quote) {
  Branch (1923:13): [True: 76.2k, False: 129k]
1924
            c = tok_nextc(tok);
1925
            if (c == quote) {
  Branch (1925:17): [True: 8.52k, False: 67.6k]
1926
                quote_size = 3;
1927
            }
1928
            else {
1929
                end_quote_size = 1;     /* empty string found */
1930
            }
1931
        }
1932
        if (c != quote) {
  Branch (1932:13): [True: 197k, False: 8.52k]
1933
            tok_backup(tok, c);
1934
        }
1935
1936
        /* Get rest of string */
1937
        while (end_quote_size != quote_size) {
  Branch (1937:16): [True: 4.20M, False: 206k]
1938
            c = tok_nextc(tok);
1939
            if (c == EOF || 
(4.20M
quote_size == 14.20M
&&
c == '\n'1.72M
)) {
  Branch (1939:17): [True: 11, False: 4.20M]
  Branch (1939:30): [True: 1.72M, False: 2.48M]
  Branch (1939:49): [True: 5, False: 1.72M]
1940
                assert(tok->multi_line_start != NULL);
1941
                // shift the tok_state's location into
1942
                // the start of string, and report the error
1943
                // from the initial quote character
1944
                tok->cur = (char *)tok->start;
1945
                tok->cur++;
1946
                tok->line_start = tok->multi_line_start;
1947
                int start = tok->lineno;
1948
                tok->lineno = tok->first_lineno;
1949
                if (quote_size == 3) {
  Branch (1949:21): [True: 5, False: 11]
1950
                    syntaxerror(tok, "unterminated triple-quoted string literal"
1951
                                     " (detected at line %d)", start);
1952
                    if (c != '\n') {
  Branch (1952:25): [True: 5, False: 0]
1953
                        tok->done = E_EOFS;
1954
                    }
1955
                    return ERRORTOKEN;
1956
                }
1957
                else {
1958
                    syntaxerror(tok, "unterminated string literal (detected at"
1959
                                     " line %d)", start);
1960
                    if (c != '\n') {
  Branch (1960:25): [True: 6, False: 5]
1961
                        tok->done = E_EOLS;
1962
                    }
1963
                    return ERRORTOKEN;
1964
                }
1965
            }
1966
            if (c == quote) {
  Branch (1966:17): [True: 160k, False: 4.04M]
1967
                end_quote_size += 1;
1968
            }
1969
            else {
1970
                end_quote_size = 0;
1971
                if (c == '\\') {
  Branch (1971:21): [True: 29.6k, False: 4.01M]
1972
                    tok_nextc(tok);  /* skip escaped char */
1973
                }
1974
            }
1975
        }
1976
1977
        *p_start = tok->start;
1978
        *p_end = tok->cur;
1979
        return STRING;
1980
    }
1981
1982
    /* Line continuation */
1983
    if (c == '\\') {
  Branch (1983:9): [True: 463, False: 3.17M]
1984
        if ((c = tok_continuation_line(tok)) == -1) {
  Branch (1984:13): [True: 15, False: 448]
1985
            return ERRORTOKEN;
1986
        }
1987
        tok->cont_line = 1;
1988
        goto again; /* Read next line */
1989
    }
1990
1991
    /* Check for two-character token */
1992
    {
1993
        int c2 = tok_nextc(tok);
1994
        int token = _PyToken_TwoChars(c, c2);
1995
        if (token != OP) {
  Branch (1995:13): [True: 18.4k, False: 3.15M]
1996
            int c3 = tok_nextc(tok);
1997
            int token3 = _PyToken_ThreeChars(c, c2, c3);
1998
            if (token3 != OP) {
  Branch (1998:17): [True: 89, False: 18.3k]
1999
                token = token3;
2000
            }
2001
            else {
2002
                tok_backup(tok, c3);
2003
            }
2004
            *p_start = tok->start;
2005
            *p_end = tok->cur;
2006
            return token;
2007
        }
2008
        tok_backup(tok, c2);
2009
    }
2010
2011
    /* Keep track of parentheses nesting level */
2012
    switch (c) {
  Branch (2012:13): [True: 1.35M, False: 1.80M]
2013
    case '(':
  Branch (2013:5): [True: 565k, False: 2.58M]
2014
    case '[':
  Branch (2014:5): [True: 332k, False: 2.82M]
2015
    case '{':
  Branch (2015:5): [True: 4.78k, False: 3.15M]
2016
        if (tok->level >= MAXLEVEL) {
  Branch (2016:13): [True: 1, False: 902k]
2017
            return syntaxerror(tok, "too many nested parentheses");
2018
        }
2019
        tok->parenstack[tok->level] = c;
2020
        tok->parenlinenostack[tok->level] = tok->lineno;
2021
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2022
        tok->level++;
2023
        break;
2024
    case ')':
  Branch (2024:5): [True: 565k, False: 2.59M]
2025
    case ']':
  Branch (2025:5): [True: 332k, False: 2.82M]
2026
    case '}':
  Branch (2026:5): [True: 4.72k, False: 3.15M]
2027
        if (!tok->level) {
  Branch (2027:13): [True: 12, False: 902k]
2028
            return syntaxerror(tok, "unmatched '%c'", c);
2029
        }
2030
        tok->level--;
2031
        int opening = tok->parenstack[tok->level];
2032
        if (!((opening == '(' && 
c == ')'565k
) ||
  Branch (2032:16): [True: 565k, False: 336k]
  Branch (2032:34): [True: 565k, False: 2]
2033
              
(336k
opening == '['336k
&&
c == ']'332k
) ||
  Branch (2033:16): [True: 332k, False: 4.72k]
  Branch (2033:34): [True: 332k, False: 2]
2034
              
(4.72k
opening == '{'4.72k
&&
c == '}'4.72k
)))
  Branch (2034:16): [True: 4.72k, False: 4]
  Branch (2034:34): [True: 4.72k, False: 1]
2035
        {
2036
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
  Branch (2036:17): [True: 0, False: 5]
2037
                return syntaxerror(tok,
2038
                        "closing parenthesis '%c' does not match "
2039
                        "opening parenthesis '%c' on line %d",
2040
                        c, opening, tok->parenlinenostack[tok->level]);
2041
            }
2042
            else {
2043
                return syntaxerror(tok,
2044
                        "closing parenthesis '%c' does not match "
2045
                        "opening parenthesis '%c'",
2046
                        c, opening);
2047
            }
2048
        }
2049
        break;
2050
    }
2051
2052
    if (!Py_UNICODE_ISPRINTABLE(c)) {
  Branch (2052:9): [True: 1, False: 3.15M]
2053
        char hex[9];
2054
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2055
        return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2056
    }
2057
2058
    /* Punctuation character */
2059
    *p_start = tok->start;
2060
    *p_end = tok->cur;
2061
    return _PyToken_OneChar(c);
2062
}
2063
2064
int
2065
_PyTokenizer_Get(struct tok_state *tok,
2066
                 const char **p_start, const char **p_end)
2067
{
2068
    int result = tok_get(tok, p_start, p_end);
2069
    if (tok->decoding_erred) {
  Branch (2069:9): [True: 0, False: 7.42M]
2070
        result = ERRORTOKEN;
2071
        tok->done = E_DECODE;
2072
    }
2073
    return result;
2074
}
2075
2076
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
/* fdopen() on a *borrowed* file descriptor.

   WASI does not provide dup(), and Emscripten's dup() emulation (built on
   open()) is slow, so on these targets the descriptor is smuggled through
   the opaque fopencookie() cookie pointer instead of being duplicated. */
typedef union {
    void *cookie;
    int fd;
} borrowed;

/* fopencookie() read callback: unpack the descriptor from the cookie and
   forward the request to read(). */
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed unpacked = {.cookie = cookie};
    return read(unpacked.fd, (void *)buf, size);
}

/* Wrap `fd` in a FILE* without duplicating it.  Only a read callback is
   installed: the write, seek and close slots are NULL (reading is the only
   supported operation; seek fails, close and write are no-ops). */
static FILE *
fdopen_borrow(int fd) {
    borrowed packed = {.fd = fd};
    cookie_io_functions_t callbacks = {borrow_read, NULL, NULL, NULL};
    return fopencookie(packed.cookie, "r", callbacks);
}
#else
/* Duplicate `fd` with _Py_dup() and open the duplicate for reading.
   Returns NULL if the duplication fails, otherwise whatever fdopen() returns.

   NOTE(review): if fdopen() itself fails, the duplicated descriptor is not
   closed here -- confirm whether that leak matters to callers. */
static FILE *
fdopen_borrow(int fd) {
    int dup_fd = _Py_dup(fd);
    return (dup_fd < 0) ? NULL : fdopen(dup_fd, "r");
}
#endif
2108
2109
/* Get the encoding of a Python file. Check for the coding cookie and check if
2110
   the file starts with a BOM.
2111
2112
   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2113
   encoding in the first or second line of the file (in which case the encoding
2114
   should be assumed to be UTF-8).
2115
2116
   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2117
   by the caller. */
2118
2119
char *
2120
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2121
{
2122
    struct tok_state *tok;
2123
    FILE *fp;
2124
    const char *p_start = NULL;
2125
    const char *p_end = NULL;
2126
    char *encoding = NULL;
2127
2128
    fp = fdopen_borrow(fd);
2129
    if (fp == NULL) {
  Branch (2129:9): [True: 0, False: 256]
2130
        return NULL;
2131
    }
2132
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2133
    if (tok == NULL) {
  Branch (2133:9): [True: 0, False: 256]
2134
        fclose(fp);
2135
        return NULL;
2136
    }
2137
    if (filename != NULL) {
  Branch (2137:9): [True: 256, False: 0]
2138
        Py_INCREF(filename);
2139
        tok->filename = filename;
2140
    }
2141
    else {
2142
        tok->filename = PyUnicode_FromString("<string>");
2143
        if (tok->filename == NULL) {
  Branch (2143:13): [True: 0, False: 0]
2144
            fclose(fp);
2145
            _PyTokenizer_Free(tok);
2146
            return encoding;
2147
        }
2148
    }
2149
    
while (256
tok->lineno < 2 &&
tok->done == 622
E_OK622
) {
  Branch (2149:12): [True: 622, False: 254]
  Branch (2149:31): [True: 620, False: 2]
2150
        _PyTokenizer_Get(tok, &p_start, &p_end);
2151
    }
2152
    fclose(fp);
2153
    if (tok->encoding) {
  Branch (2153:9): [True: 0, False: 256]
2154
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2155
        if (encoding) {
  Branch (2155:13): [True: 0, False: 0]
2156
            strcpy(encoding, tok->encoding);
2157
        }
2158
    }
2159
    _PyTokenizer_Free(tok);
2160
    return encoding;
2161
}
2162
2163
#ifdef Py_DEBUG
/* Debug helper: write the name of token kind `type` to stderr, looked up in
   _PyParser_TokenNames.  For NAME, NUMBER, STRING and OP tokens the text
   between `start` and `end` is appended in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);

    /* Every other token kind is printed by name only. */
    if (type != NAME && type != NUMBER && type != STRING && type != OP) {
        return;
    }

    int text_len = (int)(end - start);
    fprintf(stderr, "(%.*s)", text_len, start);
}
#endif  // Py_DEBUG