/home/mdboom/Work/builds/cpython/Parser/tokenizer.c
Line | Count | Source (jump to first uncovered line) |
1 | |
2 | /* Tokenizer implementation */ |
3 | |
4 | #define PY_SSIZE_T_CLEAN |
5 | #include "Python.h" |
6 | #include "pycore_call.h" // _PyObject_CallNoArgs() |
7 | |
8 | #include <ctype.h> |
9 | #include <assert.h> |
10 | |
11 | #include "tokenizer.h" |
12 | #include "errcode.h" |
13 | |
14 | #include "unicodeobject.h" |
15 | #include "bytesobject.h" |
16 | #include "fileobject.h" |
17 | #include "abstract.h" |
18 | |
19 | /* Alternate tab spacing */ |
20 | #define ALTTABSIZE 1 |
21 | |
/* True if C may begin an identifier: an ASCII letter, '_' or any value
   >= 128.  The argument is parenthesized so that an expression (for example
   a conditional) can be passed safely; it is still evaluated several times,
   so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27 | |
/* True if C may appear inside an identifier: an ASCII letter or digit, '_'
   or any value >= 128.  The argument is parenthesized so that an expression
   can be passed safely; it is still evaluated several times, so it must not
   have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34 | |
35 | |
36 | /* Don't ever change this -- it would break the portability of Python code */ |
37 | #define TABSIZE 8 |
38 | |
39 | /* Forward */ |
40 | static struct tok_state *tok_new(void); |
41 | static int tok_nextc(struct tok_state *tok); |
42 | static void tok_backup(struct tok_state *tok, int c); |
43 | static int syntaxerror(struct tok_state *tok, const char *format, ...); |
44 | |
45 | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
46 | tokenizing. */ |
47 | static const char* type_comment_prefix = "# type: "; |
48 | |
49 | /* Create and initialize a new tok_state structure */ |
50 | |
51 | static struct tok_state * |
52 | tok_new(void) |
53 | { |
54 | struct tok_state *tok = (struct tok_state *)PyMem_Malloc( |
55 | sizeof(struct tok_state)); |
56 | if (tok == NULL) Branch (56:9): [True: 0, False: 132k]
|
57 | return NULL; |
58 | tok->buf = tok->cur = tok->inp = NULL; |
59 | tok->fp_interactive = 0; |
60 | tok->interactive_src_start = NULL; |
61 | tok->interactive_src_end = NULL; |
62 | tok->start = NULL; |
63 | tok->end = NULL; |
64 | tok->done = E_OK; |
65 | tok->fp = NULL; |
66 | tok->input = NULL; |
67 | tok->tabsize = TABSIZE; |
68 | tok->indent = 0; |
69 | tok->indstack[0] = 0; |
70 | tok->atbol = 1; |
71 | tok->pendin = 0; |
72 | tok->prompt = tok->nextprompt = NULL; |
73 | tok->lineno = 0; |
74 | tok->level = 0; |
75 | tok->altindstack[0] = 0; |
76 | tok->decoding_state = STATE_INIT; |
77 | tok->decoding_erred = 0; |
78 | tok->enc = NULL; |
79 | tok->encoding = NULL; |
80 | tok->cont_line = 0; |
81 | tok->filename = NULL; |
82 | tok->decoding_readline = NULL; |
83 | tok->decoding_buffer = NULL; |
84 | tok->type_comments = 0; |
85 | tok->async_hacks = 0; |
86 | tok->async_def = 0; |
87 | tok->async_def_indent = 0; |
88 | tok->async_def_nl = 0; |
89 | tok->interactive_underflow = IUNDERFLOW_NORMAL; |
90 | tok->str = NULL; |
91 | #ifdef Py_DEBUG |
92 | tok->debug = _Py_GetConfig()->parser_debug; |
93 | #endif |
94 | return tok; |
95 | } |
96 | |
97 | static char * |
98 | new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
99 | { |
100 | char* result = (char *)PyMem_Malloc(len + 1); |
101 | if (!result) { Branch (101:9): [True: 0, False: 57.1k]
|
102 | tok->done = E_NOMEM; |
103 | return NULL; |
104 | } |
105 | memcpy(result, s, len); |
106 | result[len] = '\0'; |
107 | return result; |
108 | } |
109 | |
110 | static char * |
111 | error_ret(struct tok_state *tok) /* XXX */ |
112 | { |
113 | tok->decoding_erred = 1; |
114 | if (tok->fp != NULL && tok->buf != NULL0 ) /* see _PyTokenizer_Free */ Branch (114:9): [True: 0, False: 26]
Branch (114:28): [True: 0, False: 0]
|
115 | PyMem_Free(tok->buf); |
116 | tok->buf = tok->cur = tok->inp = NULL; |
117 | tok->start = NULL; |
118 | tok->end = NULL; |
119 | tok->done = E_DECODE; |
120 | return NULL; /* as if it were EOF */ |
121 | } |
122 | |
123 | |
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are lowercased, with '_' replaced by
   '-', and compared against the known aliases.  Returns the string literal
   "utf-8" or "iso-8859-1" when an alias (optionally followed by "-suffix")
   matches; otherwise returns S itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for values representable as
               unsigned char (or EOF); plain char may be negative. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
152 | |
153 | /* Return the coding spec in S, or NULL if none is found. */ |
154 | |
155 | static int |
156 | get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) |
157 | { |
158 | Py_ssize_t i; |
159 | *spec = NULL; |
160 | /* Coding spec must be in a comment, and that comment must be |
161 | * the only statement on the source code line. */ |
162 | for (i = 0; i < size - 6; i++0 ) { Branch (162:17): [True: 5.04k, False: 70.9k]
|
163 | if (s[i] == '#') Branch (163:13): [True: 336, False: 4.70k]
|
164 | break; |
165 | if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') Branch (165:13): [True: 4.70k, False: 0]
Branch (165:28): [True: 4.70k, False: 0]
Branch (165:44): [True: 4.70k, False: 0]
|
166 | return 1; |
167 | } |
168 | for (; 71.2k i < size - 6; i++10.1k ) { /* XXX inefficient search */ Branch (168:12): [True: 10.2k, False: 71.1k]
|
169 | const char* t = s + i; |
170 | if (memcmp(t, "coding", 6) == 0) { Branch (170:13): [True: 77, False: 10.1k]
|
171 | const char* begin = NULL; |
172 | t += 6; |
173 | if (t[0] != ':' && t[0] != '='22 ) Branch (173:17): [True: 22, False: 55]
Branch (173:32): [True: 0, False: 22]
|
174 | continue; |
175 | do 77 { |
176 | t++; |
177 | } while (t[0] == ' ' || t[0] == '\t'77 ); Branch (177:22): [True: 46, False: 77]
Branch (177:37): [True: 0, False: 77]
|
178 | |
179 | begin = t; |
180 | while (Py_ISALNUM(t[0]) || |
181 | t[0] == '-'133 || t[0] == '_'77 || t[0] == '.'77 ) Branch (181:20): [True: 56, False: 77]
Branch (181:35): [True: 0, False: 77]
Branch (181:50): [True: 0, False: 77]
|
182 | t++; |
183 | |
184 | if (begin < t) { Branch (184:17): [True: 77, False: 0]
|
185 | char* r = new_string(begin, t - begin, tok); |
186 | const char* q; |
187 | if (!r) Branch (187:21): [True: 0, False: 77]
|
188 | return 0; |
189 | q = get_normal_name(r); |
190 | if (r != q) { Branch (190:21): [True: 30, False: 47]
|
191 | PyMem_Free(r); |
192 | r = new_string(q, strlen(q), tok); |
193 | if (!r) Branch (193:25): [True: 0, False: 30]
|
194 | return 0; |
195 | } |
196 | *spec = r; |
197 | break; |
198 | } |
199 | } |
200 | } |
201 | return 1; |
202 | } |
203 | |
204 | /* Check whether the line contains a coding spec. If it does, |
205 | invoke the set_readline function for the new encoding. |
206 | This function receives the tok_state and the new encoding. |
207 | Return 1 on success, 0 on failure. */ |
208 | |
209 | static int |
210 | check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, |
211 | int set_readline(struct tok_state *, const char *)) |
212 | { |
213 | char *cs; |
214 | if (tok->cont_line) { Branch (214:9): [True: 0, False: 75.9k]
|
215 | /* It's a continuation line, so it can't be a coding spec. */ |
216 | tok->decoding_state = STATE_NORMAL; |
217 | return 1; |
218 | } |
219 | if (!get_coding_spec(line, &cs, size, tok)) { Branch (219:9): [True: 0, False: 75.9k]
|
220 | return 0; |
221 | } |
222 | if (!cs) { Branch (222:9): [True: 75.9k, False: 77]
|
223 | Py_ssize_t i; |
224 | for (i = 0; i < size; i++0 ) { Branch (224:21): [True: 75.5k, False: 344]
|
225 | if (line[i] == '#' || line[i] == '\n'75.2k || line[i] == '\r'75.1k ) Branch (225:17): [True: 290, False: 75.2k]
Branch (225:35): [True: 78, False: 75.1k]
Branch (225:54): [True: 0, False: 75.1k]
|
226 | break; |
227 | if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { Branch (227:17): [True: 75.1k, False: 0]
Branch (227:35): [True: 75.1k, False: 0]
Branch (227:54): [True: 75.1k, False: 0]
|
228 | /* Stop checking coding spec after a line containing |
229 | * anything except a comment. */ |
230 | tok->decoding_state = STATE_NORMAL; |
231 | break; |
232 | } |
233 | } |
234 | return 1; |
235 | } |
236 | tok->decoding_state = STATE_NORMAL; |
237 | if (tok->encoding == NULL) { Branch (237:9): [True: 51, False: 26]
|
238 | assert(tok->decoding_readline == NULL); |
239 | if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)43 ) { Branch (239:13): [True: 43, False: 8]
Branch (239:41): [True: 0, False: 43]
|
240 | error_ret(tok); |
241 | PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); |
242 | PyMem_Free(cs); |
243 | return 0; |
244 | } |
245 | tok->encoding = cs; |
246 | } else { /* then, compare cs with BOM */ |
247 | if (strcmp(tok->encoding, cs) != 0) { Branch (247:13): [True: 20, False: 6]
|
248 | error_ret(tok); |
249 | PyErr_Format(PyExc_SyntaxError, |
250 | "encoding problem: %s with BOM", cs); |
251 | PyMem_Free(cs); |
252 | return 0; |
253 | } |
254 | PyMem_Free(cs); |
255 | } |
256 | return 1; |
257 | } |
258 | |
259 | /* See whether the file starts with a BOM. If it does, |
260 | invoke the set_readline function with the new encoding. |
261 | Return 1 on success, 0 on failure. */ |
262 | |
263 | static int |
264 | check_bom(int get_char(struct tok_state *), |
265 | void unget_char(int, struct tok_state *), |
266 | int set_readline(struct tok_state *, const char *), |
267 | struct tok_state *tok) |
268 | { |
269 | int ch1, ch2, ch3; |
270 | ch1 = get_char(tok); |
271 | tok->decoding_state = STATE_SEEK_CODING; |
272 | if (ch1 == EOF) { Branch (272:9): [True: 0, False: 75.6k]
|
273 | return 1; |
274 | } else if (ch1 == 0xEF) { Branch (274:16): [True: 37, False: 75.6k]
|
275 | ch2 = get_char(tok); |
276 | if (ch2 != 0xBB) { Branch (276:13): [True: 1, False: 36]
|
277 | unget_char(ch2, tok); |
278 | unget_char(ch1, tok); |
279 | return 1; |
280 | } |
281 | ch3 = get_char(tok); |
282 | if (ch3 != 0xBF) { Branch (282:13): [True: 2, False: 34]
|
283 | unget_char(ch3, tok); |
284 | unget_char(ch2, tok); |
285 | unget_char(ch1, tok); |
286 | return 1; |
287 | } |
288 | } else { |
289 | unget_char(ch1, tok); |
290 | return 1; |
291 | } |
292 | if (tok->encoding != NULL) Branch (292:9): [True: 0, False: 34]
|
293 | PyMem_Free(tok->encoding); |
294 | tok->encoding = new_string("utf-8", 5, tok); |
295 | if (!tok->encoding) Branch (295:9): [True: 0, False: 34]
|
296 | return 0; |
297 | /* No need to set_readline: input is already utf-8 */ |
298 | return 1; |
299 | } |
300 | |
301 | static int |
302 | tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { |
303 | assert(tok->fp_interactive); |
304 |
|
305 | if (!line) { Branch (305:9): [True: 0, False: 0]
|
306 | return 0; |
307 | } |
308 | |
309 | Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; |
310 | Py_ssize_t line_size = strlen(line); |
311 | char last_char = line[line_size > 0 ? line_size - 1 : line_size]; Branch (311:27): [True: 0, False: 0]
|
312 | if (last_char != '\n') { Branch (312:9): [True: 0, False: 0]
|
313 | line_size += 1; |
314 | } |
315 | char* new_str = tok->interactive_src_start; |
316 |
|
317 | new_str = PyMem_Realloc(new_str, current_size + line_size + 1); |
318 | if (!new_str) { Branch (318:9): [True: 0, False: 0]
|
319 | if (tok->interactive_src_start) { Branch (319:13): [True: 0, False: 0]
|
320 | PyMem_Free(tok->interactive_src_start); |
321 | } |
322 | tok->interactive_src_start = NULL; |
323 | tok->interactive_src_end = NULL; |
324 | tok->done = E_NOMEM; |
325 | return -1; |
326 | } |
327 | strcpy(new_str + current_size, line); |
328 | if (last_char != '\n') { Branch (328:9): [True: 0, False: 0]
|
329 | /* Last line does not end in \n, fake one */ |
330 | new_str[current_size + line_size - 1] = '\n'; |
331 | new_str[current_size + line_size] = '\0'; |
332 | } |
333 | tok->interactive_src_start = new_str; |
334 | tok->interactive_src_end = new_str + current_size + line_size; |
335 | return 0; |
336 | } |
337 | |
338 | |
339 | /* Read a line of text from TOK into S, using the stream in TOK. |
340 | Return NULL on failure, else S. |
341 | |
342 | On entry, tok->decoding_buffer will be one of: |
343 | 1) NULL: need to call tok->decoding_readline to get a new line |
344 | 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and |
345 | stored the result in tok->decoding_buffer |
346 | 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room |
347 | (in the s buffer) to copy entire contents of the line read |
348 | by tok->decoding_readline. tok->decoding_buffer has the overflow. |
349 | In this case, tok_readline_recode is called in a loop (with an expanded buffer) |
350 | until the buffer ends with a '\n' (or until the end of the file is |
351 | reached): see tok_nextc and its calls to tok_reserve_buf. |
352 | */ |
353 | |
354 | static int |
355 | tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) |
356 | { |
357 | Py_ssize_t cur = tok->cur - tok->buf; |
358 | Py_ssize_t oldsize = tok->inp - tok->buf; |
359 | Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); |
360 | if (newsize > tok->end - tok->buf) { Branch (360:9): [True: 9, False: 801]
|
361 | char *newbuf = tok->buf; |
362 | Py_ssize_t start = tok->start == NULL ? -11 : tok->start - tok->buf8 ; Branch (362:28): [True: 1, False: 8]
|
363 | Py_ssize_t line_start = tok->start == NULL ? -11 : tok->line_start - tok->buf8 ; Branch (363:33): [True: 1, False: 8]
|
364 | Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; |
365 | newbuf = (char *)PyMem_Realloc(newbuf, newsize); |
366 | if (newbuf == NULL) { Branch (366:13): [True: 0, False: 9]
|
367 | tok->done = E_NOMEM; |
368 | return 0; |
369 | } |
370 | tok->buf = newbuf; |
371 | tok->cur = tok->buf + cur; |
372 | tok->inp = tok->buf + oldsize; |
373 | tok->end = tok->buf + newsize; |
374 | tok->start = start < 0 ? NULL : tok->buf + start8 ; Branch (374:22): [True: 1, False: 8]
|
375 | tok->line_start = line_start < 0 ? NULL : tok->buf + line_start8 ; Branch (375:27): [True: 1, False: 8]
|
376 | tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start; Branch (376:33): [True: 0, False: 9]
|
377 | } |
378 | return 1; |
379 | } |
380 | |
381 | static int |
382 | tok_readline_recode(struct tok_state *tok) { |
383 | PyObject *line; |
384 | const char *buf; |
385 | Py_ssize_t buflen; |
386 | line = tok->decoding_buffer; |
387 | if (line == NULL) { Branch (387:9): [True: 0, False: 0]
|
388 | line = PyObject_CallNoArgs(tok->decoding_readline); |
389 | if (line == NULL) { Branch (389:13): [True: 0, False: 0]
|
390 | error_ret(tok); |
391 | goto error; |
392 | } |
393 | } |
394 | else { |
395 | tok->decoding_buffer = NULL; |
396 | } |
397 | buf = PyUnicode_AsUTF8AndSize(line, &buflen); |
398 | if (buf == NULL) { Branch (398:9): [True: 0, False: 0]
|
399 | error_ret(tok); |
400 | goto error; |
401 | } |
402 | if (!tok_reserve_buf(tok, buflen + 1)) { Branch (402:9): [True: 0, False: 0]
|
403 | goto error; |
404 | } |
405 | memcpy(tok->inp, buf, buflen); |
406 | tok->inp += buflen; |
407 | *tok->inp = '\0'; |
408 | if (tok->fp_interactive && Branch (408:9): [True: 0, False: 0]
|
409 | tok_concatenate_interactive_new_line(tok, buf) == -1) { Branch (409:9): [True: 0, False: 0]
|
410 | goto error; |
411 | } |
412 | Py_DECREF(line); |
413 | return 1; |
414 | error: |
415 | Py_XDECREF(line); |
416 | return 0; |
417 | } |
418 | |
419 | /* Set the readline function for TOK to a StreamReader's |
420 | readline function. The StreamReader is named ENC. |
421 | |
422 | This function is called from check_bom and check_coding_spec. |
423 | |
424 | ENC is usually identical to the future value of tok->encoding, |
425 | except for the (currently unsupported) case of UTF-16. |
426 | |
427 | Return 1 on success, 0 on failure. */ |
428 | |
429 | static int |
430 | fp_setreadl(struct tok_state *tok, const char* enc) |
431 | { |
432 | PyObject *readline, *open, *stream; |
433 | int fd; |
434 | long pos; |
435 |
|
436 | fd = fileno(tok->fp); |
437 | /* Due to buffering the file offset for fd can be different from the file |
438 | * position of tok->fp. If tok->fp was opened in text mode on Windows, |
439 | * its file position counts CRLF as one char and can't be directly mapped |
440 | * to the file offset for fd. Instead we step back one byte and read to |
441 | * the end of line.*/ |
442 | pos = ftell(tok->fp); |
443 | if (pos == -1 || Branch (443:9): [True: 0, False: 0]
|
444 | lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { Branch (444:9): [True: 0, False: 0]
Branch (444:27): [True: 0, False: 0]
|
445 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); |
446 | return 0; |
447 | } |
448 | |
449 | open = _PyImport_GetModuleAttrString("io", "open"); |
450 | if (open == NULL) { Branch (450:9): [True: 0, False: 0]
|
451 | return 0; |
452 | } |
453 | stream = PyObject_CallFunction(open, "isisOOO", |
454 | fd, "r", -1, enc, Py_None, Py_None, Py_False); |
455 | Py_DECREF(open); |
456 | if (stream == NULL) { Branch (456:9): [True: 0, False: 0]
|
457 | return 0; |
458 | } |
459 | |
460 | readline = PyObject_GetAttr(stream, &_Py_ID(readline)); |
461 | Py_DECREF(stream); |
462 | if (readline == NULL) { Branch (462:9): [True: 0, False: 0]
|
463 | return 0; |
464 | } |
465 | Py_XSETREF(tok->decoding_readline, readline); |
466 |
|
467 | if (pos > 0) { Branch (467:9): [True: 0, False: 0]
|
468 | PyObject *bufobj = _PyObject_CallNoArgs(readline); |
469 | if (bufobj == NULL) { Branch (469:13): [True: 0, False: 0]
|
470 | return 0; |
471 | } |
472 | Py_DECREF(bufobj); |
473 | } |
474 | |
475 | return 1; |
476 | } |
477 | |
478 | /* Fetch the next byte from TOK. */ |
479 | |
480 | static int fp_getc(struct tok_state *tok) { |
481 | return getc(tok->fp); |
482 | } |
483 | |
484 | /* Unfetch the last byte back into TOK. */ |
485 | |
486 | static void fp_ungetc(int c, struct tok_state *tok) { |
487 | ungetc(c, tok->fp); |
488 | } |
489 | |
/* Check whether the bytes at S start a structurally valid UTF-8 sequence:
   a lead byte followed by the number of continuation bytes (0x80..0xBF)
   that the lead byte announces.  Return the number of bytes forming the
   sequence if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the NUL
   and no byte after the terminator is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* A NUL terminator is < 0x80 and stops the scan here. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
517 | |
518 | static int |
519 | ensure_utf8(char *line, struct tok_state *tok) |
520 | { |
521 | int badchar = 0; |
522 | unsigned char *c; |
523 | int length; |
524 | for (c = (unsigned char *)line; *c; c += length18.2k ) { Branch (524:37): [True: 18.2k, False: 806]
|
525 | if (!(length = valid_utf8(c))) { Branch (525:13): [True: 0, False: 18.2k]
|
526 | badchar = *c; |
527 | break; |
528 | } |
529 | } |
530 | if (badchar) { Branch (530:9): [True: 0, False: 806]
|
531 | /* Need to add 1 to the line number, since this line |
532 | has not been counted, yet. */ |
533 | PyErr_Format(PyExc_SyntaxError, |
534 | "Non-UTF-8 code starting with '\\x%.2x' " |
535 | "in file %U on line %i, " |
536 | "but no encoding declared; " |
537 | "see https://peps.python.org/pep-0263/ for details", |
538 | badchar, tok->filename, tok->lineno + 1); |
539 | return 0; |
540 | } |
541 | return 1; |
542 | } |
543 | |
544 | /* Fetch a byte from TOK, using the string buffer. */ |
545 | |
546 | static int |
547 | buf_getc(struct tok_state *tok) { |
548 | return Py_CHARMASK(*tok->str++); |
549 | } |
550 | |
551 | /* Unfetch a byte from TOK, using the string buffer. */ |
552 | |
553 | static void |
554 | buf_ungetc(int c, struct tok_state *tok) { |
555 | tok->str--; |
556 | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
557 | } |
558 | |
559 | /* Set the readline function for TOK to ENC. For the string-based |
560 | tokenizer, this means to just record the encoding. */ |
561 | |
562 | static int |
563 | buf_setreadl(struct tok_state *tok, const char* enc) { |
564 | tok->enc = enc; |
565 | return 1; |
566 | } |
567 | |
568 | /* Return a UTF-8 encoding Python string object from the |
569 | C byte string STR, which is encoded with ENC. */ |
570 | |
571 | static PyObject * |
572 | translate_into_utf8(const char* str, const char* enc) { |
573 | PyObject *utf8; |
574 | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
575 | if (buf == NULL) Branch (575:9): [True: 6, False: 37]
|
576 | return NULL; |
577 | utf8 = PyUnicode_AsUTF8String(buf); |
578 | Py_DECREF(buf); |
579 | return utf8; |
580 | } |
581 | |
582 | |
583 | static char * |
584 | translate_newlines(const char *s, int exec_input, struct tok_state *tok) { |
585 | int skip_next_lf = 0; |
586 | size_t needed_length = strlen(s) + 2, final_length; |
587 | char *buf, *current; |
588 | char c = '\0'; |
589 | buf = PyMem_Malloc(needed_length); |
590 | if (buf == NULL) { Branch (590:9): [True: 0, False: 132k]
|
591 | tok->done = E_NOMEM; |
592 | return NULL; |
593 | } |
594 | for (current = buf; 132k *s; s++, current++27.2M ) { Branch (594:25): [True: 27.2M, False: 132k]
|
595 | c = *s; |
596 | if (skip_next_lf) { Branch (596:13): [True: 40, False: 27.2M]
|
597 | skip_next_lf = 0; |
598 | if (c == '\n') { Branch (598:17): [True: 23, False: 17]
|
599 | c = *++s; |
600 | if (!c) Branch (600:21): [True: 7, False: 16]
|
601 | break; |
602 | } |
603 | } |
604 | if (c == '\r') { Branch (604:13): [True: 47, False: 27.2M]
|
605 | skip_next_lf = 1; |
606 | c = '\n'; |
607 | } |
608 | *current = c; |
609 | } |
610 | /* If this is exec input, add a newline to the end of the string if |
611 | there isn't one already. */ |
612 | if (exec_input && c != '\n'90.7k ) { Branch (612:9): [True: 90.7k, False: 41.6k]
Branch (612:23): [True: 86.4k, False: 4.29k]
|
613 | *current = '\n'; |
614 | current++; |
615 | } |
616 | *current = '\0'; |
617 | final_length = current - buf + 1; |
618 | if (final_length < needed_length && final_length45.9k ) { Branch (618:9): [True: 45.9k, False: 86.4k]
Branch (618:41): [True: 45.9k, False: 0]
|
619 | /* should never fail */ |
620 | char* result = PyMem_Realloc(buf, final_length); |
621 | if (result == NULL) { Branch (621:13): [True: 0, False: 45.9k]
|
622 | PyMem_Free(buf); |
623 | } |
624 | buf = result; |
625 | } |
626 | return buf; |
627 | } |
628 | |
629 | /* Decode a byte string STR for use as the buffer of TOK. |
630 | Look for encoding declarations inside STR, and record them |
631 | inside TOK. */ |
632 | |
633 | static char * |
634 | decode_str(const char *input, int single, struct tok_state *tok) |
635 | { |
636 | PyObject* utf8 = NULL; |
637 | char *str; |
638 | const char *s; |
639 | const char *newl[2] = {NULL, NULL}; |
640 | int lineno = 0; |
641 | tok->input = str = translate_newlines(input, single, tok); |
642 | if (str == NULL) Branch (642:9): [True: 0, False: 75.4k]
|
643 | return NULL; |
644 | tok->enc = NULL; |
645 | tok->str = str; |
646 | if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) Branch (646:9): [True: 0, False: 75.4k]
|
647 | return error_ret(tok); |
648 | str = tok->str; /* string after BOM if any */ |
649 | assert(str); |
650 | if (tok->enc != NULL) { Branch (650:9): [True: 0, False: 75.4k]
|
651 | utf8 = translate_into_utf8(str, tok->enc); |
652 | if (utf8 == NULL) Branch (652:13): [True: 0, False: 0]
|
653 | return error_ret(tok); |
654 | str = PyBytes_AsString(utf8); |
655 | } |
656 | for (s = str;; 75.4k s++846k ) { |
657 | if (*s == '\0') break74.2k ; Branch (657:13): [True: 74.2k, False: 847k]
|
658 | else if (*s == '\n') { Branch (658:18): [True: 76.5k, False: 770k]
|
659 | assert(lineno < 2); |
660 | newl[lineno] = s; |
661 | lineno++; |
662 | if (lineno == 2) break1.18k ; Branch (662:17): [True: 1.18k, False: 75.4k]
|
663 | } |
664 | } |
665 | tok->enc = NULL; |
666 | /* need to check line 1 and 2 separately since check_coding_spec |
667 | assumes a single line as input */ |
668 | if (newl[0]) { Branch (668:9): [True: 75.4k, False: 13]
|
669 | if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { Branch (669:13): [True: 20, False: 75.3k]
|
670 | return NULL; |
671 | } |
672 | if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL75.3k && newl[1]481 ) { Branch (672:13): [True: 75.3k, False: 36]
Branch (672:33): [True: 481, False: 74.8k]
Branch (672:72): [True: 224, False: 257]
|
673 | if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], Branch (673:17): [True: 0, False: 224]
|
674 | tok, buf_setreadl)) |
675 | return NULL; |
676 | } |
677 | } |
678 | if (tok->enc != NULL) { Branch (678:9): [True: 43, False: 75.3k]
|
679 | assert(utf8 == NULL); |
680 | utf8 = translate_into_utf8(str, tok->enc); |
681 | if (utf8 == NULL) Branch (681:13): [True: 6, False: 37]
|
682 | return error_ret(tok); |
683 | str = PyBytes_AS_STRING(utf8); |
684 | } |
685 | assert(tok->decoding_buffer == NULL); |
686 | tok->decoding_buffer = utf8; /* CAUTION */ |
687 | return str; |
688 | } |
689 | |
690 | /* Set up tokenizer for string */ |
691 | |
692 | struct tok_state * |
693 | _PyTokenizer_FromString(const char *str, int exec_input) |
694 | { |
695 | struct tok_state *tok = tok_new(); |
696 | char *decoded; |
697 | |
698 | if (tok == NULL) Branch (698:9): [True: 0, False: 75.4k]
|
699 | return NULL; |
700 | decoded = decode_str(str, exec_input, tok); |
701 | if (decoded == NULL) { Branch (701:9): [True: 26, False: 75.4k]
|
702 | _PyTokenizer_Free(tok); |
703 | return NULL; |
704 | } |
705 | |
706 | tok->buf = tok->cur = tok->inp = decoded; |
707 | tok->end = decoded; |
708 | return tok; |
709 | } |
710 | |
711 | /* Set up tokenizer for UTF-8 string */ |
712 | |
713 | struct tok_state * |
714 | _PyTokenizer_FromUTF8(const char *str, int exec_input) |
715 | { |
716 | struct tok_state *tok = tok_new(); |
717 | char *translated; |
718 | if (tok == NULL) Branch (718:9): [True: 0, False: 56.9k]
|
719 | return NULL; |
720 | tok->input = translated = translate_newlines(str, exec_input, tok); |
721 | if (translated == NULL) { Branch (721:9): [True: 0, False: 56.9k]
|
722 | _PyTokenizer_Free(tok); |
723 | return NULL; |
724 | } |
725 | tok->decoding_state = STATE_NORMAL; |
726 | tok->enc = NULL; |
727 | tok->str = translated; |
728 | tok->encoding = new_string("utf-8", 5, tok); |
729 | if (!tok->encoding) { Branch (729:9): [True: 0, False: 56.9k]
|
730 | _PyTokenizer_Free(tok); |
731 | return NULL; |
732 | } |
733 | |
734 | tok->buf = tok->cur = tok->inp = translated; |
735 | tok->end = translated; |
736 | return tok; |
737 | } |
738 | |
739 | /* Set up tokenizer for file */ |
740 | |
741 | struct tok_state * |
742 | _PyTokenizer_FromFile(FILE *fp, const char* enc, |
743 | const char *ps1, const char *ps2) |
744 | { |
745 | struct tok_state *tok = tok_new(); |
746 | if (tok == NULL) Branch (746:9): [True: 0, False: 257]
|
747 | return NULL; |
748 | if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { Branch (748:9): [True: 0, False: 257]
|
749 | _PyTokenizer_Free(tok); |
750 | return NULL; |
751 | } |
752 | tok->cur = tok->inp = tok->buf; |
753 | tok->end = tok->buf + BUFSIZ; |
754 | tok->fp = fp; |
755 | tok->prompt = ps1; |
756 | tok->nextprompt = ps2; |
757 | if (enc != NULL) { Branch (757:9): [True: 0, False: 257]
|
758 | /* Must copy encoding declaration since it |
759 | gets copied into the parse tree. */ |
760 | tok->encoding = new_string(enc, strlen(enc), tok); |
761 | if (!tok->encoding) { Branch (761:13): [True: 0, False: 0]
|
762 | _PyTokenizer_Free(tok); |
763 | return NULL; |
764 | } |
765 | tok->decoding_state = STATE_NORMAL; |
766 | } |
767 | return tok; |
768 | } |
769 | |
770 | /* Free a tok_state structure */ |
771 | |
772 | void |
773 | _PyTokenizer_Free(struct tok_state *tok) |
774 | { |
775 | if (tok->encoding != NULL) { Branch (775:9): [True: 57.0k, False: 75.5k]
|
776 | PyMem_Free(tok->encoding); |
777 | } |
778 | Py_XDECREF(tok->decoding_readline); |
779 | Py_XDECREF(tok->decoding_buffer); |
780 | Py_XDECREF(tok->filename); |
781 | if (tok->fp != NULL && tok->buf != NULL257 ) { Branch (781:9): [True: 257, False: 132k]
Branch (781:28): [True: 257, False: 0]
|
782 | PyMem_Free(tok->buf); |
783 | } |
784 | if (tok->input) { Branch (784:9): [True: 132k, False: 257]
|
785 | PyMem_Free(tok->input); |
786 | } |
787 | if (tok->interactive_src_start != NULL) { Branch (787:9): [True: 0, False: 132k]
|
788 | PyMem_Free(tok->interactive_src_start); |
789 | } |
790 | PyMem_Free(tok); |
791 | } |
792 | |
793 | static int |
794 | tok_readline_raw(struct tok_state *tok) |
795 | { |
796 | do { |
797 | if (!tok_reserve_buf(tok, BUFSIZ)) { Branch (797:13): [True: 0, False: 810]
|
798 | return 0; |
799 | } |
800 | char *line = Py_UniversalNewlineFgets(tok->inp, |
801 | (int)(tok->end - tok->inp), |
802 | tok->fp, NULL); |
803 | if (line == NULL) { Branch (803:13): [True: 4, False: 806]
|
804 | return 1; |
805 | } |
806 | if (tok->fp_interactive && Branch (806:13): [True: 0, False: 806]
|
807 | tok_concatenate_interactive_new_line(tok, line) == -10 ) { Branch (807:13): [True: 0, False: 0]
|
808 | return 0; |
809 | } |
810 | tok->inp = strchr(tok->inp, '\0'); |
811 | if (tok->inp == tok->buf) { Branch (811:13): [True: 0, False: 806]
|
812 | return 0; |
813 | } |
814 | } while (tok->inp[-1] != '\n'); Branch (814:14): [True: 1, False: 805]
|
815 | return 1; |
816 | } |
817 | |
818 | static int |
819 | tok_underflow_string(struct tok_state *tok) { |
820 | char *end = strchr(tok->inp, '\n'); |
821 | if (end != NULL) { Branch (821:9): [True: 3.37M, False: 92.9k]
|
822 | end++; |
823 | } |
824 | else { |
825 | end = strchr(tok->inp, '\0'); |
826 | if (end == tok->inp) { Branch (826:13): [True: 55.7k, False: 37.1k]
|
827 | tok->done = E_EOF; |
828 | return 0; |
829 | } |
830 | } |
831 | if (tok->start == NULL) { Branch (831:9): [True: 3.34M, False: 64.1k]
|
832 | tok->buf = tok->cur; |
833 | } |
834 | tok->line_start = tok->cur; |
835 | tok->lineno++; |
836 | tok->inp = end; |
837 | return 1; |
838 | } |
839 | |
840 | static int |
841 | tok_underflow_interactive(struct tok_state *tok) { |
842 | if (tok->interactive_underflow == IUNDERFLOW_STOP) { Branch (842:9): [True: 0, False: 0]
|
843 | tok->done = E_INTERACT_STOP; |
844 | return 1; |
845 | } |
846 | char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); Branch (846:34): [True: 0, False: 0]
|
847 | if (newtok != NULL) { Branch (847:9): [True: 0, False: 0]
|
848 | char *translated = translate_newlines(newtok, 0, tok); |
849 | PyMem_Free(newtok); |
850 | if (translated == NULL) { Branch (850:13): [True: 0, False: 0]
|
851 | return 0; |
852 | } |
853 | newtok = translated; |
854 | } |
855 | if (tok->encoding && newtok && *newtok) { Branch (855:9): [True: 0, False: 0]
Branch (855:26): [True: 0, False: 0]
Branch (855:36): [True: 0, False: 0]
|
856 | /* Recode to UTF-8 */ |
857 | Py_ssize_t buflen; |
858 | const char* buf; |
859 | PyObject *u = translate_into_utf8(newtok, tok->encoding); |
860 | PyMem_Free(newtok); |
861 | if (u == NULL) { Branch (861:13): [True: 0, False: 0]
|
862 | tok->done = E_DECODE; |
863 | return 0; |
864 | } |
865 | buflen = PyBytes_GET_SIZE(u); |
866 | buf = PyBytes_AS_STRING(u); |
867 | newtok = PyMem_Malloc(buflen+1); |
868 | if (newtok == NULL) { Branch (868:13): [True: 0, False: 0]
|
869 | Py_DECREF(u); |
870 | tok->done = E_NOMEM; |
871 | return 0; |
872 | } |
873 | strcpy(newtok, buf); |
874 | Py_DECREF(u); |
875 | } |
876 | if (tok->fp_interactive && Branch (876:9): [True: 0, False: 0]
|
877 | tok_concatenate_interactive_new_line(tok, newtok) == -1) { Branch (877:9): [True: 0, False: 0]
|
878 | PyMem_Free(newtok); |
879 | return 0; |
880 | } |
881 | if (tok->nextprompt != NULL) { Branch (881:9): [True: 0, False: 0]
|
882 | tok->prompt = tok->nextprompt; |
883 | } |
884 | if (newtok == NULL) { Branch (884:9): [True: 0, False: 0]
|
885 | tok->done = E_INTR; |
886 | } |
887 | else if (*newtok == '\0') { Branch (887:14): [True: 0, False: 0]
|
888 | PyMem_Free(newtok); |
889 | tok->done = E_EOF; |
890 | } |
891 | else if (tok->start != NULL) { Branch (891:14): [True: 0, False: 0]
|
892 | Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; |
893 | size_t size = strlen(newtok); |
894 | tok->lineno++; |
895 | if (!tok_reserve_buf(tok, size + 1)) { Branch (895:13): [True: 0, False: 0]
|
896 | PyMem_Free(tok->buf); |
897 | tok->buf = NULL; |
898 | PyMem_Free(newtok); |
899 | return 0; |
900 | } |
901 | memcpy(tok->cur, newtok, size + 1); |
902 | PyMem_Free(newtok); |
903 | tok->inp += size; |
904 | tok->multi_line_start = tok->buf + cur_multi_line_start; |
905 | } |
906 | else { |
907 | tok->lineno++; |
908 | PyMem_Free(tok->buf); |
909 | tok->buf = newtok; |
910 | tok->cur = tok->buf; |
911 | tok->line_start = tok->buf; |
912 | tok->inp = strchr(tok->buf, '\0'); |
913 | tok->end = tok->inp + 1; |
914 | } |
915 | if (tok->done != E_OK) { Branch (915:9): [True: 0, False: 0]
|
916 | if (tok->prompt != NULL) { Branch (916:13): [True: 0, False: 0]
|
917 | PySys_WriteStderr("\n"); |
918 | } |
919 | return 0; |
920 | } |
921 | return 1; |
922 | } |
923 | |
924 | static int |
925 | tok_underflow_file(struct tok_state *tok) { |
926 | if (tok->start == NULL) { Branch (926:9): [True: 801, False: 8]
|
927 | tok->cur = tok->inp = tok->buf; |
928 | } |
929 | if (tok->decoding_state == STATE_INIT) { Branch (929:9): [True: 257, False: 552]
|
930 | /* We have not yet determined the encoding. |
931 | If an encoding is found, use the file-pointer |
932 | reader functions from now on. */ |
933 | if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) { Branch (933:13): [True: 0, False: 257]
|
934 | error_ret(tok); |
935 | return 0; |
936 | } |
937 | assert(tok->decoding_state != STATE_INIT); |
938 | } |
939 | /* Read until '\n' or EOF */ |
940 | if (tok->decoding_readline != NULL) { Branch (940:9): [True: 0, False: 809]
|
941 | /* We already have a codec associated with this input. */ |
942 | if (!tok_readline_recode(tok)) { Branch (942:13): [True: 0, False: 0]
|
943 | return 0; |
944 | } |
945 | } |
946 | else { |
947 | /* We want a 'raw' read. */ |
948 | if (!tok_readline_raw(tok)) { Branch (948:13): [True: 0, False: 809]
|
949 | return 0; |
950 | } |
951 | } |
952 | if (tok->inp == tok->cur) { Branch (952:9): [True: 3, False: 806]
|
953 | tok->done = E_EOF; |
954 | return 0; |
955 | } |
956 | if (tok->inp[-1] != '\n') { Branch (956:9): [True: 1, False: 805]
|
957 | /* Last line does not end in \n, fake one */ |
958 | *tok->inp++ = '\n'; |
959 | *tok->inp = '\0'; |
960 | } |
961 | |
962 | tok->lineno++; |
963 | if (tok->decoding_state != STATE_NORMAL) { Branch (963:9): [True: 425, False: 381]
|
964 | if (tok->lineno > 2) { Branch (964:13): [True: 84, False: 341]
|
965 | tok->decoding_state = STATE_NORMAL; |
966 | } |
967 | else if (!check_coding_spec(tok->cur, strlen(tok->cur), Branch (967:18): [True: 0, False: 341]
|
968 | tok, fp_setreadl)) |
969 | { |
970 | return 0; |
971 | } |
972 | } |
973 | /* The default encoding is UTF-8, so make sure we don't have any |
974 | non-UTF-8 sequences in it. */ |
975 | if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { Branch (975:9): [True: 806, False: 0]
Branch (975:27): [True: 0, False: 806]
|
976 | error_ret(tok); |
977 | return 0; |
978 | } |
979 | assert(tok->done == E_OK); |
980 | return tok->done == E_OK; |
981 | } |
982 | |
#if defined(Py_DEBUG)
/* Debug helper: write the first `size` bytes of `s` to `f` as a
   double-quoted string with C-style escapes, or the bare word NULL when
   `s` is NULL.
   Newline, carriage return, tab, form feed, both quote characters and
   the backslash get two-character escapes.  Bytes from 0x20 to 0x7e are
   written as-is.  Every other byte, including 0x7f (DEL), is written
   as \xNN so the output contains no raw control characters. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            /* Escape the escape character itself; otherwise a literal
               backslash followed by 'n' would print exactly like a
               newline. */
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f is DEL, a control character, so the printable
                   range stops at 0x7e. */
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
1011 | |
1012 | /* Get next char, updating state; error code goes into tok->done */ |
1013 | |
1014 | static int |
1015 | tok_nextc(struct tok_state *tok) |
1016 | { |
1017 | int rc; |
1018 | for (;;) { |
1019 | if (tok->cur != tok->inp) { Branch (1019:13): [True: 47.0M, False: 3.57M]
|
1020 | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
1021 | } |
1022 | if (tok->done != E_OK) { Branch (1022:13): [True: 107k, False: 3.46M]
|
1023 | return EOF; |
1024 | } |
1025 | if (tok->fp == NULL) { Branch (1025:13): [True: 3.46M, False: 809]
|
1026 | rc = tok_underflow_string(tok); |
1027 | } |
1028 | else if (tok->prompt != NULL) { Branch (1028:18): [True: 0, False: 809]
|
1029 | rc = tok_underflow_interactive(tok); |
1030 | } |
1031 | else { |
1032 | rc = tok_underflow_file(tok); |
1033 | } |
1034 | #if defined(Py_DEBUG) |
1035 | if (tok->debug) { |
1036 | fprintf(stderr, "line[%d] = ", tok->lineno); |
1037 | print_escape(stderr, tok->cur, tok->inp - tok->cur); |
1038 | fprintf(stderr, " tok->done = %d\n", tok->done); |
1039 | } |
1040 | #endif |
1041 | if (!rc) { Branch (1041:13): [True: 55.7k, False: 3.41M]
|
1042 | tok->cur = tok->inp; |
1043 | return EOF; |
1044 | } |
1045 | tok->line_start = tok->cur; |
1046 | } |
1047 | Py_UNREACHABLE0 (); |
1048 | } |
1049 | |
1050 | /* Back-up one character */ |
1051 | |
1052 | static void |
1053 | tok_backup(struct tok_state *tok, int c) |
1054 | { |
1055 | if (c != EOF) { Branch (1055:9): [True: 19.9M, False: 107k]
|
1056 | if (--tok->cur < tok->buf) { Branch (1056:13): [True: 0, False: 19.9M]
|
1057 | Py_FatalError("tokenizer beginning of buffer"); |
1058 | } |
1059 | if ((int)(unsigned char)*tok->cur != c) { Branch (1059:13): [True: 0, False: 19.9M]
|
1060 | Py_FatalError("tok_backup: wrong character"); |
1061 | } |
1062 | } |
1063 | } |
1064 | |
1065 | static int |
1066 | _syntaxerror_range(struct tok_state *tok, const char *format, |
1067 | int col_offset, int end_col_offset, |
1068 | va_list vargs) |
1069 | { |
1070 | PyObject *errmsg, *errtext, *args; |
1071 | errmsg = PyUnicode_FromFormatV(format, vargs); |
1072 | if (!errmsg) { Branch (1072:9): [True: 0, False: 228]
|
1073 | goto error; |
1074 | } |
1075 | |
1076 | errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, |
1077 | "replace"); |
1078 | if (!errtext) { Branch (1078:9): [True: 0, False: 228]
|
1079 | goto error; |
1080 | } |
1081 | |
1082 | if (col_offset == -1) { Branch (1082:9): [True: 215, False: 13]
|
1083 | col_offset = (int)PyUnicode_GET_LENGTH(errtext); |
1084 | } |
1085 | if (end_col_offset == -1) { Branch (1085:9): [True: 215, False: 13]
|
1086 | end_col_offset = col_offset; |
1087 | } |
1088 | |
1089 | Py_ssize_t line_len = strcspn(tok->line_start, "\n"); |
1090 | if (line_len != tok->cur - tok->line_start) { Branch (1090:9): [True: 153, False: 75]
|
1091 | Py_DECREF(errtext); |
1092 | errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, |
1093 | "replace"); |
1094 | } |
1095 | if (!errtext) { Branch (1095:9): [True: 0, False: 228]
|
1096 | goto error; |
1097 | } |
1098 | |
1099 | args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, |
1100 | col_offset, errtext, tok->lineno, end_col_offset); |
1101 | if (args) { Branch (1101:9): [True: 228, False: 0]
|
1102 | PyErr_SetObject(PyExc_SyntaxError, args); |
1103 | Py_DECREF(args); |
1104 | } |
1105 | |
1106 | error: |
1107 | Py_XDECREF(errmsg); |
1108 | tok->done = E_ERROR; |
1109 | return ERRORTOKEN; |
1110 | } |
1111 | |
/* Report a SyntaxError at the current position.  The message is built
   from `format` and the variadic arguments; both column offsets are
   passed as -1 so that _syntaxerror_range() derives them from tok->cur.
   Returns whatever _syntaxerror_range() returns. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);
    return token;
}
1121 | |
/* Report a SyntaxError covering an explicit column range.  Same as
   syntaxerror(), except that `col_offset` and `end_col_offset` are
   forwarded to _syntaxerror_range() instead of being derived from the
   cursor.  Returns whatever _syntaxerror_range() returns. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);
    return token;
}
1133 | |
1134 | |
1135 | |
1136 | static int |
1137 | indenterror(struct tok_state *tok) |
1138 | { |
1139 | tok->done = E_TABSPACE; |
1140 | tok->cur = tok->inp; |
1141 | return ERRORTOKEN; |
1142 | } |
1143 | |
1144 | static int |
1145 | parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) |
1146 | { |
1147 | PyObject *errmsg; |
1148 | va_list vargs; |
1149 | va_start(vargs, format); |
1150 | errmsg = PyUnicode_FromFormatV(format, vargs); |
1151 | va_end(vargs); |
1152 | if (!errmsg) { Branch (1152:9): [True: 0, False: 130]
|
1153 | goto error; |
1154 | } |
1155 | |
1156 | if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, Branch (1156:9): [True: 64, False: 66]
|
1157 | tok->lineno, NULL, NULL) < 0) { |
1158 | if (PyErr_ExceptionMatches(category)) { Branch (1158:13): [True: 64, False: 0]
|
1159 | /* Replace the DeprecationWarning exception with a SyntaxError |
1160 | to get a more accurate error report */ |
1161 | PyErr_Clear(); |
1162 | syntaxerror(tok, "%U", errmsg); |
1163 | } |
1164 | goto error; |
1165 | } |
1166 | Py_DECREF(errmsg); |
1167 | return 0; |
1168 | |
1169 | error: |
1170 | Py_XDECREF(errmsg); |
1171 | tok->done = E_ERROR; |
1172 | return -1; |
1173 | } |
1174 | |
/* Peek at the upcoming input without consuming it.
   Returns 1 when the next characters are exactly `test` and the character
   after them is not a potential identifier character, otherwise 0.
   Every character read here is pushed back before returning. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it keeps matching `test`. */
    while (*p != 0 && c == *p) {
        p++;
        c = tok_nextc(tok);
    }

    /* A full match only counts when the word ends here. */
    int matched = 0;
    if (*p == 0) {
        matched = !is_potential_identifier_char(c);
    }

    /* Undo the reads in reverse order: the last character first, then
       the matched prefix of `test`. */
    tok_backup(tok, c);
    while (p != test) {
        p--;
        tok_backup(tok, *p);
    }
    return matched;
}
1197 | |
1198 | static int |
1199 | verify_end_of_number(struct tok_state *tok, int c, const char *kind) |
1200 | { |
1201 | /* Emit a deprecation warning only if the numeric literal is immediately |
1202 | * followed by one of keywords which can occur after a numeric literal |
1203 | * in valid code: "and", "else", "for", "if", "in", "is" and "or". |
1204 | * It allows to gradually deprecate existing valid code without adding |
1205 | * warning before error in most cases of invalid numeric literal (which |
1206 | * would be confusing and break existing tests). |
1207 | * Raise a syntax error with slightly better message than plain |
1208 | * "invalid syntax" if the numeric literal is immediately followed by |
1209 | * other keyword or identifier. |
1210 | */ |
1211 | int r = 0; |
1212 | if (c == 'a') { Branch (1212:9): [True: 14, False: 646k]
|
1213 | r = lookahead(tok, "nd"); |
1214 | } |
1215 | else if (c == 'e') { Branch (1215:14): [True: 24, False: 646k]
|
1216 | r = lookahead(tok, "lse"); |
1217 | } |
1218 | else if (c == 'f') { Branch (1218:14): [True: 14, False: 646k]
|
1219 | r = lookahead(tok, "or"); |
1220 | } |
1221 | else if (c == 'i') { Branch (1221:14): [True: 48, False: 646k]
|
1222 | int c2 = tok_nextc(tok); |
1223 | if (c2 == 'f' || c2 == 'n'32 || c2 == 's'16 ) { Branch (1223:13): [True: 16, False: 32]
Branch (1223:26): [True: 16, False: 16]
Branch (1223:39): [True: 16, False: 0]
|
1224 | r = 1; |
1225 | } |
1226 | tok_backup(tok, c2); |
1227 | } |
1228 | else if (c == 'o') { Branch (1228:14): [True: 22, False: 646k]
|
1229 | r = lookahead(tok, "r"); |
1230 | } |
1231 | else if (c == 'n') { Branch (1231:14): [True: 17, False: 646k]
|
1232 | r = lookahead(tok, "ot"); |
1233 | } |
1234 | if (r) { Branch (1234:9): [True: 130, False: 646k]
|
1235 | tok_backup(tok, c); |
1236 | if (parser_warn(tok, PyExc_SyntaxWarning, Branch (1236:13): [True: 64, False: 66]
|
1237 | "invalid %s literal", kind)) |
1238 | { |
1239 | return 0; |
1240 | } |
1241 | tok_nextc(tok); |
1242 | } |
1243 | else /* In future releases, only error will remain. */ |
1244 | if (is_potential_identifier_char(c)) { |
1245 | tok_backup(tok, c); |
1246 | syntaxerror(tok, "invalid %s literal", kind); |
1247 | return 0; |
1248 | } |
1249 | return 1; |
1250 | } |
1251 | |
1252 | /* Verify that the identifier follows PEP 3131. |
1253 | All identifier strings are guaranteed to be "ready" unicode objects. |
1254 | */ |
1255 | static int |
1256 | verify_identifier(struct tok_state *tok) |
1257 | { |
1258 | PyObject *s; |
1259 | if (tok->decoding_erred) Branch (1259:9): [True: 0, False: 48]
|
1260 | return 0; |
1261 | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
1262 | if (s == NULL) { Branch (1262:9): [True: 4, False: 44]
|
1263 | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { Branch (1263:13): [True: 4, False: 0]
|
1264 | tok->done = E_DECODE; |
1265 | } |
1266 | else { |
1267 | tok->done = E_ERROR; |
1268 | } |
1269 | return 0; |
1270 | } |
1271 | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
1272 | if (invalid < 0) { Branch (1272:9): [True: 0, False: 44]
|
1273 | Py_DECREF(s); |
1274 | tok->done = E_ERROR; |
1275 | return 0; |
1276 | } |
1277 | assert(PyUnicode_GET_LENGTH(s) > 0); |
1278 | if (invalid < PyUnicode_GET_LENGTH(s)) { Branch (1278:9): [True: 8, False: 36]
|
1279 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
1280 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { Branch (1280:13): [True: 0, False: 8]
|
1281 | /* Determine the offset in UTF-8 encoded input */ |
1282 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
1283 | if (s != NULL) { Branch (1283:17): [True: 0, False: 0]
|
1284 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
1285 | } |
1286 | if (s == NULL) { Branch (1286:17): [True: 0, False: 0]
|
1287 | tok->done = E_ERROR; |
1288 | return 0; |
1289 | } |
1290 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
1291 | } |
1292 | Py_DECREF(s); |
1293 | // PyUnicode_FromFormatV() does not support %X |
1294 | char hex[9]; |
1295 | (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); |
1296 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
1297 | syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); |
1298 | } |
1299 | else { |
1300 | syntaxerror(tok, "invalid non-printable character U+%s", hex); |
1301 | } |
1302 | return 0; |
1303 | } |
1304 | Py_DECREF(s); |
1305 | return 1; |
1306 | } |
1307 | |
/* Consume the rest of a run of decimal digits in which single
   underscores may separate digit groups.
   Returns the first character after the run, which stays consumed.
   Returns 0 after raising "invalid decimal literal" when an underscore is
   not followed by a digit; the offending character is pushed back. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1329 | |
1330 | /* Get next token, after space stripping etc. */ |
1331 | |
1332 | static inline int |
1333 | tok_continuation_line(struct tok_state *tok) { |
1334 | int c = tok_nextc(tok); |
1335 | if (c != '\n') { Branch (1335:9): [True: 7, False: 486]
|
1336 | tok->done = E_LINECONT; |
1337 | return -1; |
1338 | } |
1339 | c = tok_nextc(tok); |
1340 | if (c == EOF) { Branch (1340:9): [True: 9, False: 477]
|
1341 | tok->done = E_EOF; |
1342 | tok->cur = tok->inp; |
1343 | return -1; |
1344 | } else { |
1345 | tok_backup(tok, c); |
1346 | } |
1347 | return c; |
1348 | } |
1349 | |
1350 | static int |
1351 | tok_get(struct tok_state *tok, const char **p_start, const char **p_end) |
1352 | { |
1353 | int c; |
1354 | int blankline, nonascii; |
1355 | |
1356 | *p_start = *p_end = NULL; |
1357 | nextline: |
1358 | tok->start = NULL; |
1359 | blankline = 0; |
1360 | |
1361 | /* Get indentation level */ |
1362 | if (tok->atbol) { Branch (1362:9): [True: 3.36M, False: 6.82M]
|
1363 | int col = 0; |
1364 | int altcol = 0; |
1365 | tok->atbol = 0; |
1366 | int cont_line_col = 0; |
1367 | for (;;) { |
1368 | c = tok_nextc(tok); |
1369 | if (c == ' ') { Branch (1369:17): [True: 4.90M, False: 3.36M]
|
1370 | col++, altcol++; |
1371 | } |
1372 | else if (c == '\t') { Branch (1372:22): [True: 907, False: 3.36M]
|
1373 | col = (col / tok->tabsize + 1) * tok->tabsize; |
1374 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
1375 | } |
1376 | else if (c == '\014') {/* Control-L (formfeed) */ Branch (1376:22): [True: 7, False: 3.36M]
|
1377 | col = altcol = 0; /* For Emacs users */ |
1378 | } |
1379 | else if (c == '\\') { Branch (1379:22): [True: 30, False: 3.36M]
|
1380 | // Indentation cannot be split over multiple physical lines |
1381 | // using backslashes. This means that if we found a backslash |
1382 | // preceded by whitespace, **the first one we find** determines |
1383 | // the level of indentation of whatever comes next. |
1384 | cont_line_col = cont_line_col ? cont_line_col6 : col24 ; Branch (1384:33): [True: 6, False: 24]
|
1385 | if ((c = tok_continuation_line(tok)) == -1) { Branch (1385:21): [True: 1, False: 29]
|
1386 | return ERRORTOKEN; |
1387 | } |
1388 | } |
1389 | else { |
1390 | break; |
1391 | } |
1392 | } |
1393 | tok_backup(tok, c); |
1394 | if (c == '#' || c == '\n'3.33M ) { Branch (1394:13): [True: 33.8k, False: 3.33M]
Branch (1394:25): [True: 2.62M, False: 707k]
|
1395 | /* Lines with only whitespace and/or comments |
1396 | shouldn't affect the indentation and are |
1397 | not passed to the parser as NEWLINE tokens, |
1398 | except *totally* empty lines in interactive |
1399 | mode, which signal the end of a command group. */ |
1400 | if (col == 0 && c == '\n'61.5k && tok->prompt != NULL49.4k ) { Branch (1400:17): [True: 61.5k, False: 2.59M]
Branch (1400:29): [True: 49.4k, False: 12.1k]
Branch (1400:42): [True: 0, False: 49.4k]
|
1401 | blankline = 0; /* Let it through */ |
1402 | } |
1403 | else if (tok->prompt != NULL && tok->lineno == 10 ) { Branch (1403:22): [True: 0, False: 2.65M]
Branch (1403:45): [True: 0, False: 0]
|
1404 | /* In interactive mode, if the first line contains |
1405 | only spaces and/or a comment, let it through. */ |
1406 | blankline = 0; |
1407 | col = altcol = 0; |
1408 | } |
1409 | else { |
1410 | blankline = 1; /* Ignore completely */ |
1411 | } |
1412 | /* We can't jump back right here since we still |
1413 | may need to skip to the end of a comment */ |
1414 | } |
1415 | if (!blankline && tok->level == 0707k ) { Branch (1415:13): [True: 707k, False: 2.65M]
Branch (1415:27): [True: 593k, False: 114k]
|
1416 | col = cont_line_col ? cont_line_col10 : col593k ; Branch (1416:19): [True: 10, False: 593k]
|
1417 | altcol = cont_line_col ? cont_line_col10 : altcol593k ; Branch (1417:22): [True: 10, False: 593k]
|
1418 | if (col == tok->indstack[tok->indent]) { Branch (1418:17): [True: 445k, False: 147k]
|
1419 | /* No change */ |
1420 | if (altcol != tok->altindstack[tok->indent]) { Branch (1420:21): [True: 2, False: 445k]
|
1421 | return indenterror(tok); |
1422 | } |
1423 | } |
1424 | else if (col > tok->indstack[tok->indent]) { Branch (1424:22): [True: 81.6k, False: 65.7k]
|
1425 | /* Indent -- always one */ |
1426 | if (tok->indent+1 >= MAXINDENT) { Branch (1426:21): [True: 0, False: 81.6k]
|
1427 | tok->done = E_TOODEEP; |
1428 | tok->cur = tok->inp; |
1429 | return ERRORTOKEN; |
1430 | } |
1431 | if (altcol <= tok->altindstack[tok->indent]) { Branch (1431:21): [True: 0, False: 81.6k]
|
1432 | return indenterror(tok); |
1433 | } |
1434 | tok->pendin++; |
1435 | tok->indstack[++tok->indent] = col; |
1436 | tok->altindstack[tok->indent] = altcol; |
1437 | } |
1438 | else /* col < tok->indstack[tok->indent] */ { |
1439 | /* Dedent -- any number, must be consistent */ |
1440 | while (tok->indent > 0 && Branch (1440:24): [True: 132k, False: 14.9k]
|
1441 | col < tok->indstack[tok->indent]132k ) { Branch (1441:21): [True: 81.6k, False: 50.7k]
|
1442 | tok->pendin--; |
1443 | tok->indent--; |
1444 | } |
1445 | if (col != tok->indstack[tok->indent]) { Branch (1445:21): [True: 6, False: 65.7k]
|
1446 | tok->done = E_DEDENT; |
1447 | tok->cur = tok->inp; |
1448 | return ERRORTOKEN; |
1449 | } |
1450 | if (altcol != tok->altindstack[tok->indent]) { Branch (1450:21): [True: 0, False: 65.7k]
|
1451 | return indenterror(tok); |
1452 | } |
1453 | } |
1454 | } |
1455 | } |
1456 | |
1457 | tok->start = tok->cur; |
1458 | |
1459 | /* Return pending indents/dedents */ |
1460 | if (tok->pendin != 0) { Branch (1460:9): [True: 163k, False: 10.0M]
|
1461 | if (tok->pendin < 0) { Branch (1461:13): [True: 81.6k, False: 81.6k]
|
1462 | tok->pendin++; |
1463 | return DEDENT; |
1464 | } |
1465 | else { |
1466 | tok->pendin--; |
1467 | return INDENT; |
1468 | } |
1469 | } |
1470 | |
1471 | /* Peek ahead at the next character */ |
1472 | c = tok_nextc(tok); |
1473 | tok_backup(tok, c); |
1474 | /* Check if we are closing an async function */ |
1475 | if (tok->async_def Branch (1475:9): [True: 141, False: 10.0M]
|
1476 | && !blankline141 Branch (1476:12): [True: 135, False: 6]
|
1477 | /* Due to some implementation artifacts of type comments, |
1478 | * a TYPE_COMMENT at the start of a function won't set an |
1479 | * indentation level and it will produce a NEWLINE after it. |
1480 | * To avoid spuriously ending an async function due to this, |
1481 | * wait until we have some non-newline char in front of us. */ |
1482 | && c != '\n'135 Branch (1482:12): [True: 114, False: 21]
|
1483 | && tok->level == 0114 Branch (1483:12): [True: 75, False: 39]
|
1484 | /* There was a NEWLINE after ASYNC DEF, |
1485 | so we're past the signature. */ |
1486 | && tok->async_def_nl75 Branch (1486:12): [True: 36, False: 39]
|
1487 | /* Current indentation level is less than where |
1488 | the async function was defined */ |
1489 | && tok->async_def_indent >= tok->indent36 ) Branch (1489:12): [True: 9, False: 27]
|
1490 | { |
1491 | tok->async_def = 0; |
1492 | tok->async_def_indent = 0; |
1493 | tok->async_def_nl = 0; |
1494 | } |
1495 | |
1496 | again: |
1497 | tok->start = NULL; |
1498 | /* Skip spaces */ |
1499 | do { |
1500 | c = tok_nextc(tok); |
1501 | } while (c == ' ' || c == '\t'10.0M || c == '\014'10.0M ); Branch (1501:14): [True: 1.18M, False: 10.0M]
Branch (1501:26): [True: 4, False: 10.0M]
Branch (1501:39): [True: 0, False: 10.0M]
|
1502 | |
1503 | /* Set start of current token */ |
1504 | tok->start = tok->cur - 1; |
1505 | |
1506 | /* Skip comment, unless it's a type comment */ |
1507 | if (c == '#') { Branch (1507:9): [True: 38.2k, False: 9.99M]
|
1508 | const char *prefix, *p, *type_start; |
1509 | |
1510 | while (c != EOF && c != '\n'1.73M ) { Branch (1510:16): [True: 1.73M, False: 1]
Branch (1510:28): [True: 1.69M, False: 38.2k]
|
1511 | c = tok_nextc(tok); |
1512 | } |
1513 | |
1514 | if (tok->type_comments) { Branch (1514:13): [True: 611, False: 37.6k]
|
1515 | p = tok->start; |
1516 | prefix = type_comment_prefix; |
1517 | while (*prefix && p < tok->cur4.88k ) { Branch (1517:20): [True: 4.88k, False: 611]
Branch (1517:31): [True: 4.88k, False: 0]
|
1518 | if (*prefix == ' ') { Branch (1518:21): [True: 1.22k, False: 3.66k]
|
1519 | while (*p == ' ' || *p == '\t'1.22k ) { Branch (1519:28): [True: 1.22k, False: 1.22k]
Branch (1519:41): [True: 0, False: 1.22k]
|
1520 | p++; |
1521 | } |
1522 | } else if (*prefix == *p) { Branch (1522:28): [True: 3.66k, False: 0]
|
1523 | p++; |
1524 | } else { |
1525 | break; |
1526 | } |
1527 | |
1528 | prefix++; |
1529 | } |
1530 | |
1531 | /* This is a type comment if we matched all of type_comment_prefix. */ |
1532 | if (!*prefix) { Branch (1532:17): [True: 611, False: 0]
|
1533 | int is_type_ignore = 1; |
1534 | const char *ignore_end = p + 6; |
1535 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
1536 | |
1537 | type_start = p; |
1538 | |
1539 | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
1540 | * or anything ASCII and non-alphanumeric. */ |
1541 | is_type_ignore = ( |
1542 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0165 Branch (1542:21): [True: 165, False: 446]
Branch (1542:47): [True: 92, False: 73]
|
1543 | && !(92 tok->cur > ignore_end92 Branch (1543:26): [True: 60, False: 32]
|
1544 | && (60 (unsigned char)ignore_end[0] >= 12860 || Py_ISALNUM51 (ignore_end[0])))); Branch (1544:30): [True: 9, False: 51]
|
1545 | |
1546 | if (is_type_ignore) { Branch (1546:21): [True: 74, False: 537]
|
1547 | *p_start = ignore_end; |
1548 | *p_end = tok->cur; |
1549 | |
1550 | /* If this type ignore is the only thing on the line, consume the newline also. */ |
1551 | if (blankline) { Branch (1551:25): [True: 0, False: 74]
|
1552 | tok_nextc(tok); |
1553 | tok->atbol = 1; |
1554 | } |
1555 | return TYPE_IGNORE; |
1556 | } else { |
1557 | *p_start = type_start; /* after type_comment_prefix */ |
1558 | *p_end = tok->cur; |
1559 | return TYPE_COMMENT; |
1560 | } |
1561 | } |
1562 | } |
1563 | } |
1564 | |
1565 | if (tok->done == E_INTERACT_STOP) { Branch (1565:9): [True: 0, False: 10.0M]
|
1566 | return ENDMARKER; |
1567 | } |
1568 | |
1569 | /* Check for EOF and errors now */ |
1570 | if (c == EOF) { Branch (1570:9): [True: 55.7k, False: 9.97M]
|
1571 | if (tok->level) { Branch (1571:13): [True: 185, False: 55.5k]
|
1572 | return ERRORTOKEN; |
1573 | } |
1574 | return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN0 ; Branch (1574:16): [True: 55.5k, False: 0]
|
1575 | } |
1576 | |
1577 | /* Identifier (most frequent token!) */ |
1578 | nonascii = 0; |
1579 | if (is_potential_identifier_start(c)) { |
1580 | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
1581 | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; |
1582 | while (1) { Branch (1582:16): [Folded - Ignored]
|
1583 | if (!(saw_b || saw_u2.33M || saw_f2.32M ) && (2.27M c == 'b'2.27M || c == 'B'1.74M )) Branch (1583:19): [True: 524k, False: 2.33M]
Branch (1583:28): [True: 6.35k, False: 2.32M]
Branch (1583:37): [True: 52.4k, False: 2.27M]
Branch (1583:48): [True: 523k, False: 1.74M]
Branch (1583:60): [True: 3.92k, False: 1.74M]
|
1584 | saw_b = 1; |
1585 | /* Since this is a backwards compatibility support literal we don't |
1586 | want to support it in arbitrary order like byte literals. */ |
1587 | else if (!(saw_b || saw_u1.80M || saw_r1.79M || saw_f1.74M ) Branch (1587:24): [True: 524k, False: 1.80M]
Branch (1587:33): [True: 6.35k, False: 1.79M]
Branch (1587:42): [True: 53.3k, False: 1.74M]
Branch (1587:51): [True: 47.2k, False: 1.69M]
|
1588 | && (1.69M c == 'u'1.69M || c == 'U'1.69M )) { Branch (1588:26): [True: 5.14k, False: 1.69M]
Branch (1588:37): [True: 1.25k, False: 1.69M]
|
1589 | saw_u = 1; |
1590 | } |
1591 | /* ur"" and ru"" are not supported */ |
1592 | else if (!(saw_r || saw_u2.26M ) && (2.26M c == 'r'2.26M || c == 'R'2.20M )) { Branch (1592:24): [True: 54.7k, False: 2.26M]
Branch (1592:33): [True: 6.35k, False: 2.26M]
Branch (1592:44): [True: 54.0k, False: 2.20M]
Branch (1592:56): [True: 1.55k, False: 2.20M]
|
1593 | saw_r = 1; |
1594 | } |
1595 | else if (!(saw_f || saw_b2.21M || saw_u1.69M ) && (1.68M c == 'f'1.68M || c == 'F'1.64M )) { Branch (1595:24): [True: 47.3k, False: 2.21M]
Branch (1595:33): [True: 523k, False: 1.69M]
Branch (1595:42): [True: 6.35k, False: 1.68M]
Branch (1595:53): [True: 46.8k, False: 1.64M]
Branch (1595:65): [True: 4.91k, False: 1.63M]
|
1596 | saw_f = 1; |
1597 | } |
1598 | else { |
1599 | break; |
1600 | } |
1601 | c = tok_nextc(tok); |
1602 | if (c == '"' || c == '\''639k ) { Branch (1602:17): [True: 2.49k, False: 639k]
Branch (1602:29): [True: 7.24k, False: 631k]
|
1603 | goto letter_quote; |
1604 | } |
1605 | } |
1606 | while (2.21M is_potential_identifier_char(c)) { |
1607 | if (c >= 128) { Branch (1607:17): [True: 98, False: 5.69M]
|
1608 | nonascii = 1; |
1609 | } |
1610 | c = tok_nextc(tok); |
1611 | } |
1612 | tok_backup(tok, c); |
1613 | if (nonascii && !verify_identifier(tok)48 ) { Branch (1613:13): [True: 48, False: 2.21M]
Branch (1613:25): [True: 12, False: 36]
|
1614 | return ERRORTOKEN; |
1615 | } |
1616 | |
1617 | *p_start = tok->start; |
1618 | *p_end = tok->cur; |
1619 | |
1620 | /* async/await parsing block. */ |
1621 | if (tok->cur - tok->start == 5 && tok->start[0] == 'a'75.4k ) { Branch (1621:13): [True: 75.4k, False: 2.13M]
Branch (1621:43): [True: 2.20k, False: 73.2k]
|
1622 | /* May be an 'async' or 'await' token. For Python 3.7 or |
1623 | later we recognize them unconditionally. For Python |
1624 | 3.5 or 3.6 we recognize 'async' in front of 'def', and |
1625 | either one inside of 'async def'. (Technically we |
1626 | shouldn't recognize these at all for 3.4 or earlier, |
1627 | but there's no *valid* Python 3.4 code that would be |
1628 | rejected, and async functions will be rejected in a |
1629 | later phase.) */ |
1630 | if (!tok->async_hacks || tok->async_def24 ) { Branch (1630:17): [True: 2.18k, False: 24]
Branch (1630:38): [True: 9, False: 15]
|
1631 | /* Always recognize the keywords. */ |
1632 | if (memcmp(tok->start, "async", 5) == 0) { Branch (1632:21): [True: 957, False: 1.23k]
|
1633 | return ASYNC; |
1634 | } |
1635 | if (memcmp(tok->start, "await", 5) == 0) { Branch (1635:21): [True: 219, False: 1.01k]
|
1636 | return AWAIT; |
1637 | } |
1638 | } |
1639 | else if (memcmp(tok->start, "async", 5) == 0) { Branch (1639:22): [True: 12, False: 3]
|
1640 | /* The current token is 'async'. |
1641 | Look ahead one token to see if that is 'def'. */ |
1642 | |
1643 | struct tok_state ahead_tok; |
1644 | const char *ahead_tok_start = NULL; |
1645 | const char *ahead_tok_end = NULL; |
1646 | int ahead_tok_kind; |
1647 | |
1648 | memcpy(&ahead_tok, tok, sizeof(ahead_tok)); |
1649 | ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, |
1650 | &ahead_tok_end); |
1651 | |
1652 | if (ahead_tok_kind == NAME Branch (1652:21): [True: 9, False: 3]
|
1653 | && ahead_tok.cur - ahead_tok.start == 39 Branch (1653:24): [True: 9, False: 0]
|
1654 | && memcmp(ahead_tok.start, "def", 3) == 09 ) Branch (1654:24): [True: 9, False: 0]
|
1655 | { |
1656 | /* The next token is going to be 'def', so instead of |
1657 | returning a plain NAME token, return ASYNC. */ |
1658 | tok->async_def_indent = tok->indent; |
1659 | tok->async_def = 1; |
1660 | return ASYNC; |
1661 | } |
1662 | } |
1663 | } |
1664 | |
1665 | return NAME; |
1666 | } |
1667 | |
1668 | /* Newline */ |
1669 | if (c == '\n') { Branch (1669:9): [True: 3.31M, False: 4.44M]
|
1670 | tok->atbol = 1; |
1671 | if (blankline || tok->level > 0651k ) { Branch (1671:13): [True: 2.65M, False: 651k]
Branch (1671:26): [True: 114k, False: 536k]
|
1672 | goto nextline; |
1673 | } |
1674 | *p_start = tok->start; |
1675 | *p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
1676 | tok->cont_line = 0; |
1677 | if (tok->async_def) { Branch (1677:13): [True: 21, False: 536k]
|
1678 | /* We're somewhere inside an 'async def' function, and |
1679 | we've encountered a NEWLINE after its signature. */ |
1680 | tok->async_def_nl = 1; |
1681 | } |
1682 | return NEWLINE; |
1683 | } |
1684 | |
1685 | /* Period or number starting with period? */ |
1686 | if (c == '.') { Branch (1686:9): [True: 423k, False: 4.01M]
|
1687 | c = tok_nextc(tok); |
1688 | if (isdigit(c)) { |
1689 | goto fraction; |
1690 | } else if (c == '.') { Branch (1690:20): [True: 1.40k, False: 422k]
|
1691 | c = tok_nextc(tok); |
1692 | if (c == '.') { Branch (1692:17): [True: 1.39k, False: 15]
|
1693 | *p_start = tok->start; |
1694 | *p_end = tok->cur; |
1695 | return ELLIPSIS; |
1696 | } |
1697 | else { |
1698 | tok_backup(tok, c); |
1699 | } |
1700 | tok_backup(tok, '.'); |
1701 | } |
1702 | else { |
1703 | tok_backup(tok, c); |
1704 | } |
1705 | *p_start = tok->start; |
1706 | *p_end = tok->cur; |
1707 | return DOT; |
1708 | } |
1709 | |
1710 | /* Number */ |
1711 | if (isdigit(c)) { |
1712 | if (c == '0') { Branch (1712:13): [True: 320k, False: 325k]
|
1713 | /* Hex, octal or binary -- maybe. */ |
1714 | c = tok_nextc(tok); |
1715 | if (c == 'x' || c == 'X'318k ) { Branch (1715:17): [True: 2.30k, False: 318k]
Branch (1715:29): [True: 2, False: 318k]
|
1716 | /* Hex */ |
1717 | c = tok_nextc(tok); |
1718 | do { |
1719 | if (c == '_') { Branch (1719:25): [True: 17, False: 2.30k]
|
1720 | c = tok_nextc(tok); |
1721 | } |
1722 | if (!isxdigit(c)) { Branch (1722:25): [True: 16, False: 2.30k]
|
1723 | tok_backup(tok, c); |
1724 | return syntaxerror(tok, "invalid hexadecimal literal"); |
1725 | } |
1726 | do 2.30k { |
1727 | c = tok_nextc(tok); |
1728 | } while (isxdigit(c)); |
1729 | } while (c == '_'); Branch (1729:26): [True: 13, False: 2.29k]
|
1730 | if (!verify_end_of_number(tok, c, "hexadecimal")) { Branch (1730:21): [True: 12, False: 2.28k]
|
1731 | return ERRORTOKEN; |
1732 | } |
1733 | } |
1734 | else if (c == 'o' || c == 'O'318k ) { Branch (1734:22): [True: 142, False: 318k]
Branch (1734:34): [True: 3, False: 318k]
|
1735 | /* Octal */ |
1736 | c = tok_nextc(tok); |
1737 | do { |
1738 | if (c == '_') { Branch (1738:25): [True: 12, False: 143]
|
1739 | c = tok_nextc(tok); |
1740 | } |
1741 | if (c < '0' || c >= '8'150 ) { Branch (1741:25): [True: 5, False: 150]
Branch (1741:36): [True: 7, False: 143]
|
1742 | if (isdigit(c)) { |
1743 | return syntaxerror(tok, |
1744 | "invalid digit '%c' in octal literal", c); |
1745 | } |
1746 | else { |
1747 | tok_backup(tok, c); |
1748 | return syntaxerror(tok, "invalid octal literal"); |
1749 | } |
1750 | } |
1751 | do 143 { |
1752 | c = tok_nextc(tok); |
1753 | } while ('0' <= c && c < '8'527 ); Branch (1753:30): [True: 527, False: 95]
Branch (1753:42): [True: 479, False: 48]
|
1754 | } while (c == '_'); Branch (1754:26): [True: 10, False: 133]
|
1755 | if (isdigit(c)) { |
1756 | return syntaxerror(tok, |
1757 | "invalid digit '%c' in octal literal", c); |
1758 | } |
1759 | if (!verify_end_of_number(tok, c, "octal")) { Branch (1759:21): [True: 11, False: 119]
|
1760 | return ERRORTOKEN; |
1761 | } |
1762 | } |
1763 | else if (c == 'b' || c == 'B'318k ) { Branch (1763:22): [True: 54, False: 318k]
Branch (1763:34): [True: 3, False: 318k]
|
1764 | /* Binary */ |
1765 | c = tok_nextc(tok); |
1766 | do { |
1767 | if (c == '_') { Branch (1767:25): [True: 10, False: 55]
|
1768 | c = tok_nextc(tok); |
1769 | } |
1770 | if (c != '0' && c != '1'59 ) { Branch (1770:25): [True: 59, False: 6]
Branch (1770:37): [True: 12, False: 47]
|
1771 | if (isdigit(c)) { |
1772 | return syntaxerror(tok, |
1773 | "invalid digit '%c' in binary literal", c); |
1774 | } |
1775 | else { |
1776 | tok_backup(tok, c); |
1777 | return syntaxerror(tok, "invalid binary literal"); |
1778 | } |
1779 | } |
1780 | do 53 { |
1781 | c = tok_nextc(tok); |
1782 | } while (c == '0' || c == '1'384 ); Branch (1782:30): [True: 235, False: 384]
Branch (1782:42): [True: 331, False: 53]
|
1783 | } while (c == '_'); Branch (1783:26): [True: 8, False: 45]
|
1784 | if (isdigit(c)) { |
1785 | return syntaxerror(tok, |
1786 | "invalid digit '%c' in binary literal", c); |
1787 | } |
1788 | if (!verify_end_of_number(tok, c, "binary")) { Branch (1788:21): [True: 11, False: 32]
|
1789 | return ERRORTOKEN; |
1790 | } |
1791 | } |
1792 | else { |
1793 | int nonzero = 0; |
1794 | /* maybe old-style octal; c is first char of it */ |
1795 | /* in any case, allow '0' as a literal */ |
1796 | while (1) { Branch (1796:24): [Folded - Ignored]
|
1797 | if (c == '_') { Branch (1797:25): [True: 13, False: 318k]
|
1798 | c = tok_nextc(tok); |
1799 | if (!isdigit(c)) { Branch (1799:29): [True: 4, False: 9]
|
1800 | tok_backup(tok, c); |
1801 | return syntaxerror(tok, "invalid decimal literal"); |
1802 | } |
1803 | } |
1804 | if (c != '0') { Branch (1804:25): [True: 318k, False: 139]
|
1805 | break; |
1806 | } |
1807 | c = tok_nextc(tok); |
1808 | } |
1809 | char* zeros_end = tok->cur; |
1810 | if (isdigit(c)) { |
1811 | nonzero = 1; |
1812 | c = tok_decimal_tail(tok); |
1813 | if (c == 0) { Branch (1813:25): [True: 0, False: 26]
|
1814 | return ERRORTOKEN; |
1815 | } |
1816 | } |
1817 | if (c == '.') { Branch (1817:21): [True: 445, False: 317k]
|
1818 | c = tok_nextc(tok); |
1819 | goto fraction; |
1820 | } |
1821 | else if (c == 'e' || c == 'E'317k ) { Branch (1821:26): [True: 11, False: 317k]
Branch (1821:38): [True: 2, False: 317k]
|
1822 | goto exponent; |
1823 | } |
1824 | else if (c == 'j' || c == 'J'317k ) { Branch (1824:26): [True: 128, False: 317k]
Branch (1824:38): [True: 0, False: 317k]
|
1825 | goto imaginary; |
1826 | } |
1827 | else if (nonzero) { Branch (1827:26): [True: 13, False: 317k]
|
1828 | /* Old-style octal: now disallowed. */ |
1829 | tok_backup(tok, c); |
1830 | return syntaxerror_known_range( |
1831 | tok, (int)(tok->start + 1 - tok->line_start), |
1832 | (int)(zeros_end - tok->line_start), |
1833 | "leading zeros in decimal integer " |
1834 | "literals are not permitted; " |
1835 | "use an 0o prefix for octal integers"); |
1836 | } |
1837 | if (!verify_end_of_number(tok, c, "decimal")) { Branch (1837:21): [True: 7, False: 317k]
|
1838 | return ERRORTOKEN; |
1839 | } |
1840 | } |
1841 | } |
1842 | else { |
1843 | /* Decimal */ |
1844 | c = tok_decimal_tail(tok); |
1845 | if (c == 0) { Branch (1845:17): [True: 11, False: 325k]
|
1846 | return ERRORTOKEN; |
1847 | } |
1848 | { |
1849 | /* Accept floating point numbers. */ |
1850 | if (c == '.') { Branch (1850:21): [True: 2.05k, False: 323k]
|
1851 | c = tok_nextc(tok); |
1852 | fraction: |
1853 | /* Fraction */ |
1854 | if (isdigit(c)) { |
1855 | c = tok_decimal_tail(tok); |
1856 | if (c == 0) { Branch (1856:29): [True: 10, False: 2.42k]
|
1857 | return ERRORTOKEN; |
1858 | } |
1859 | } |
1860 | } |
1861 | if (c == 'e' || c == 'E'324k ) { Branch (1861:21): [True: 1.52k, False: 324k]
Branch (1861:33): [True: 1.02k, False: 323k]
|
1862 | int e; |
1863 | exponent: |
1864 | e = c; |
1865 | /* Exponent part */ |
1866 | c = tok_nextc(tok); |
1867 | if (c == '+' || c == '-'1.95k ) { Branch (1867:25): [True: 604, False: 1.95k]
Branch (1867:37): [True: 1.23k, False: 714]
|
1868 | c = tok_nextc(tok); |
1869 | if (!isdigit(c)) { Branch (1869:29): [True: 8, False: 1.83k]
|
1870 | tok_backup(tok, c); |
1871 | return syntaxerror(tok, "invalid decimal literal"); |
1872 | } |
1873 | } else if (714 !isdigit714 (c)) { Branch (1873:32): [True: 15, False: 699]
|
1874 | tok_backup(tok, c); |
1875 | if (!verify_end_of_number(tok, e, "decimal")) { Branch (1875:29): [True: 10, False: 5]
|
1876 | return ERRORTOKEN; |
1877 | } |
1878 | tok_backup(tok, e); |
1879 | *p_start = tok->start; |
1880 | *p_end = tok->cur; |
1881 | return NUMBER; |
1882 | } |
1883 | c = tok_decimal_tail(tok); |
1884 | if (c == 0) { Branch (1884:25): [True: 6, False: 2.52k]
|
1885 | return ERRORTOKEN; |
1886 | } |
1887 | } |
1888 | if (c == 'j' || c == 'J'325k ) { Branch (1888:21): [True: 490, False: 325k]
Branch (1888:33): [True: 0, False: 325k]
|
1889 | /* Imaginary part */ |
1890 | imaginary: |
1891 | c = tok_nextc(tok); |
1892 | if (!verify_end_of_number(tok, c, "imaginary")) { Branch (1892:25): [True: 10, False: 608]
|
1893 | return ERRORTOKEN; |
1894 | } |
1895 | } |
1896 | else if (!verify_end_of_number(tok, c, "decimal")) { Branch (1896:26): [True: 27, False: 325k]
|
1897 | return ERRORTOKEN; |
1898 | } |
1899 | } |
1900 | } |
1901 | tok_backup(tok, c); |
1902 | *p_start = tok->start; |
1903 | *p_end = tok->cur; |
1904 | return NUMBER; |
1905 | } |
1906 | |
1907 | letter_quote: |
1908 | /* String */ |
1909 | if (c == '\'' || c == '"'3.27M ) { Branch (1909:9): [True: 107k, False: 3.27M]
Branch (1909:22): [True: 98.8k, False: 3.17M]
|
1910 | int quote = c; |
1911 | int quote_size = 1; /* 1 or 3 */ |
1912 | int end_quote_size = 0; |
1913 | |
1914 | /* Nodes of type STRING, especially multi line strings |
1915 | must be handled differently in order to get both |
1916 | the starting line number and the column offset right. |
1917 | (cf. issue 16806) */ |
1918 | tok->first_lineno = tok->lineno; |
1919 | tok->multi_line_start = tok->line_start; |
1920 | |
1921 | /* Find the quote size and start of string */ |
1922 | c = tok_nextc(tok); |
1923 | if (c == quote) { Branch (1923:13): [True: 76.2k, False: 129k]
|
1924 | c = tok_nextc(tok); |
1925 | if (c == quote) { Branch (1925:17): [True: 8.52k, False: 67.6k]
|
1926 | quote_size = 3; |
1927 | } |
1928 | else { |
1929 | end_quote_size = 1; /* empty string found */ |
1930 | } |
1931 | } |
1932 | if (c != quote) { Branch (1932:13): [True: 197k, False: 8.52k]
|
1933 | tok_backup(tok, c); |
1934 | } |
1935 | |
1936 | /* Get rest of string */ |
1937 | while (end_quote_size != quote_size) { Branch (1937:16): [True: 4.20M, False: 206k]
|
1938 | c = tok_nextc(tok); |
1939 | if (c == EOF || (4.20M quote_size == 14.20M && c == '\n'1.72M )) { Branch (1939:17): [True: 11, False: 4.20M]
Branch (1939:30): [True: 1.72M, False: 2.48M]
Branch (1939:49): [True: 5, False: 1.72M]
|
1940 | assert(tok->multi_line_start != NULL); |
1941 | // shift the tok_state's location into |
1942 | // the start of string, and report the error |
1943 | // from the initial quote character |
1944 | tok->cur = (char *)tok->start; |
1945 | tok->cur++; |
1946 | tok->line_start = tok->multi_line_start; |
1947 | int start = tok->lineno; |
1948 | tok->lineno = tok->first_lineno; |
1949 | if (quote_size == 3) { Branch (1949:21): [True: 5, False: 11]
|
1950 | syntaxerror(tok, "unterminated triple-quoted string literal" |
1951 | " (detected at line %d)", start); |
1952 | if (c != '\n') { Branch (1952:25): [True: 5, False: 0]
|
1953 | tok->done = E_EOFS; |
1954 | } |
1955 | return ERRORTOKEN; |
1956 | } |
1957 | else { |
1958 | syntaxerror(tok, "unterminated string literal (detected at" |
1959 | " line %d)", start); |
1960 | if (c != '\n') { Branch (1960:25): [True: 6, False: 5]
|
1961 | tok->done = E_EOLS; |
1962 | } |
1963 | return ERRORTOKEN; |
1964 | } |
1965 | } |
1966 | if (c == quote) { Branch (1966:17): [True: 160k, False: 4.04M]
|
1967 | end_quote_size += 1; |
1968 | } |
1969 | else { |
1970 | end_quote_size = 0; |
1971 | if (c == '\\') { Branch (1971:21): [True: 29.6k, False: 4.01M]
|
1972 | tok_nextc(tok); /* skip escaped char */ |
1973 | } |
1974 | } |
1975 | } |
1976 | |
1977 | *p_start = tok->start; |
1978 | *p_end = tok->cur; |
1979 | return STRING; |
1980 | } |
1981 | |
1982 | /* Line continuation */ |
1983 | if (c == '\\') { Branch (1983:9): [True: 463, False: 3.17M]
|
1984 | if ((c = tok_continuation_line(tok)) == -1) { Branch (1984:13): [True: 15, False: 448]
|
1985 | return ERRORTOKEN; |
1986 | } |
1987 | tok->cont_line = 1; |
1988 | goto again; /* Read next line */ |
1989 | } |
1990 | |
1991 | /* Check for two-character token */ |
1992 | { |
1993 | int c2 = tok_nextc(tok); |
1994 | int token = _PyToken_TwoChars(c, c2); |
1995 | if (token != OP) { Branch (1995:13): [True: 18.4k, False: 3.15M]
|
1996 | int c3 = tok_nextc(tok); |
1997 | int token3 = _PyToken_ThreeChars(c, c2, c3); |
1998 | if (token3 != OP) { Branch (1998:17): [True: 89, False: 18.3k]
|
1999 | token = token3; |
2000 | } |
2001 | else { |
2002 | tok_backup(tok, c3); |
2003 | } |
2004 | *p_start = tok->start; |
2005 | *p_end = tok->cur; |
2006 | return token; |
2007 | } |
2008 | tok_backup(tok, c2); |
2009 | } |
2010 | |
2011 | /* Keep track of parentheses nesting level */ |
2012 | switch (c) { Branch (2012:13): [True: 1.35M, False: 1.80M]
|
2013 | case '(': Branch (2013:5): [True: 565k, False: 2.58M]
|
2014 | case '[': Branch (2014:5): [True: 332k, False: 2.82M]
|
2015 | case '{': Branch (2015:5): [True: 4.78k, False: 3.15M]
|
2016 | if (tok->level >= MAXLEVEL) { Branch (2016:13): [True: 1, False: 902k]
|
2017 | return syntaxerror(tok, "too many nested parentheses"); |
2018 | } |
2019 | tok->parenstack[tok->level] = c; |
2020 | tok->parenlinenostack[tok->level] = tok->lineno; |
2021 | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
2022 | tok->level++; |
2023 | break; |
2024 | case ')': Branch (2024:5): [True: 565k, False: 2.59M]
|
2025 | case ']': Branch (2025:5): [True: 332k, False: 2.82M]
|
2026 | case '}': Branch (2026:5): [True: 4.72k, False: 3.15M]
|
2027 | if (!tok->level) { Branch (2027:13): [True: 12, False: 902k]
|
2028 | return syntaxerror(tok, "unmatched '%c'", c); |
2029 | } |
2030 | tok->level--; |
2031 | int opening = tok->parenstack[tok->level]; |
2032 | if (!((opening == '(' && c == ')'565k ) || Branch (2032:16): [True: 565k, False: 336k]
Branch (2032:34): [True: 565k, False: 2]
|
2033 | (336k opening == '['336k && c == ']'332k ) || Branch (2033:16): [True: 332k, False: 4.72k]
Branch (2033:34): [True: 332k, False: 2]
|
2034 | (4.72k opening == '{'4.72k && c == '}'4.72k ))) Branch (2034:16): [True: 4.72k, False: 4]
Branch (2034:34): [True: 4.72k, False: 1]
|
2035 | { |
2036 | if (tok->parenlinenostack[tok->level] != tok->lineno) { Branch (2036:17): [True: 0, False: 5]
|
2037 | return syntaxerror(tok, |
2038 | "closing parenthesis '%c' does not match " |
2039 | "opening parenthesis '%c' on line %d", |
2040 | c, opening, tok->parenlinenostack[tok->level]); |
2041 | } |
2042 | else { |
2043 | return syntaxerror(tok, |
2044 | "closing parenthesis '%c' does not match " |
2045 | "opening parenthesis '%c'", |
2046 | c, opening); |
2047 | } |
2048 | } |
2049 | break; |
2050 | } |
2051 | |
2052 | if (!Py_UNICODE_ISPRINTABLE(c)) { Branch (2052:9): [True: 1, False: 3.15M]
|
2053 | char hex[9]; |
2054 | (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); |
2055 | return syntaxerror(tok, "invalid non-printable character U+%s", hex); |
2056 | } |
2057 | |
2058 | /* Punctuation character */ |
2059 | *p_start = tok->start; |
2060 | *p_end = tok->cur; |
2061 | return _PyToken_OneChar(c); |
2062 | } |
2063 | |
2064 | int |
2065 | _PyTokenizer_Get(struct tok_state *tok, |
2066 | const char **p_start, const char **p_end) |
2067 | { |
2068 | int result = tok_get(tok, p_start, p_end); |
2069 | if (tok->decoding_erred) { Branch (2069:9): [True: 0, False: 7.42M]
|
2070 | result = ERRORTOKEN; |
2071 | tok->done = E_DECODE; |
2072 | } |
2073 | return result; |
2074 | } |
2075 | |
2076 | #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3)) |
2077 | // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's |
2078 | // dup() emulation with open() is slow. |
2079 | typedef union { |
2080 | void *cookie; |
2081 | int fd; |
2082 | } borrowed; |
2083 | |
2084 | static ssize_t |
2085 | borrow_read(void *cookie, char *buf, size_t size) |
2086 | { |
2087 | borrowed b = {.cookie = cookie}; |
2088 | return read(b.fd, (void *)buf, size); |
2089 | } |
2090 | |
2091 | static FILE * |
2092 | fdopen_borrow(int fd) { |
2093 | // supports only reading. seek fails. close and write are no-ops. |
2094 | cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL}; |
2095 | borrowed b = {.fd = fd}; |
2096 | return fopencookie(b.cookie, "r", io_cb); |
2097 | } |
2098 | #else |
2099 | static FILE * |
2100 | fdopen_borrow(int fd) { |
2101 | fd = _Py_dup(fd); |
2102 | if (fd < 0) { Branch (2102:9): [True: 0, False: 256]
|
2103 | return NULL; |
2104 | } |
2105 | return fdopen(fd, "r"); |
2106 | } |
2107 | #endif |
2108 | |
2109 | /* Get the encoding of a Python file. Check for the coding cookie and check if |
2110 | the file starts with a BOM. |
2111 | |
2112 | _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the |
2113 | encoding in the first or second line of the file (in which case the encoding |
2114 | should be assumed to be UTF-8). |
2115 | |
2116 | The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed |
2117 | by the caller. */ |
2118 | |
2119 | char * |
2120 | _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) |
2121 | { |
2122 | struct tok_state *tok; |
2123 | FILE *fp; |
2124 | const char *p_start = NULL; |
2125 | const char *p_end = NULL; |
2126 | char *encoding = NULL; |
2127 | |
2128 | fp = fdopen_borrow(fd); |
2129 | if (fp == NULL) { Branch (2129:9): [True: 0, False: 256]
|
2130 | return NULL; |
2131 | } |
2132 | tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL); |
2133 | if (tok == NULL) { Branch (2133:9): [True: 0, False: 256]
|
2134 | fclose(fp); |
2135 | return NULL; |
2136 | } |
2137 | if (filename != NULL) { Branch (2137:9): [True: 256, False: 0]
|
2138 | Py_INCREF(filename); |
2139 | tok->filename = filename; |
2140 | } |
2141 | else { |
2142 | tok->filename = PyUnicode_FromString("<string>"); |
2143 | if (tok->filename == NULL) { Branch (2143:13): [True: 0, False: 0]
|
2144 | fclose(fp); |
2145 | _PyTokenizer_Free(tok); |
2146 | return encoding; |
2147 | } |
2148 | } |
2149 | while (256 tok->lineno < 2 && tok->done == 622 E_OK622 ) { Branch (2149:12): [True: 622, False: 254]
Branch (2149:31): [True: 620, False: 2]
|
2150 | _PyTokenizer_Get(tok, &p_start, &p_end); |
2151 | } |
2152 | fclose(fp); |
2153 | if (tok->encoding) { Branch (2153:9): [True: 0, False: 256]
|
2154 | encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1); |
2155 | if (encoding) { Branch (2155:13): [True: 0, False: 0]
|
2156 | strcpy(encoding, tok->encoding); |
2157 | } |
2158 | } |
2159 | _PyTokenizer_Free(tok); |
2160 | return encoding; |
2161 | } |
2162 | |
2163 | #ifdef Py_DEBUG |
2164 | void |
2165 | tok_dump(int type, char *start, char *end) |
2166 | { |
2167 | fprintf(stderr, "%s", _PyParser_TokenNames[type]); |
2168 | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
2169 | fprintf(stderr, "(%.*s)", (int)(end - start), start); |
2170 | } |
2171 | #endif // Py_DEBUG |