/home/mdboom/Work/builds/cpython/Parser/string_parser.c
Line | Count | Source (jump to first uncovered line) |
1 | #include <stdbool.h> |
2 | |
3 | #include <Python.h> |
4 | |
5 | #include "tokenizer.h" |
6 | #include "pegen.h" |
7 | #include "string_parser.h" |
8 | |
9 | //// STRING HANDLING FUNCTIONS //// |
10 | |
11 | static int |
12 | warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) |
13 | { |
14 | unsigned char c = *first_invalid_escape; |
15 | int octal = ('4' <= c && c <= '7'645 ); Branch (15:18): [True: 645, False: 86]
Branch (15:30): [True: 516, False: 129]
|
16 | PyObject *msg = |
17 | octal Branch (17:9): [True: 516, False: 215]
|
18 | ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", |
19 | first_invalid_escape) |
20 | : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c)215 ; |
21 | if (msg == NULL) { Branch (21:9): [True: 0, False: 731]
|
22 | return -1; |
23 | } |
24 | if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename, Branch (24:9): [True: 4, False: 727]
|
25 | t->lineno, NULL, NULL) < 0) { |
26 | if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { Branch (26:13): [True: 4, False: 0]
|
27 | /* Replace the DeprecationWarning exception with a SyntaxError |
28 | to get a more accurate error report */ |
29 | PyErr_Clear(); |
30 | |
31 | /* This is needed, in order for the SyntaxError to point to the token t, |
32 | since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the |
33 | error location, if p->known_err_token is not set. */ |
34 | p->known_err_token = t; |
35 | if (octal) { Branch (35:17): [True: 2, False: 2]
|
36 | RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", |
37 | first_invalid_escape); |
38 | } |
39 | else { |
40 | RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); |
41 | } |
42 | } |
43 | Py_DECREF(msg); |
44 | return -1; |
45 | } |
46 | Py_DECREF(msg); |
47 | return 0; |
48 | } |
49 | |
50 | static PyObject * |
51 | decode_utf8(const char **sPtr, const char *end) |
52 | { |
53 | const char *s; |
54 | const char *t; |
55 | t = s = *sPtr; |
56 | while (s < end && (*s & 0x80)4.87k ) { Branch (56:12): [True: 4.87k, False: 23]
Branch (56:23): [True: 3.30k, False: 1.56k]
|
57 | s++; |
58 | } |
59 | *sPtr = s; |
60 | return PyUnicode_DecodeUTF8(t, s - t, NULL); |
61 | } |
62 | |
63 | static PyObject * |
64 | decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) |
65 | { |
66 | PyObject *v; |
67 | PyObject *u; |
68 | char *buf; |
69 | char *p; |
70 | const char *end; |
71 | |
72 | /* check for integer overflow */ |
73 | if (len > SIZE_MAX / 6) { Branch (73:9): [True: 0, False: 86.2k]
|
74 | return NULL; |
75 | } |
76 | /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 |
77 | "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ |
78 | u = PyBytes_FromStringAndSize((char *)NULL, len * 6); |
79 | if (u == NULL) { Branch (79:9): [True: 0, False: 86.2k]
|
80 | return NULL; |
81 | } |
82 | p = buf = PyBytes_AsString(u); |
83 | if (p == NULL) { Branch (83:9): [True: 0, False: 86.2k]
|
84 | return NULL; |
85 | } |
86 | end = s + len; |
87 | while (s < end) { Branch (87:12): [True: 396k, False: 86.2k]
|
88 | if (*s == '\\') { Branch (88:13): [True: 16.3k, False: 379k]
|
89 | *p++ = *s++; |
90 | if (s >= end || *s & 0x8016.3k ) { Branch (90:17): [True: 1, False: 16.3k]
Branch (90:29): [True: 0, False: 16.3k]
|
91 | strcpy(p, "u005c"); |
92 | p += 5; |
93 | if (s >= end) { Branch (93:21): [True: 1, False: 0]
|
94 | break; |
95 | } |
96 | } |
97 | } |
98 | if (*s & 0x80) { Branch (98:13): [True: 1.59k, False: 394k]
|
99 | PyObject *w; |
100 | int kind; |
101 | const void *data; |
102 | Py_ssize_t w_len; |
103 | Py_ssize_t i; |
104 | w = decode_utf8(&s, end); |
105 | if (w == NULL) { Branch (105:17): [True: 0, False: 1.59k]
|
106 | Py_DECREF(u); |
107 | return NULL; |
108 | } |
109 | kind = PyUnicode_KIND(w); |
110 | data = PyUnicode_DATA(w); |
111 | w_len = PyUnicode_GET_LENGTH(w); |
112 | for (i = 0; i < w_len; i++1.63k ) { Branch (112:25): [True: 1.63k, False: 1.59k]
|
113 | Py_UCS4 chr = PyUnicode_READ(kind, data, i); |
114 | sprintf(p, "\\U%08x", chr); |
115 | p += 10; |
116 | } |
117 | /* Should be impossible to overflow */ |
118 | assert(p - buf <= PyBytes_GET_SIZE(u)); |
119 | Py_DECREF(w); |
120 | } |
121 | else { |
122 | *p++ = *s++; |
123 | } |
124 | } |
125 | len = p - buf; |
126 | s = buf; |
127 | |
128 | const char *first_invalid_escape; |
129 | v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape); |
130 | |
131 | if (v != NULL && first_invalid_escape != NULL86.1k ) { Branch (131:9): [True: 86.1k, False: 33]
Branch (131:22): [True: 364, False: 85.8k]
|
132 | if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { Branch (132:13): [True: 2, False: 362]
|
133 | /* We have not decref u before because first_invalid_escape points |
134 | inside u. */ |
135 | Py_XDECREF(u); |
136 | Py_DECREF(v); |
137 | return NULL; |
138 | } |
139 | } |
140 | Py_XDECREF(u); |
141 | return v; |
142 | } |
143 | |
144 | static PyObject * |
145 | decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) |
146 | { |
147 | const char *first_invalid_escape; |
148 | PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); |
149 | if (result == NULL) { Branch (149:9): [True: 2, False: 2.69k]
|
150 | return NULL; |
151 | } |
152 | |
153 | if (first_invalid_escape != NULL) { Branch (153:9): [True: 366, False: 2.32k]
|
154 | if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { Branch (154:13): [True: 2, False: 364]
|
155 | Py_DECREF(result); |
156 | return NULL; |
157 | } |
158 | } |
159 | return result; |
160 | } |
161 | |
162 | /* s must include the bracketing quote characters, and r, b, u, |
163 | &/or f prefixes (if any), and embedded escape sequences (if any). |
164 | _PyPegen_parsestr parses it, and sets *result to decoded Python string object. |
165 | If the string is an f-string, set *fstr and *fstrlen to the unparsed |
166 | string object. Return 0 if no errors occurred. */ |
167 | int |
168 | _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result, |
169 | const char **fstr, Py_ssize_t *fstrlen, Token *t) |
170 | { |
171 | const char *s = PyBytes_AsString(t->bytes); |
172 | if (s == NULL) { Branch (172:9): [True: 0, False: 205k]
|
173 | return -1; |
174 | } |
175 | |
176 | size_t len; |
177 | int quote = Py_CHARMASK(*s); |
178 | int fmode = 0; |
179 | *bytesmode = 0; |
180 | *rawmode = 0; |
181 | *result = NULL; |
182 | *fstr = NULL; |
183 | if (Py_ISALPHA(quote)) { |
184 | while (!*bytesmode || !*rawmode4.43k ) { Branch (184:16): [True: 15.0k, False: 4.43k]
Branch (184:31): [True: 4.38k, False: 51]
|
185 | if (quote == 'b' || quote == 'B'15.0k ) { Branch (185:17): [True: 4.39k, False: 15.0k]
Branch (185:33): [True: 0, False: 15.0k]
|
186 | quote =(unsigned char)*++s; |
187 | *bytesmode = 1; |
188 | } |
189 | else if (quote == 'u' || quote == 'U'14.9k ) { Branch (189:22): [True: 27, False: 14.9k]
Branch (189:38): [True: 1, False: 14.9k]
|
190 | quote = (unsigned char)*++s; |
191 | } |
192 | else if (quote == 'r' || quote == 'R'14.0k ) { Branch (192:22): [True: 910, False: 14.0k]
Branch (192:38): [True: 0, False: 14.0k]
|
193 | quote = (unsigned char)*++s; |
194 | *rawmode = 1; |
195 | } |
196 | else if (quote == 'f' || quote == 'F'9.64k ) { Branch (196:22): [True: 4.43k, False: 9.64k]
Branch (196:38): [True: 0, False: 9.64k]
|
197 | quote = (unsigned char)*++s; |
198 | fmode = 1; |
199 | } |
200 | else { |
201 | break; |
202 | } |
203 | } |
204 | } |
205 | |
206 | /* fstrings are only allowed in Python 3.6 and greater */ |
207 | if (fmode && p->feature_version < 64.43k ) { Branch (207:9): [True: 4.43k, False: 201k]
Branch (207:18): [True: 2, False: 4.43k]
|
208 | p->error_indicator = 1; |
209 | RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater"); |
210 | return -1; |
211 | } |
212 | |
213 | if (fmode && *bytesmode4.43k ) { Branch (213:9): [True: 4.43k, False: 201k]
Branch (213:18): [True: 0, False: 4.43k]
|
214 | PyErr_BadInternalCall(); |
215 | return -1; |
216 | } |
217 | if (quote != '\'' && quote != '\"'98.6k ) { Branch (217:9): [True: 98.6k, False: 107k]
Branch (217:26): [True: 0, False: 98.6k]
|
218 | PyErr_BadInternalCall(); |
219 | return -1; |
220 | } |
221 | /* Skip the leading quote char. */ |
222 | s++; |
223 | len = strlen(s); |
224 | if (len > INT_MAX) { Branch (224:9): [True: 0, False: 205k]
|
225 | PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); |
226 | return -1; |
227 | } |
228 | if (s[--len] != quote) { Branch (228:9): [True: 0, False: 205k]
|
229 | /* Last quote char must match the first. */ |
230 | PyErr_BadInternalCall(); |
231 | return -1; |
232 | } |
233 | if (len >= 4 && s[0] == quote111k && s[1] == quote8.36k ) { Branch (233:9): [True: 111k, False: 94.7k]
Branch (233:21): [True: 8.36k, False: 102k]
Branch (233:38): [True: 8.36k, False: 0]
|
234 | /* A triple quoted string. We've already skipped one quote at |
235 | the start and one at the end of the string. Now skip the |
236 | two at the start. */ |
237 | s += 2; |
238 | len -= 2; |
239 | /* And check that the last two match. */ |
240 | if (s[--len] != quote || s[--len] != quote) { Branch (240:13): [True: 0, False: 8.36k]
Branch (240:34): [True: 0, False: 8.36k]
|
241 | PyErr_BadInternalCall(); |
242 | return -1; |
243 | } |
244 | } |
245 | |
246 | if (fmode) { Branch (246:9): [True: 4.43k, False: 201k]
|
247 | /* Just return the bytes. The caller will parse the resulting |
248 | string. */ |
249 | *fstr = s; |
250 | *fstrlen = len; |
251 | return 0; |
252 | } |
253 | |
254 | /* Not an f-string. */ |
255 | /* Avoid invoking escape decoding routines if possible. */ |
256 | *rawmode = *rawmode || strchr(s, '\\') == NULL200k ; Branch (256:16): [True: 892, False: 200k]
Branch (256:28): [True: 184k, False: 16.1k]
|
257 | if (*bytesmode) { Branch (257:9): [True: 4.39k, False: 196k]
|
258 | /* Disallow non-ASCII characters. */ |
259 | const char *ch; |
260 | for (ch = s; *ch; ch++53.1k ) { Branch (260:22): [True: 53.2k, False: 4.25k]
|
261 | if (Py_CHARMASK(*ch) >= 0x80) { Branch (261:17): [True: 138, False: 53.1k]
|
262 | RAISE_SYNTAX_ERROR( |
263 | "bytes can only contain ASCII " |
264 | "literal characters"); |
265 | return -1; |
266 | } |
267 | } |
268 | if (*rawmode) { Branch (268:13): [True: 1.56k, False: 2.69k]
|
269 | *result = PyBytes_FromStringAndSize(s, len); |
270 | } |
271 | else { |
272 | *result = decode_bytes_with_escapes(p, s, len, t); |
273 | } |
274 | } |
275 | else { |
276 | if (*rawmode) { Branch (276:13): [True: 183k, False: 13.4k]
|
277 | *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); |
278 | } |
279 | else { |
280 | *result = decode_unicode_with_escapes(p, s, len, t); |
281 | } |
282 | } |
283 | return *result == NULL ? -137 : 0201k ; Branch (283:12): [True: 37, False: 201k]
|
284 | } |
285 | |
286 | |
287 | |
288 | // FSTRING STUFF |
289 | |
290 | /* Fix locations for the given node and its children. |
291 | |
292 | `parent` is the enclosing node. |
293 | `expr_start` is the starting position of the expression (pointing to the open brace). |
294 | `n` is the node which locations are going to be fixed relative to parent. |
295 | `expr_str` is the child node's string representation, including braces. |
296 | */ |
297 | static bool |
298 | fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols) |
299 | { |
300 | *p_lines = 0; |
301 | *p_cols = 0; |
302 | assert(expr_start != NULL && *expr_start == '{'); |
303 | if (parent && parent->bytes) { Branch (303:9): [True: 73.0k, False: 0]
Branch (303:19): [True: 73.0k, False: 0]
|
304 | const char *parent_str = PyBytes_AsString(parent->bytes); |
305 | if (!parent_str) { Branch (305:13): [True: 0, False: 73.0k]
|
306 | return false; |
307 | } |
308 | // The following is needed, in order to correctly shift the column |
309 | // offset, in the case that (disregarding any whitespace) a newline |
310 | // immediately follows the opening curly brace of the fstring expression. |
311 | bool newline_after_brace = 1; |
312 | const char *start = expr_start + 1; |
313 | while (start && *start != '}' && *start != '\n') { Branch (313:16): [True: 73.0k, False: 0]
Branch (313:25): [True: 73.0k, False: 0]
Branch (313:42): [True: 73.0k, False: 6]
|
314 | if (*start != ' ' && *start != '\t'73.0k && *start != '\f'73.0k ) { Branch (314:17): [True: 73.0k, False: 22]
Branch (314:34): [True: 73.0k, False: 0]
Branch (314:52): [True: 73.0k, False: 0]
|
315 | newline_after_brace = 0; |
316 | break; |
317 | } |
318 | start++; |
319 | } |
320 | |
321 | // Account for the characters from the last newline character to our |
322 | // left until the beginning of expr_start. |
323 | if (!newline_after_brace) { Branch (323:13): [True: 73.0k, False: 6]
|
324 | start = expr_start; |
325 | while (start > parent_str && *start != '\n'35.5M ) { Branch (325:20): [True: 35.5M, False: 72.9k]
Branch (325:42): [True: 35.5M, False: 64]
|
326 | start--; |
327 | } |
328 | *p_cols += (int)(expr_start - start); |
329 | } |
330 | /* adjust the start based on the number of newlines encountered |
331 | before the f-string expression */ |
332 | for (const char *p = parent_str; p < expr_start; p++35.5M ) { Branch (332:42): [True: 35.5M, False: 73.0k]
|
333 | if (*p == '\n') { Branch (333:17): [True: 140, False: 35.5M]
|
334 | (*p_lines)++; |
335 | } |
336 | } |
337 | } |
338 | return true; |
339 | } |
340 | |
341 | |
342 | /* Compile this expression in to an expr_ty. Add parens around the |
343 | expression, in order to allow leading spaces in the expression. */ |
344 | static expr_ty |
345 | fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end, |
346 | Token *t) |
347 | { |
348 | expr_ty expr = NULL; |
349 | char *str; |
350 | Py_ssize_t len; |
351 | const char *s; |
352 | expr_ty result = NULL; |
353 | |
354 | assert(expr_end >= expr_start); |
355 | assert(*(expr_start-1) == '{'); |
356 | assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' || |
357 | *expr_end == '='); |
358 | |
359 | /* If the substring is all whitespace, it's an error. We need to catch this |
360 | here, and not when we call PyParser_SimpleParseStringFlagsFilename, |
361 | because turning the expression '' in to '()' would go from being invalid |
362 | to valid. */ |
363 | for (s = expr_start; s != expr_end; s++101 ) { Branch (363:26): [True: 73.1k, False: 28]
|
364 | char c = *s; |
365 | /* The Python parser ignores only the following whitespace |
366 | characters (\r already is converted to \n). */ |
367 | if (!(c == ' ' || c == '\t'73.0k || c == '\n'73.0k || c == '\f'73.0k )) { Branch (367:15): [True: 83, False: 73.0k]
Branch (367:27): [True: 4, False: 73.0k]
Branch (367:40): [True: 10, False: 73.0k]
Branch (367:53): [True: 4, False: 73.0k]
|
368 | break; |
369 | } |
370 | } |
371 | |
372 | if (s == expr_end) { Branch (372:9): [True: 28, False: 73.0k]
|
373 | if (*expr_end == '!' || *expr_end == ':'16 || *expr_end == '='11 ) { Branch (373:13): [True: 12, False: 16]
Branch (373:33): [True: 5, False: 11]
Branch (373:53): [True: 6, False: 5]
|
374 | RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end); |
375 | return NULL; |
376 | } |
377 | RAISE_SYNTAX_ERROR("f-string: empty expression not allowed"); |
378 | return NULL; |
379 | } |
380 | |
381 | len = expr_end - expr_start; |
382 | /* Allocate 3 extra bytes: open paren, close paren, null byte. */ |
383 | str = PyMem_Calloc(len + 3, sizeof(char)); |
384 | if (str == NULL) { Branch (384:9): [True: 0, False: 73.0k]
|
385 | PyErr_NoMemory(); |
386 | return NULL; |
387 | } |
388 | |
389 | // The call to fstring_find_expr_location is responsible for finding the column offset |
390 | // the generated AST nodes need to be shifted to the right, which is equal to the number |
391 | // of the f-string characters before the expression starts. |
392 | memcpy(str+1, expr_start, len); |
393 | int lines, cols; |
394 | if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) { Branch (394:9): [True: 0, False: 73.0k]
|
395 | PyMem_Free(str); |
396 | return NULL; |
397 | } |
398 | |
399 | // The parentheses are needed in order to allow for leading whitespace within |
400 | // the f-string expression. This consequently gets parsed as a group (see the |
401 | // group rule in python.gram). |
402 | str[0] = '('; |
403 | str[len+1] = ')'; |
404 | |
405 | struct tok_state* tok = _PyTokenizer_FromString(str, 1); |
406 | if (tok == NULL) { Branch (406:9): [True: 0, False: 73.0k]
|
407 | PyMem_Free(str); |
408 | return NULL; |
409 | } |
410 | Py_INCREF(p->tok->filename); |
411 | |
412 | tok->filename = p->tok->filename; |
413 | tok->lineno = t->lineno + lines - 1; |
414 | |
415 | Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version, |
416 | NULL, p->arena); |
417 | |
418 | p2->starting_lineno = t->lineno + lines; |
419 | p2->starting_col_offset = t->col_offset + cols; |
420 | |
421 | expr = _PyPegen_run_parser(p2); |
422 | |
423 | if (expr == NULL) { Branch (423:9): [True: 15, False: 73.0k]
|
424 | goto exit; |
425 | } |
426 | result = expr; |
427 | |
428 | exit: |
429 | PyMem_Free(str); |
430 | _PyPegen_Parser_Free(p2); |
431 | _PyTokenizer_Free(tok); |
432 | return result; |
433 | } |
434 | |
435 | /* Return -1 on error. |
436 | |
437 | Return 0 if we reached the end of the literal. |
438 | |
439 | Return 1 if we haven't reached the end of the literal, but we want |
440 | the caller to process the literal up to this point. Used for |
441 | doubled braces. |
442 | */ |
443 | static int |
444 | fstring_find_literal(Parser *p, const char **str, const char *end, int raw, |
445 | PyObject **literal, int recurse_lvl, Token *t) |
446 | { |
447 | /* Get any literal string. It ends when we hit an un-doubled left |
448 | brace (which isn't part of a unicode name escape such as |
449 | "\N{EULER CONSTANT}"), or the end of the string. */ |
450 | |
451 | const char *s = *str; |
452 | const char *literal_start = s; |
453 | int result = 0; |
454 | |
455 | assert(*literal == NULL); |
456 | while (s < end) { Branch (456:12): [True: 199k, False: 4.31k]
|
457 | char ch = *s++; |
458 | if (!raw && ch == '\\'198k && s < end366 ) { Branch (458:13): [True: 198k, False: 1.06k]
Branch (458:21): [True: 366, False: 198k]
Branch (458:35): [True: 366, False: 0]
|
459 | ch = *s++; |
460 | if (ch == 'N') { Branch (460:17): [True: 16, False: 350]
|
461 | /* We need to look at and skip matching braces for "\N{name}" |
462 | sequences because otherwise we'll think the opening '{' |
463 | starts an expression, which is not the case with "\N". |
464 | Keep looking for either a matched '{' '}' pair, or the end |
465 | of the string. */ |
466 | |
467 | if (s < end && *s++ == '{'15 ) { Branch (467:21): [True: 15, False: 1]
Branch (467:32): [True: 13, False: 2]
|
468 | while (s < end && *s++ != '}'274 ) { Branch (468:28): [True: 274, False: 2]
Branch (468:39): [True: 263, False: 11]
|
469 | } |
470 | continue; |
471 | } |
472 | |
473 | /* This is an invalid "\N" sequence, since it's a "\N" not |
474 | followed by a "{". Just keep parsing this literal. This |
475 | error will be caught later by |
476 | decode_unicode_with_escapes(). */ |
477 | continue; |
478 | } |
479 | if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 01 ) { Branch (479:17): [True: 1, False: 349]
Branch (479:30): [True: 0, False: 1]
|
480 | return -1; |
481 | } |
482 | } |
483 | if (ch == '{' || ch == '}'126k ) { Branch (483:13): [True: 73.2k, False: 126k]
Branch (483:26): [True: 290, False: 125k]
|
484 | /* Check for doubled braces, but only at the top level. If |
485 | we checked at every level, then f'{0:{3}}' would fail |
486 | with the two closing braces. */ |
487 | if (recurse_lvl == 0) { Branch (487:17): [True: 73.2k, False: 281]
|
488 | if (s < end && *s == ch73.2k ) { Branch (488:21): [True: 73.2k, False: 10]
Branch (488:32): [True: 155, False: 73.0k]
|
489 | /* We're going to tell the caller that the literal ends |
490 | here, but that they should continue scanning. But also |
491 | skip over the second brace when we resume scanning. */ |
492 | *str = s + 1; |
493 | result = 1; |
494 | goto done; |
495 | } |
496 | |
497 | /* Where a single '{' is the start of a new expression, a |
498 | single '}' is not allowed. */ |
499 | if (ch == '}') { Branch (499:21): [True: 8, False: 73.0k]
|
500 | *str = s - 1; |
501 | RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed"); |
502 | return -1; |
503 | } |
504 | } |
505 | /* We're either at a '{', which means we're starting another |
506 | expression; or a '}', which means we're at the end of this |
507 | f-string (for a nested format_spec). */ |
508 | s--; |
509 | break; |
510 | } |
511 | } |
512 | *str = s; |
513 | assert(s <= end); |
514 | assert(s == end || *s == '{' || *s == '}'); |
515 | done: |
516 | if (literal_start != s) { Branch (516:9): [True: 72.8k, False: 5.00k]
|
517 | if (raw) { Branch (517:13): [True: 35, False: 72.7k]
|
518 | *literal = PyUnicode_DecodeUTF8Stateful(literal_start, |
519 | s - literal_start, |
520 | NULL, NULL); |
521 | } |
522 | else { |
523 | *literal = decode_unicode_with_escapes(p, literal_start, |
524 | s - literal_start, t); |
525 | } |
526 | if (!*literal) { Branch (526:13): [True: 5, False: 72.7k]
|
527 | return -1; |
528 | } |
529 | } |
530 | return result; |
531 | } |
532 | |
533 | /* Forward declaration because parsing is recursive. */ |
534 | static expr_ty |
535 | fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl, |
536 | Token *first_token, Token* t, Token *last_token); |
537 | |
538 | /* Parse the f-string at *str, ending at end. We know *str starts an |
539 | expression (so it must be a '{'). Returns the FormattedValue node, which |
540 | includes the expression, conversion character, format_spec expression, and |
541 | optionally the text of the expression (if = is used). |
542 | |
543 | Note that I don't do a perfect job here: I don't make sure that a |
544 | closing brace doesn't match an opening paren, for example. It |
545 | doesn't need to error on all invalid expressions, just correctly |
546 | find the end of all valid ones. Any errors inside the expression |
547 | will be caught when we parse it later. |
548 | |
549 | *expression is set to the expression. For an '=' "debug" expression, |
550 | *expr_text is set to the debug text (the original text of the expression, |
551 | including the '=' and any whitespace around it, as a string object). If |
552 | not a debug expression, *expr_text set to NULL. */ |
553 | static int |
554 | fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl, |
555 | PyObject **expr_text, expr_ty *expression, Token *first_token, |
556 | Token *t, Token *last_token) |
557 | { |
558 | /* Return -1 on error, else 0. */ |
559 | |
560 | const char *expr_start; |
561 | const char *expr_end; |
562 | expr_ty simple_expression; |
563 | expr_ty format_spec = NULL; /* Optional format specifier. */ |
564 | int conversion = -1; /* The conversion char. Use default if not |
565 | specified, or !r if using = and no format |
566 | spec. */ |
567 | |
568 | /* 0 if we're not in a string, else the quote char we're trying to |
569 | match (single or double quote). */ |
570 | char quote_char = 0; |
571 | |
572 | /* If we're inside a string, 1=normal, 3=triple-quoted. */ |
573 | int string_type = 0; |
574 | |
575 | /* Keep track of nesting level for braces/parens/brackets in |
576 | expressions. */ |
577 | Py_ssize_t nested_depth = 0; |
578 | char parenstack[MAXLEVEL]; |
579 | |
580 | *expr_text = NULL; |
581 | |
582 | /* Can only nest one level deep. */ |
583 | if (recurse_lvl >= 2) { Branch (583:9): [True: 1, False: 73.1k]
|
584 | RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply"); |
585 | goto error; |
586 | } |
587 | |
588 | /* The first char must be a left brace, or we wouldn't have gotten |
589 | here. Skip over it. */ |
590 | assert(**str == '{'); |
591 | *str += 1; |
592 | |
593 | expr_start = *str; |
594 | for (; *str < end; (*str)++102k ) { Branch (594:12): [True: 175k, False: 15]
|
595 | char ch; |
596 | |
597 | /* Loop invariants. */ |
598 | assert(nested_depth >= 0); |
599 | assert(*str >= expr_start && *str < end); |
600 | if (quote_char) { Branch (600:13): [True: 1.03k, False: 174k]
|
601 | assert(string_type == 1 || string_type == 3); |
602 | } else { |
603 | assert(string_type == 0); |
604 | } |
605 | |
606 | ch = **str; |
607 | /* Nowhere inside an expression is a backslash allowed. */ |
608 | if (ch == '\\') { Branch (608:13): [True: 8, False: 175k]
|
609 | /* Error: can't include a backslash character, inside |
610 | parens or strings or not. */ |
611 | RAISE_SYNTAX_ERROR( |
612 | "f-string expression part " |
613 | "cannot include a backslash"); |
614 | goto error; |
615 | } |
616 | if (quote_char) { Branch (616:13): [True: 1.03k, False: 174k]
|
617 | /* We're inside a string. See if we're at the end. */ |
618 | /* This code needs to implement the same non-error logic |
619 | as tok_get from tokenizer.c, at the letter_quote |
620 | label. To actually share that code would be a |
621 | nightmare. But, it's unlikely to change and is small, |
622 | so duplicate it here. Note we don't need to catch all |
623 | of the errors, since they'll be caught when parsing the |
624 | expression. We just need to match the non-error |
625 | cases. Thus we can ignore \n in single-quoted strings, |
626 | for example. Or non-terminated strings. */ |
627 | if (ch == quote_char) { Branch (627:17): [True: 236, False: 797]
|
628 | /* Does this match the string_type (single or triple |
629 | quoted)? */ |
630 | if (string_type == 3) { Branch (630:21): [True: 42, False: 194]
|
631 | if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch25 ) { Branch (631:25): [True: 42, False: 0]
Branch (631:41): [True: 25, False: 17]
Branch (631:60): [True: 25, False: 0]
|
632 | /* We're at the end of a triple quoted string. */ |
633 | *str += 2; |
634 | string_type = 0; |
635 | quote_char = 0; |
636 | continue; |
637 | } |
638 | } else { |
639 | /* We're at the end of a normal string. */ |
640 | quote_char = 0; |
641 | string_type = 0; |
642 | continue; |
643 | } |
644 | } |
645 | } else if (ch == '\'' || ch == '"'174k ) { Branch (645:20): [True: 102, False: 174k]
Branch (645:34): [True: 122, False: 174k]
|
646 | /* Is this a triple quoted string? */ |
647 | if (*str+2 < end && *(*str+1) == ch222 && *(*str+2) == ch55 ) { Branch (647:17): [True: 222, False: 2]
Branch (647:33): [True: 55, False: 167]
Branch (647:52): [True: 25, False: 30]
|
648 | string_type = 3; |
649 | *str += 2; |
650 | } else { |
651 | /* Start of a normal string. */ |
652 | string_type = 1; |
653 | } |
654 | /* Start looking for the end of the string. */ |
655 | quote_char = ch; |
656 | } else if (ch == '[' || ch == '{'174k || ch == '('174k ) { Branch (656:20): [True: 80, False: 174k]
Branch (656:33): [True: 20, False: 174k]
Branch (656:46): [True: 475, False: 174k]
|
657 | if (nested_depth >= MAXLEVEL) { Branch (657:17): [True: 1, False: 574]
|
658 | RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis"); |
659 | goto error; |
660 | } |
661 | parenstack[nested_depth] = ch; |
662 | nested_depth++; |
663 | } else if (ch == '#') { Branch (663:20): [True: 3, False: 174k]
|
664 | /* Error: can't include a comment character, inside parens |
665 | or not. */ |
666 | RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'"); |
667 | goto error; |
668 | } else if (nested_depth == 0 && Branch (668:20): [True: 171k, False: 2.33k]
|
669 | (171k ch == '!'171k || ch == ':'170k || ch == '}'170k || Branch (669:21): [True: 1.51k, False: 170k]
Branch (669:34): [True: 148, False: 170k]
Branch (669:47): [True: 71.3k, False: 98.8k]
|
670 | ch == '='98.8k || ch == '>'98.7k || ch == '<'98.6k )) { Branch (670:21): [True: 99, False: 98.7k]
Branch (670:34): [True: 15, False: 98.6k]
Branch (670:47): [True: 3, False: 98.6k]
|
671 | /* See if there's a next character. */ |
672 | if (*str+1 < end) { Branch (672:17): [True: 71.3k, False: 1.76k]
|
673 | char next = *(*str+1); |
674 | |
675 | /* For "!=". since '=' is not an allowed conversion character, |
676 | nothing is lost in this test. */ |
677 | if ((ch == '!' && next == '='1.51k ) || /* != */ Branch (677:22): [True: 1.51k, False: 69.8k]
Branch (677:35): [True: 12, False: 1.49k]
|
678 | (71.3k ch == '='71.3k && next == '='96 ) || /* == */ Branch (678:22): [True: 96, False: 71.2k]
Branch (678:35): [True: 4, False: 92]
|
679 | (71.3k ch == '<'71.3k && next == '='2 ) || /* <= */ Branch (679:22): [True: 2, False: 71.3k]
Branch (679:35): [True: 2, False: 0]
|
680 | (71.3k ch == '>'71.3k && next == '='14 ) /* >= */ Branch (680:22): [True: 14, False: 71.3k]
Branch (680:35): [True: 2, False: 12]
|
681 | ) { |
682 | *str += 1; |
683 | continue; |
684 | } |
685 | } |
686 | /* Don't get out of the loop for these, if they're single |
687 | chars (not part of 2-char tokens). If by themselves, they |
688 | don't end an expression (unlike say '!'). */ |
689 | if (ch == '>' || ch == '<'73.0k ) { Branch (689:17): [True: 13, False: 73.0k]
Branch (689:30): [True: 1, False: 73.0k]
|
690 | continue; |
691 | } |
692 | |
693 | /* Normal way out of this loop. */ |
694 | break; |
695 | } else if (ch == ']' || ch == '}'100k || ch == ')'100k ) { Branch (695:20): [True: 79, False: 100k]
Branch (695:33): [True: 23, False: 100k]
Branch (695:46): [True: 270, False: 100k]
|
696 | if (!nested_depth) { Branch (696:17): [True: 2, False: 370]
|
697 | RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch); |
698 | goto error; |
699 | } |
700 | nested_depth--; |
701 | int opening = (unsigned char)parenstack[nested_depth]; |
702 | if (!((opening == '(' && ch == ')'270 ) || Branch (702:20): [True: 270, False: 100]
Branch (702:38): [True: 267, False: 3]
|
703 | (103 opening == '['103 && ch == ']'80 ) || Branch (703:20): [True: 80, False: 23]
Branch (703:38): [True: 78, False: 2]
|
704 | (25 opening == '{'25 && ch == '}'20 ))) Branch (704:20): [True: 20, False: 5]
Branch (704:38): [True: 20, False: 0]
|
705 | { |
706 | RAISE_SYNTAX_ERROR( |
707 | "f-string: closing parenthesis '%c' " |
708 | "does not match opening parenthesis '%c'", |
709 | ch, opening); |
710 | goto error; |
711 | } |
712 | } else { |
713 | /* Just consume this char and loop around. */ |
714 | } |
715 | } |
716 | expr_end = *str; |
717 | /* If we leave the above loop in a string or with mismatched parens, we |
718 | don't really care. We'll get a syntax error when compiling the |
719 | expression. But, we can produce a better error message, so let's just |
720 | do that.*/ |
721 | if (quote_char) { Branch (721:9): [True: 4, False: 73.1k]
|
722 | RAISE_SYNTAX_ERROR("f-string: unterminated string"); |
723 | goto error; |
724 | } |
725 | if (nested_depth) { Branch (725:9): [True: 0, False: 73.1k]
|
726 | int opening = (unsigned char)parenstack[nested_depth - 1]; |
727 | RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening); |
728 | goto error; |
729 | } |
730 | |
731 | if (*str >= end) { Branch (731:9): [True: 11, False: 73.0k]
|
732 | goto unexpected_end_of_string; |
733 | } |
734 | |
735 | /* Compile the expression as soon as possible, so we show errors |
736 | related to the expression before errors related to the |
737 | conversion or format_spec. */ |
738 | simple_expression = fstring_compile_expr(p, expr_start, expr_end, t); |
739 | if (!simple_expression) { Branch (739:9): [True: 43, False: 73.0k]
|
740 | goto error; |
741 | } |
742 | |
743 | /* Check for =, which puts the text value of the expression in |
744 | expr_text. */ |
745 | if (**str == '=') { Branch (745:9): [True: 88, False: 72.9k]
|
746 | if (p->feature_version < 8) { Branch (746:13): [True: 1, False: 87]
|
747 | RAISE_SYNTAX_ERROR("f-string: self documenting expressions are " |
748 | "only supported in Python 3.8 and greater"); |
749 | goto error; |
750 | } |
751 | *str += 1; |
752 | |
753 | /* Skip over ASCII whitespace. No need to test for end of string |
754 | here, since we know there's at least a trailing quote somewhere |
755 | ahead. */ |
756 | while (Py_ISSPACE(**str)) { |
757 | *str += 1; |
758 | } |
759 | if (*str >= end) { Branch (759:13): [True: 1, False: 86]
|
760 | goto unexpected_end_of_string; |
761 | } |
762 | /* Set *expr_text to the text of the expression. */ |
763 | *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start); |
764 | if (!*expr_text) { Branch (764:13): [True: 0, False: 86]
|
765 | goto error; |
766 | } |
767 | } |
768 | |
769 | /* Check for a conversion char, if present. */ |
770 | if (**str == '!') { Branch (770:9): [True: 1.53k, False: 71.5k]
|
771 | *str += 1; |
772 | const char *conv_start = *str; |
773 | while (1) { Branch (773:16): [Folded - Ignored]
|
774 | if (*str >= end) { Branch (774:17): [True: 5, False: 3.07k]
|
775 | goto unexpected_end_of_string; |
776 | } |
777 | if (**str == '}' || **str == ':'1.59k ) { Branch (777:17): [True: 1.47k, False: 1.59k]
Branch (777:33): [True: 52, False: 1.54k]
|
778 | break; |
779 | } |
780 | *str += 1; |
781 | } |
782 | if (*str == conv_start) { Branch (782:13): [True: 3, False: 1.52k]
|
783 | RAISE_SYNTAX_ERROR( |
784 | "f-string: missed conversion character"); |
785 | goto error; |
786 | } |
787 | |
788 | conversion = (unsigned char)*conv_start; |
789 | /* Validate the conversion. */ |
790 | if ((*str != conv_start + 1) || Branch (790:13): [True: 11, False: 1.51k]
|
791 | !(1.51k conversion == 's'1.51k || conversion == 'r'1.48k || conversion == 'a'39 )) Branch (791:15): [True: 35, False: 1.48k]
Branch (791:36): [True: 1.44k, False: 39]
Branch (791:57): [True: 34, False: 5]
|
792 | { |
793 | PyObject *conv_obj = PyUnicode_FromStringAndSize(conv_start, |
794 | *str-conv_start); |
795 | if (conv_obj) { Branch (795:17): [True: 16, False: 0]
|
796 | RAISE_SYNTAX_ERROR( |
797 | "f-string: invalid conversion character %R: " |
798 | "expected 's', 'r', or 'a'", |
799 | conv_obj); |
800 | Py_DECREF(conv_obj); |
801 | } |
802 | goto error; |
803 | } |
804 | |
805 | } |
806 | |
807 | /* Check for the format spec, if present. */ |
808 | assert(*str < end); |
809 | if (**str == ':') { Branch (809:9): [True: 214, False: 72.8k]
|
810 | *str += 1; |
811 | if (*str >= end) { Branch (811:13): [True: 2, False: 212]
|
812 | goto unexpected_end_of_string; |
813 | } |
814 | |
815 | /* Parse the format spec. */ |
816 | format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1, |
817 | first_token, t, last_token); |
818 | if (!format_spec) { Branch (818:13): [True: 7, False: 205]
|
819 | goto error; |
820 | } |
821 | } |
822 | |
823 | if (*str >= end || **str != '}') { Branch (823:9): [True: 0, False: 73.0k]
Branch (823:24): [True: 1, False: 73.0k]
|
824 | goto unexpected_end_of_string; |
825 | } |
826 | |
827 | /* We're at a right brace. Consume it. */ |
828 | assert(*str < end); |
829 | assert(**str == '}'); |
830 | *str += 1; |
831 | |
832 | /* If we're in = mode (detected by non-NULL expr_text), and have no format |
833 | spec and no explicit conversion, set the conversion to 'r'. */ |
834 | if (*expr_text && format_spec == NULL85 && conversion == -147 ) { Branch (834:9): [True: 85, False: 72.9k]
Branch (834:23): [True: 47, False: 38]
Branch (834:46): [True: 16, False: 31]
|
835 | conversion = 'r'; |
836 | } |
837 | |
838 | /* And now create the FormattedValue node that represents this |
839 | entire expression with the conversion and format spec. */ |
840 | //TODO: Fix this |
841 | *expression = _PyAST_FormattedValue(simple_expression, conversion, |
842 | format_spec, first_token->lineno, |
843 | first_token->col_offset, |
844 | last_token->end_lineno, |
845 | last_token->end_col_offset, p->arena); |
846 | if (!*expression) { Branch (846:9): [True: 0, False: 73.0k]
|
847 | goto error; |
848 | } |
849 | |
850 | return 0; |
851 | |
852 | unexpected_end_of_string: |
853 | RAISE_SYNTAX_ERROR("f-string: expecting '}'"); |
854 | /* Falls through to error. */ |
855 | |
856 | error: |
857 | Py_XDECREF(*expr_text); |
858 | return -1; |
859 | |
860 | } |
861 | |
862 | /* Return -1 on error. |
863 | |
864 | Return 0 if we have a literal (possible zero length) and an |
865 | expression (zero length if at the end of the string. |
866 | |
867 | Return 1 if we have a literal, but no expression, and we want the |
868 | caller to call us again. This is used to deal with doubled |
869 | braces. |
870 | |
871 | When called multiple times on the string 'a{{b{0}c', this function |
872 | will return: |
873 | |
874 | 1. the literal 'a{' with no expression, and a return value |
875 | of 1. Despite the fact that there's no expression, the return |
876 | value of 1 means we're not finished yet. |
877 | |
878 | 2. the literal 'b' and the expression '0', with a return value of |
879 | 0. The fact that there's an expression means we're not finished. |
880 | |
881 | 3. literal 'c' with no expression and a return value of 0. The |
882 | combination of the return value of 0 with no expression means |
883 | we're finished. |
884 | */ |
885 | static int |
886 | fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw, |
887 | int recurse_lvl, PyObject **literal, |
888 | PyObject **expr_text, expr_ty *expression, |
889 | Token *first_token, Token *t, Token *last_token) |
890 | { |
891 | int result; |
892 | |
893 | assert(*literal == NULL && *expression == NULL); |
894 | |
895 | /* Get any literal string. */ |
896 | result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t); |
897 | if (result < 0) { Branch (897:9): [True: 13, False: 77.8k]
|
898 | goto error; |
899 | } |
900 | |
901 | assert(result == 0 || result == 1); |
902 | |
903 | if (result == 1) { Branch (903:9): [True: 155, False: 77.6k]
|
904 | /* We have a literal, but don't look at the expression. */ |
905 | return 1; |
906 | } |
907 | |
908 | if (*str >= end || **str == '}'73.3k ) { Branch (908:9): [True: 4.31k, False: 73.3k]
Branch (908:24): [True: 205, False: 73.1k]
|
909 | /* We're at the end of the string or the end of a nested |
910 | f-string: no expression. The top-level error case where we |
911 | expect to be at the end of the string but we're at a '}' is |
912 | handled later. */ |
913 | return 0; |
914 | } |
915 | |
916 | /* We must now be the start of an expression, on a '{'. */ |
917 | assert(**str == '{'); |
918 | |
919 | if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text, Branch (919:9): [True: 114, False: 73.0k]
|
920 | expression, first_token, t, last_token) < 0) { |
921 | goto error; |
922 | } |
923 | |
924 | return 0; |
925 | |
926 | error: |
927 | Py_CLEAR(*literal); |
928 | return -1; |
929 | } |
930 | |
931 | #ifdef NDEBUG |
932 | #define ExprList_check_invariants(l) |
933 | #else |
934 | static void |
935 | ExprList_check_invariants(ExprList *l) |
936 | { |
937 | /* Check our invariants. Make sure this object is "live", and |
938 | hasn't been deallocated. */ |
939 | assert(l->size >= 0); |
940 | assert(l->p != NULL); |
941 | if (l->size <= EXPRLIST_N_CACHED) { |
942 | assert(l->data == l->p); |
943 | } |
944 | } |
945 | #endif |
946 | |
947 | static void |
948 | ExprList_Init(ExprList *l) |
949 | { |
950 | l->allocated = EXPRLIST_N_CACHED; |
951 | l->size = 0; |
952 | |
953 | /* Until we start allocating dynamically, p points to data. */ |
954 | l->p = l->data; |
955 | |
956 | ExprList_check_invariants(l); |
957 | } |
958 | |
959 | static int |
960 | ExprList_Append(ExprList *l, expr_ty exp) |
961 | { |
962 | ExprList_check_invariants(l); |
963 | if (l->size >= l->allocated) { Branch (963:9): [True: 58, False: 146k]
|
964 | /* We need to alloc (or realloc) the memory. */ |
965 | Py_ssize_t new_size = l->allocated * 2; |
966 | |
967 | /* See if we've ever allocated anything dynamically. */ |
968 | if (l->p == l->data) { Branch (968:13): [True: 15, False: 43]
|
969 | Py_ssize_t i; |
970 | /* We're still using the cached data. Switch to |
971 | alloc-ing. */ |
972 | l->p = PyMem_Malloc(sizeof(expr_ty) * new_size); |
973 | if (!l->p) { Branch (973:17): [True: 0, False: 15]
|
974 | return -1; |
975 | } |
976 | /* Copy the cached data into the new buffer. */ |
977 | for (i = 0; 15 i < l->size; i++960 ) { Branch (977:25): [True: 960, False: 15]
|
978 | l->p[i] = l->data[i]; |
979 | } |
980 | } else { |
981 | /* Just realloc. */ |
982 | expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size); |
983 | if (!tmp) { Branch (983:17): [True: 0, False: 43]
|
984 | PyMem_Free(l->p); |
985 | l->p = NULL; |
986 | return -1; |
987 | } |
988 | l->p = tmp; |
989 | } |
990 | |
991 | l->allocated = new_size; |
992 | assert(l->allocated == 2 * l->size); |
993 | } |
994 | |
995 | l->p[l->size++] = exp; |
996 | |
997 | ExprList_check_invariants(l); |
998 | return 0; |
999 | } |
1000 | |
1001 | static void |
1002 | ExprList_Dealloc(ExprList *l) |
1003 | { |
1004 | ExprList_check_invariants(l); |
1005 | |
1006 | /* If there's been an error, or we've never dynamically allocated, |
1007 | do nothing. */ |
1008 | if (!l->p || l->p == l->data) { Branch (1008:9): [True: 0, False: 3.32k]
Branch (1008:18): [True: 3.30k, False: 15]
|
1009 | /* Do nothing. */ |
1010 | } else { |
1011 | /* We have dynamically allocated. Free the memory. */ |
1012 | PyMem_Free(l->p); |
1013 | } |
1014 | l->p = NULL; |
1015 | l->size = -1; |
1016 | } |
1017 | |
1018 | static asdl_expr_seq * |
1019 | ExprList_Finish(ExprList *l, PyArena *arena) |
1020 | { |
1021 | asdl_expr_seq *seq; |
1022 | |
1023 | ExprList_check_invariants(l); |
1024 | |
1025 | /* Allocate the asdl_seq and copy the expressions in to it. */ |
1026 | seq = _Py_asdl_expr_seq_new(l->size, arena); |
1027 | if (seq) { Branch (1027:9): [True: 3.01k, False: 0]
|
1028 | Py_ssize_t i; |
1029 | for (i = 0; i < l->size; i++146k ) { Branch (1029:21): [True: 146k, False: 3.01k]
|
1030 | asdl_seq_SET(seq, i, l->p[i]); |
1031 | } |
1032 | } |
1033 | ExprList_Dealloc(l); |
1034 | return seq; |
1035 | } |
1036 | |
1037 | #ifdef NDEBUG |
1038 | #define FstringParser_check_invariants(state) |
1039 | #else |
1040 | static void |
1041 | FstringParser_check_invariants(FstringParser *state) |
1042 | { |
1043 | if (state->last_str) { |
1044 | assert(PyUnicode_CheckExact(state->last_str)); |
1045 | } |
1046 | ExprList_check_invariants(&state->expr_list); |
1047 | } |
1048 | #endif |
1049 | |
1050 | void |
1051 | _PyPegen_FstringParser_Init(FstringParser *state) |
1052 | { |
1053 | state->last_str = NULL; |
1054 | state->fmode = 0; |
1055 | ExprList_Init(&state->expr_list); |
1056 | FstringParser_check_invariants(state); |
1057 | } |
1058 | |
1059 | void |
1060 | _PyPegen_FstringParser_Dealloc(FstringParser *state) |
1061 | { |
1062 | FstringParser_check_invariants(state); |
1063 | |
1064 | Py_XDECREF(state->last_str); |
1065 | ExprList_Dealloc(&state->expr_list); |
1066 | } |
1067 | |
1068 | /* Make a Constant node, but decref the PyUnicode object being added. */ |
1069 | static expr_ty |
1070 | make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token) |
1071 | { |
1072 | PyObject *s = *str; |
1073 | PyObject *kind = NULL; |
1074 | *str = NULL; |
1075 | assert(PyUnicode_CheckExact(s)); |
1076 | if (_PyArena_AddPyObject(p->arena, s) < 0) { Branch (1076:9): [True: 0, False: 248k]
|
1077 | Py_DECREF(s); |
1078 | return NULL; |
1079 | } |
1080 | const char* the_str = PyBytes_AsString(first_token->bytes); |
1081 | if (the_str && the_str[0] == 'u') { Branch (1081:9): [True: 248k, False: 0]
Branch (1081:20): [True: 27, False: 248k]
|
1082 | kind = _PyPegen_new_identifier(p, "u"); |
1083 | } |
1084 | |
1085 | if (kind == NULL && PyErr_Occurred()248k ) { Branch (1085:9): [True: 248k, False: 27]
Branch (1085:25): [True: 0, False: 248k]
|
1086 | return NULL; |
1087 | } |
1088 | |
1089 | return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset, |
1090 | last_token->end_lineno, last_token->end_col_offset, |
1091 | p->arena); |
1092 | |
1093 | } |
1094 | |
1095 | |
1096 | /* Add a non-f-string (that is, a regular literal string). str is |
1097 | decref'd. */ |
1098 | int |
1099 | _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str) |
1100 | { |
1101 | FstringParser_check_invariants(state); |
1102 | |
1103 | assert(PyUnicode_CheckExact(str)); |
1104 | |
1105 | if (PyUnicode_GET_LENGTH(str) == 0) { Branch (1105:9): [True: 67.3k, False: 202k]
|
1106 | Py_DECREF(str); |
1107 | return 0; |
1108 | } |
1109 | |
1110 | if (!state->last_str) { Branch (1110:9): [True: 181k, False: 21.3k]
|
1111 | /* We didn't have a string before, so just remember this one. */ |
1112 | state->last_str = str; |
1113 | } else { |
1114 | /* Concatenate this with the previous string. */ |
1115 | PyUnicode_AppendAndDel(&state->last_str, str); |
1116 | if (!state->last_str) { Branch (1116:13): [True: 0, False: 21.3k]
|
1117 | return -1; |
1118 | } |
1119 | } |
1120 | FstringParser_check_invariants(state); |
1121 | return 0; |
1122 | } |
1123 | |
1124 | /* Parse an f-string. The f-string is in *str to end, with no |
1125 | 'f' or quotes. */ |
1126 | int |
1127 | _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str, |
1128 | const char *end, int raw, int recurse_lvl, |
1129 | Token *first_token, Token* t, Token *last_token) |
1130 | { |
1131 | FstringParser_check_invariants(state); |
1132 | state->fmode = 1; |
1133 | |
1134 | /* Parse the f-string. */ |
1135 | while (1) { Branch (1135:12): [Folded - Ignored]
|
1136 | PyObject *literal = NULL; |
1137 | PyObject *expr_text = NULL; |
1138 | expr_ty expression = NULL; |
1139 | |
1140 | /* If there's a zero length literal in front of the |
1141 | expression, literal will be NULL. If we're at the end of |
1142 | the f-string, expression will be NULL (unless result == 1, |
1143 | see below). */ |
1144 | int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl, |
1145 | &literal, &expr_text, |
1146 | &expression, first_token, t, last_token); |
1147 | if (result < 0) { Branch (1147:13): [True: 127, False: 77.6k]
|
1148 | return -1; |
1149 | } |
1150 | |
1151 | /* Add the literal, if any. */ |
1152 | if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 072.7k ) { Branch (1152:13): [True: 72.7k, False: 4.90k]
Branch (1152:24): [True: 0, False: 72.7k]
|
1153 | Py_XDECREF(expr_text); |
1154 | return -1; |
1155 | } |
1156 | /* Add the expr_text, if any. */ |
1157 | if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 085 ) { Branch (1157:13): [True: 85, False: 77.6k]
Branch (1157:26): [True: 0, False: 85]
|
1158 | return -1; |
1159 | } |
1160 | |
1161 | /* We've dealt with the literal and expr_text, their ownership has |
1162 | been transferred to the state object. Don't look at them again. */ |
1163 | |
1164 | /* See if we should just loop around to get the next literal |
1165 | and expression, while ignoring the expression this |
1166 | time. This is used for un-doubling braces, as an |
1167 | optimization. */ |
1168 | if (result == 1) { Branch (1168:13): [True: 155, False: 77.5k]
|
1169 | continue; |
1170 | } |
1171 | |
1172 | if (!expression) { Branch (1172:13): [True: 4.51k, False: 73.0k]
|
1173 | /* We're done with this f-string. */ |
1174 | break; |
1175 | } |
1176 | |
1177 | /* We know we have an expression. Convert any existing string |
1178 | to a Constant node. */ |
1179 | if (state->last_str) { Branch (1179:13): [True: 71.9k, False: 1.09k]
|
1180 | /* Convert the existing last_str literal to a Constant node. */ |
1181 | expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token); |
1182 | if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) { Branch (1182:17): [True: 0, False: 71.9k]
Branch (1182:30): [True: 0, False: 71.9k]
|
1183 | return -1; |
1184 | } |
1185 | } |
1186 | |
1187 | if (ExprList_Append(&state->expr_list, expression) < 0) { Branch (1187:13): [True: 0, False: 73.0k]
|
1188 | return -1; |
1189 | } |
1190 | } |
1191 | |
1192 | /* If recurse_lvl is zero, then we must be at the end of the |
1193 | string. Otherwise, we must be at a right brace. */ |
1194 | |
1195 | if (recurse_lvl == 0 && *str < end-14.30k ) { Branch (1195:9): [True: 4.30k, False: 207]
Branch (1195:29): [True: 0, False: 4.30k]
|
1196 | RAISE_SYNTAX_ERROR("f-string: unexpected end of string"); |
1197 | return -1; |
1198 | } |
1199 | if (recurse_lvl != 0 && **str != '}'207 ) { Branch (1199:9): [True: 207, False: 4.30k]
Branch (1199:29): [True: 2, False: 205]
|
1200 | RAISE_SYNTAX_ERROR("f-string: expecting '}'"); |
1201 | return -1; |
1202 | } |
1203 | |
1204 | FstringParser_check_invariants(state); |
1205 | return 0; |
1206 | } |
1207 | |
1208 | /* Convert the partial state reflected in last_str and expr_list to an |
1209 | expr_ty. The expr_ty can be a Constant, or a JoinedStr. */ |
1210 | expr_ty |
1211 | _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token, |
1212 | Token *last_token) |
1213 | { |
1214 | asdl_expr_seq *seq; |
1215 | |
1216 | FstringParser_check_invariants(state); |
1217 | |
1218 | /* If we're just a constant string with no expressions, return |
1219 | that. */ |
1220 | if (!state->fmode) { Branch (1220:9): [True: 174k, False: 3.01k]
|
1221 | assert(!state->expr_list.size); |
1222 | if (!state->last_str) { Branch (1222:13): [True: 67.3k, False: 107k]
|
1223 | /* Create a zero length string. */ |
1224 | state->last_str = PyUnicode_FromStringAndSize(NULL, 0); |
1225 | if (!state->last_str) { Branch (1225:17): [True: 0, False: 67.3k]
|
1226 | goto error; |
1227 | } |
1228 | } |
1229 | return make_str_node_and_del(p, &state->last_str, first_token, last_token); |
1230 | } |
1231 | |
1232 | /* Create a Constant node out of last_str, if needed. It will be the |
1233 | last node in our expression list. */ |
1234 | if (state->last_str) { Branch (1234:9): [True: 1.71k, False: 1.29k]
|
1235 | expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token); |
1236 | if (!str || ExprList_Append(&state->expr_list, str) < 0) { Branch (1236:13): [True: 0, False: 1.71k]
Branch (1236:21): [True: 0, False: 1.71k]
|
1237 | goto error; |
1238 | } |
1239 | } |
1240 | /* This has already been freed. */ |
1241 | assert(state->last_str == NULL); |
1242 | |
1243 | seq = ExprList_Finish(&state->expr_list, p->arena); |
1244 | if (!seq) { Branch (1244:9): [True: 0, False: 3.01k]
|
1245 | goto error; |
1246 | } |
1247 | |
1248 | return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset, |
1249 | last_token->end_lineno, last_token->end_col_offset, |
1250 | p->arena); |
1251 | |
1252 | error: |
1253 | _PyPegen_FstringParser_Dealloc(state); |
1254 | return NULL; |
1255 | } |
1256 | |
1257 | /* Given an f-string (with no 'f' or quotes) that's in *str and ends |
1258 | at end, parse it into an expr_ty. Return NULL on error. Adjust |
1259 | str to point past the parsed portion. */ |
1260 | static expr_ty |
1261 | fstring_parse(Parser *p, const char **str, const char *end, int raw, |
1262 | int recurse_lvl, Token *first_token, Token* t, Token *last_token) |
1263 | { |
1264 | FstringParser state; |
1265 | |
1266 | _PyPegen_FstringParser_Init(&state); |
1267 | if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl, Branch (1267:9): [True: 7, False: 205]
|
1268 | first_token, t, last_token) < 0) { |
1269 | _PyPegen_FstringParser_Dealloc(&state); |
1270 | return NULL; |
1271 | } |
1272 | |
1273 | return _PyPegen_FstringParser_Finish(p, &state, t, t); |
1274 | } |