Line data Source code
1 : #include <stdbool.h>
2 :
3 : #include <Python.h>
4 :
5 : #include "tokenizer.h"
6 : #include "pegen.h"
7 : #include "string_parser.h"
8 :
9 : //// STRING HANDLING FUNCTIONS ////
10 :
11 : static int
12 732 : warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 : {
14 732 : unsigned char c = *first_invalid_escape;
15 732 : int octal = ('4' <= c && c <= '7');
16 732 : PyObject *msg =
17 : octal
18 516 : ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
19 : first_invalid_escape)
20 732 : : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
21 732 : if (msg == NULL) {
22 0 : return -1;
23 : }
24 732 : if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25 : t->lineno, NULL, NULL) < 0) {
26 5 : if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 : /* Replace the DeprecationWarning exception with a SyntaxError
28 : to get a more accurate error report */
29 5 : PyErr_Clear();
30 :
31 : /* This is needed, in order for the SyntaxError to point to the token t,
32 : since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 : error location, if p->known_err_token is not set. */
34 5 : p->known_err_token = t;
35 5 : if (octal) {
36 2 : RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
37 : first_invalid_escape);
38 : }
39 : else {
40 3 : RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
41 : }
42 : }
43 5 : Py_DECREF(msg);
44 5 : return -1;
45 : }
46 727 : Py_DECREF(msg);
47 727 : return 0;
48 : }
49 :
50 : static PyObject *
51 22168 : decode_utf8(const char **sPtr, const char *end)
52 : {
53 : const char *s;
54 : const char *t;
55 22168 : t = s = *sPtr;
56 152655 : while (s < end && (*s & 0x80)) {
57 130487 : s++;
58 : }
59 22168 : *sPtr = s;
60 22168 : return PyUnicode_DecodeUTF8(t, s - t, NULL);
61 : }
62 :
63 : static PyObject *
64 240000 : decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
65 : {
66 : PyObject *v;
67 : PyObject *u;
68 : char *buf;
69 : char *p;
70 : const char *end;
71 :
72 : /* check for integer overflow */
73 240000 : if (len > SIZE_MAX / 6) {
74 0 : return NULL;
75 : }
76 : /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
77 : "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
78 240000 : u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
79 240000 : if (u == NULL) {
80 0 : return NULL;
81 : }
82 240000 : p = buf = PyBytes_AsString(u);
83 240000 : if (p == NULL) {
84 0 : return NULL;
85 : }
86 240000 : end = s + len;
87 4389100 : while (s < end) {
88 4149100 : if (*s == '\\') {
89 227883 : *p++ = *s++;
90 227883 : if (s >= end || *s & 0x80) {
91 5 : strcpy(p, "u005c");
92 5 : p += 5;
93 5 : if (s >= end) {
94 1 : break;
95 : }
96 : }
97 : }
98 4149100 : if (*s & 0x80) {
99 : PyObject *w;
100 : int kind;
101 : const void *data;
102 : Py_ssize_t w_len;
103 : Py_ssize_t i;
104 22168 : w = decode_utf8(&s, end);
105 22168 : if (w == NULL) {
106 0 : Py_DECREF(u);
107 0 : return NULL;
108 : }
109 22168 : kind = PyUnicode_KIND(w);
110 22168 : data = PyUnicode_DATA(w);
111 22168 : w_len = PyUnicode_GET_LENGTH(w);
112 60555 : for (i = 0; i < w_len; i++) {
113 38387 : Py_UCS4 chr = PyUnicode_READ(kind, data, i);
114 38387 : sprintf(p, "\\U%08x", chr);
115 38387 : p += 10;
116 : }
117 : /* Should be impossible to overflow */
118 22168 : assert(p - buf <= PyBytes_GET_SIZE(u));
119 22168 : Py_DECREF(w);
120 : }
121 : else {
122 4126930 : *p++ = *s++;
123 : }
124 : }
125 240000 : len = p - buf;
126 240000 : s = buf;
127 :
128 : const char *first_invalid_escape;
129 240000 : v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
130 :
131 240000 : if (v != NULL && first_invalid_escape != NULL) {
132 365 : if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
133 : /* We have not decref u before because first_invalid_escape points
134 : inside u. */
135 3 : Py_XDECREF(u);
136 3 : Py_DECREF(v);
137 3 : return NULL;
138 : }
139 : }
140 239997 : Py_XDECREF(u);
141 239997 : return v;
142 : }
143 :
144 : static PyObject *
145 19186 : decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
146 : {
147 : const char *first_invalid_escape;
148 19186 : PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
149 19186 : if (result == NULL) {
150 2 : return NULL;
151 : }
152 :
153 19184 : if (first_invalid_escape != NULL) {
154 366 : if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
155 2 : Py_DECREF(result);
156 2 : return NULL;
157 : }
158 : }
159 19182 : return result;
160 : }
161 :
162 : /* s must include the bracketing quote characters, and r, b, u,
163 : &/or f prefixes (if any), and embedded escape sequences (if any).
164 : _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
165 : If the string is an f-string, set *fstr and *fstrlen to the unparsed
166 : string object. Return 0 if no errors occurred. */
167 : int
168 1691440 : _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
169 : const char **fstr, Py_ssize_t *fstrlen, Token *t)
170 : {
171 1691440 : const char *s = PyBytes_AsString(t->bytes);
172 1691440 : if (s == NULL) {
173 0 : return -1;
174 : }
175 :
176 : size_t len;
177 1691440 : int quote = Py_CHARMASK(*s);
178 1691440 : int fmode = 0;
179 1691440 : *bytesmode = 0;
180 1691440 : *rawmode = 0;
181 1691440 : *result = NULL;
182 1691440 : *fstr = NULL;
183 1691440 : if (Py_ISALPHA(quote)) {
184 182880 : while (!*bytesmode || !*rawmode) {
185 182111 : if (quote == 'b' || quote == 'B') {
186 52450 : quote =(unsigned char)*++s;
187 52450 : *bytesmode = 1;
188 : }
189 129661 : else if (quote == 'u' || quote == 'U') {
190 909 : quote = (unsigned char)*++s;
191 : }
192 128752 : else if (quote == 'r' || quote == 'R') {
193 16220 : quote = (unsigned char)*++s;
194 16220 : *rawmode = 1;
195 : }
196 112532 : else if (quote == 'f' || quote == 'F') {
197 22299 : quote = (unsigned char)*++s;
198 22299 : fmode = 1;
199 : }
200 : else {
201 : break;
202 : }
203 : }
204 : }
205 :
206 : /* fstrings are only allowed in Python 3.6 and greater */
207 1691440 : if (fmode && p->feature_version < 6) {
208 2 : p->error_indicator = 1;
209 2 : RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
210 2 : return -1;
211 : }
212 :
213 1691440 : if (fmode && *bytesmode) {
214 0 : PyErr_BadInternalCall();
215 0 : return -1;
216 : }
217 1691440 : if (quote != '\'' && quote != '\"') {
218 0 : PyErr_BadInternalCall();
219 0 : return -1;
220 : }
221 : /* Skip the leading quote char. */
222 1691440 : s++;
223 1691440 : len = strlen(s);
224 1691440 : if (len > INT_MAX) {
225 0 : PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
226 0 : return -1;
227 : }
228 1691440 : if (s[--len] != quote) {
229 : /* Last quote char must match the first. */
230 0 : PyErr_BadInternalCall();
231 0 : return -1;
232 : }
233 1691440 : if (len >= 4 && s[0] == quote && s[1] == quote) {
234 : /* A triple quoted string. We've already skipped one quote at
235 : the start and one at the end of the string. Now skip the
236 : two at the start. */
237 122154 : s += 2;
238 122154 : len -= 2;
239 : /* And check that the last two match. */
240 122154 : if (s[--len] != quote || s[--len] != quote) {
241 0 : PyErr_BadInternalCall();
242 0 : return -1;
243 : }
244 : }
245 :
246 1691440 : if (fmode) {
247 : /* Just return the bytes. The caller will parse the resulting
248 : string. */
249 22297 : *fstr = s;
250 22297 : *fstrlen = len;
251 22297 : return 0;
252 : }
253 :
254 : /* Not an f-string. */
255 : /* Avoid invoking escape decoding routines if possible. */
256 1669140 : *rawmode = *rawmode || strchr(s, '\\') == NULL;
257 1669140 : if (*bytesmode) {
258 : /* Disallow non-ASCII characters. */
259 : const char *ch;
260 739404 : for (ch = s; *ch; ch++) {
261 687092 : if (Py_CHARMASK(*ch) >= 0x80) {
262 138 : RAISE_SYNTAX_ERROR(
263 : "bytes can only contain ASCII "
264 : "literal characters");
265 138 : return -1;
266 : }
267 : }
268 52312 : if (*rawmode) {
269 33126 : *result = PyBytes_FromStringAndSize(s, len);
270 : }
271 : else {
272 19186 : *result = decode_bytes_with_escapes(p, s, len, t);
273 : }
274 : }
275 : else {
276 1616690 : if (*rawmode) {
277 1477780 : *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
278 : }
279 : else {
280 138910 : *result = decode_unicode_with_escapes(p, s, len, t);
281 : }
282 : }
283 1669000 : return *result == NULL ? -1 : 0;
284 : }
285 :
286 :
287 :
288 : // FSTRING STUFF
289 :
290 : /* Fix locations for the given node and its children.
291 :
292 : `parent` is the enclosing node.
293 : `expr_start` is the starting position of the expression (pointing to the open brace).
294 : `n` is the node which locations are going to be fixed relative to parent.
295 : `expr_str` is the child node's string representation, including braces.
296 : */
297 : static bool
298 96110 : fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
299 : {
300 96110 : *p_lines = 0;
301 96110 : *p_cols = 0;
302 96110 : assert(expr_start != NULL && *expr_start == '{');
303 96110 : if (parent && parent->bytes) {
304 96110 : const char *parent_str = PyBytes_AsString(parent->bytes);
305 96110 : if (!parent_str) {
306 0 : return false;
307 : }
308 : // The following is needed, in order to correctly shift the column
309 : // offset, in the case that (disregarding any whitespace) a newline
310 : // immediately follows the opening curly brace of the fstring expression.
311 96110 : bool newline_after_brace = 1;
312 96110 : const char *start = expr_start + 1;
313 96152 : while (start && *start != '}' && *start != '\n') {
314 96145 : if (*start != ' ' && *start != '\t' && *start != '\f') {
315 96103 : newline_after_brace = 0;
316 96103 : break;
317 : }
318 42 : start++;
319 : }
320 :
321 : // Account for the characters from the last newline character to our
322 : // left until the beginning of expr_start.
323 96110 : if (!newline_after_brace) {
324 96103 : start = expr_start;
325 36140700 : while (start > parent_str && *start != '\n') {
326 36044600 : start--;
327 : }
328 96103 : *p_cols += (int)(expr_start - start);
329 : }
330 : /* adjust the start based on the number of newlines encountered
331 : before the f-string expression */
332 36266400 : for (const char *p = parent_str; p < expr_start; p++) {
333 36170200 : if (*p == '\n') {
334 4706 : (*p_lines)++;
335 : }
336 : }
337 : }
338 96110 : return true;
339 : }
340 :
341 :
342 : /* Compile this expression in to an expr_ty. Add parens around the
343 : expression, in order to allow leading spaces in the expression. */
344 : static expr_ty
345 96139 : fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
346 : Token *t)
347 : {
348 96139 : expr_ty expr = NULL;
349 : char *str;
350 : Py_ssize_t len;
351 : const char *s;
352 96139 : expr_ty result = NULL;
353 :
354 96139 : assert(expr_end >= expr_start);
355 96139 : assert(*(expr_start-1) == '{');
356 96139 : assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
357 : *expr_end == '=');
358 :
359 : /* If the substring is all whitespace, it's an error. We need to catch this
360 : here, and not when we call PyParser_SimpleParseStringFlagsFilename,
361 : because turning the expression '' in to '()' would go from being invalid
362 : to valid. */
363 96261 : for (s = expr_start; s != expr_end; s++) {
364 96232 : char c = *s;
365 : /* The Python parser ignores only the following whitespace
366 : characters (\r already is converted to \n). */
367 96232 : if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
368 96110 : break;
369 : }
370 : }
371 :
372 96139 : if (s == expr_end) {
373 29 : if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
374 23 : RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
375 23 : return NULL;
376 : }
377 6 : RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
378 6 : return NULL;
379 : }
380 :
381 96110 : len = expr_end - expr_start;
382 : /* Allocate 3 extra bytes: open paren, close paren, null byte. */
383 96110 : str = PyMem_Calloc(len + 3, sizeof(char));
384 96110 : if (str == NULL) {
385 0 : PyErr_NoMemory();
386 0 : return NULL;
387 : }
388 :
389 : // The call to fstring_find_expr_location is responsible for finding the column offset
390 : // the generated AST nodes need to be shifted to the right, which is equal to the number
391 : // of the f-string characters before the expression starts.
392 96110 : memcpy(str+1, expr_start, len);
393 : int lines, cols;
394 96110 : if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
395 0 : PyMem_Free(str);
396 0 : return NULL;
397 : }
398 :
399 : // The parentheses are needed in order to allow for leading whitespace within
400 : // the f-string expression. This consequently gets parsed as a group (see the
401 : // group rule in python.gram).
402 96110 : str[0] = '(';
403 96110 : str[len+1] = ')';
404 :
405 96110 : struct tok_state* tok = _PyTokenizer_FromString(str, 1);
406 96110 : if (tok == NULL) {
407 0 : PyMem_Free(str);
408 0 : return NULL;
409 : }
410 96110 : Py_INCREF(p->tok->filename);
411 :
412 96110 : tok->filename = p->tok->filename;
413 96110 : tok->lineno = t->lineno + lines - 1;
414 :
415 96110 : Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
416 : NULL, p->arena);
417 :
418 96110 : p2->starting_lineno = t->lineno + lines;
419 96110 : p2->starting_col_offset = t->col_offset + cols;
420 :
421 96110 : expr = _PyPegen_run_parser(p2);
422 :
423 96110 : if (expr == NULL) {
424 16 : goto exit;
425 : }
426 96094 : result = expr;
427 :
428 96110 : exit:
429 96110 : PyMem_Free(str);
430 96110 : _PyPegen_Parser_Free(p2);
431 96110 : _PyTokenizer_Free(tok);
432 96110 : return result;
433 : }
434 :
435 : /* Return -1 on error.
436 :
437 : Return 0 if we reached the end of the literal.
438 :
439 : Return 1 if we haven't reached the end of the literal, but we want
440 : the caller to process the literal up to this point. Used for
441 : doubled braces.
442 : */
443 : static int
444 119840 : fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
445 : PyObject **literal, int recurse_lvl, Token *t)
446 : {
447 : /* Get any literal string. It ends when we hit an un-doubled left
448 : brace (which isn't part of a unicode name escape such as
449 : "\N{EULER CONSTANT}"), or the end of the string. */
450 :
451 119840 : const char *s = *str;
452 119840 : const char *literal_start = s;
453 119840 : int result = 0;
454 :
455 119840 : assert(*literal == NULL);
456 672380 : while (s < end) {
457 650201 : char ch = *s++;
458 650201 : if (!raw && ch == '\\' && s < end) {
459 3182 : ch = *s++;
460 3182 : if (ch == 'N') {
461 : /* We need to look at and skip matching braces for "\N{name}"
462 : sequences because otherwise we'll think the opening '{'
463 : starts an expression, which is not the case with "\N".
464 : Keep looking for either a matched '{' '}' pair, or the end
465 : of the string. */
466 :
467 27 : if (s < end && *s++ == '{') {
468 524 : while (s < end && *s++ != '}') {
469 : }
470 24 : continue;
471 : }
472 :
473 : /* This is an invalid "\N" sequence, since it's a "\N" not
474 : followed by a "{". Just keep parsing this literal. This
475 : error will be caught later by
476 : decode_unicode_with_escapes(). */
477 3 : continue;
478 : }
479 3155 : if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
480 0 : return -1;
481 : }
482 : }
483 650174 : if (ch == '{' || ch == '}') {
484 : /* Check for doubled braces, but only at the top level. If
485 : we checked at every level, then f'{0:{3}}' would fail
486 : with the two closing braces. */
487 97661 : if (recurse_lvl == 0) {
488 96553 : if (s < end && *s == ch) {
489 : /* We're going to tell the caller that the literal ends
490 : here, but that they should continue scanning. But also
491 : skip over the second brace when we resume scanning. */
492 518 : *str = s + 1;
493 518 : result = 1;
494 518 : goto done;
495 : }
496 :
497 : /* Where a single '{' is the start of a new expression, a
498 : single '}' is not allowed. */
499 96035 : if (ch == '}') {
500 8 : *str = s - 1;
501 8 : RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
502 8 : return -1;
503 : }
504 : }
505 : /* We're either at a '{', which means we're starting another
506 : expression; or a '}', which means we're at the end of this
507 : f-string (for a nested format_spec). */
508 97135 : s--;
509 97135 : break;
510 : }
511 : }
512 119314 : *str = s;
513 119314 : assert(s <= end);
514 119314 : assert(s == end || *s == '{' || *s == '}');
515 119314 : done:
516 119832 : if (literal_start != s) {
517 101446 : if (raw) {
518 356 : *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
519 : s - literal_start,
520 : NULL, NULL);
521 : }
522 : else {
523 101090 : *literal = decode_unicode_with_escapes(p, literal_start,
524 101090 : s - literal_start, t);
525 : }
526 101446 : if (!*literal) {
527 5 : return -1;
528 : }
529 : }
530 119827 : return result;
531 : }
532 :
/* Forward declaration because parsing is recursive: fstring_find_expr()
   below calls fstring_parse() to parse a format spec, passing
   recurse_lvl + 1. */
static expr_ty
fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
              Token *first_token, Token* t, Token *last_token);
537 :
538 : /* Parse the f-string at *str, ending at end. We know *str starts an
539 : expression (so it must be a '{'). Returns the FormattedValue node, which
540 : includes the expression, conversion character, format_spec expression, and
541 : optionally the text of the expression (if = is used).
542 :
543 : Note that I don't do a perfect job here: I don't make sure that a
544 : closing brace doesn't match an opening paren, for example. It
545 : doesn't need to error on all invalid expressions, just correctly
546 : find the end of all valid ones. Any errors inside the expression
547 : will be caught when we parse it later.
548 :
549 : *expression is set to the expression. For an '=' "debug" expression,
550 : *expr_text is set to the debug text (the original text of the expression,
551 : including the '=' and any whitespace around it, as a string object). If
552 : not a debug expression, *expr_text set to NULL. */
553 : static int
554 96174 : fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
555 : PyObject **expr_text, expr_ty *expression, Token *first_token,
556 : Token *t, Token *last_token)
557 : {
558 : /* Return -1 on error, else 0. */
559 :
560 : const char *expr_start;
561 : const char *expr_end;
562 : expr_ty simple_expression;
563 96174 : expr_ty format_spec = NULL; /* Optional format specifier. */
564 96174 : int conversion = -1; /* The conversion char. Use default if not
565 : specified, or !r if using = and no format
566 : spec. */
567 :
568 : /* 0 if we're not in a string, else the quote char we're trying to
569 : match (single or double quote). */
570 96174 : char quote_char = 0;
571 :
572 : /* If we're inside a string, 1=normal, 3=triple-quoted. */
573 96174 : int string_type = 0;
574 :
575 : /* Keep track of nesting level for braces/parens/brackets in
576 : expressions. */
577 96174 : Py_ssize_t nested_depth = 0;
578 : char parenstack[MAXLEVEL];
579 :
580 96174 : *expr_text = NULL;
581 :
582 : /* Can only nest one level deep. */
583 96174 : if (recurse_lvl >= 2) {
584 1 : RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
585 1 : goto error;
586 : }
587 :
588 : /* The first char must be a left brace, or we wouldn't have gotten
589 : here. Skip over it. */
590 96173 : assert(**str == '{');
591 96173 : *str += 1;
592 :
593 96173 : expr_start = *str;
594 429328 : for (; *str < end; (*str)++) {
595 : char ch;
596 :
597 : /* Loop invariants. */
598 429313 : assert(nested_depth >= 0);
599 429313 : assert(*str >= expr_start && *str < end);
600 429313 : if (quote_char) {
601 3288 : assert(string_type == 1 || string_type == 3);
602 : } else {
603 426025 : assert(string_type == 0);
604 : }
605 :
606 429313 : ch = **str;
607 : /* Nowhere inside an expression is a backslash allowed. */
608 429313 : if (ch == '\\') {
609 : /* Error: can't include a backslash character, inside
610 : parens or strings or not. */
611 8 : RAISE_SYNTAX_ERROR(
612 : "f-string expression part "
613 : "cannot include a backslash");
614 8 : goto error;
615 : }
616 429305 : if (quote_char) {
617 : /* We're inside a string. See if we're at the end. */
618 : /* This code needs to implement the same non-error logic
619 : as tok_get from tokenizer.c, at the letter_quote
620 : label. To actually share that code would be a
621 : nightmare. But, it's unlikely to change and is small,
622 : so duplicate it here. Note we don't need to catch all
623 : of the errors, since they'll be caught when parsing the
624 : expression. We just need to match the non-error
625 : cases. Thus we can ignore \n in single-quoted strings,
626 : for example. Or non-terminated strings. */
627 3287 : if (ch == quote_char) {
628 : /* Does this match the string_type (single or triple
629 : quoted)? */
630 921 : if (string_type == 3) {
631 62 : if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
632 : /* We're at the end of a triple quoted string. */
633 38 : *str += 2;
634 38 : string_type = 0;
635 38 : quote_char = 0;
636 38 : continue;
637 : }
638 : } else {
639 : /* We're at the end of a normal string. */
640 859 : quote_char = 0;
641 859 : string_type = 0;
642 859 : continue;
643 : }
644 : }
645 426018 : } else if (ch == '\'' || ch == '"') {
646 : /* Is this a triple quoted string? */
647 902 : if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
648 38 : string_type = 3;
649 38 : *str += 2;
650 : } else {
651 : /* Start of a normal string. */
652 864 : string_type = 1;
653 : }
654 : /* Start looking for the end of the string. */
655 902 : quote_char = ch;
656 425116 : } else if (ch == '[' || ch == '{' || ch == '(') {
657 3682 : if (nested_depth >= MAXLEVEL) {
658 1 : RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
659 1 : goto error;
660 : }
661 3681 : parenstack[nested_depth] = ch;
662 3681 : nested_depth++;
663 421434 : } else if (ch == '#') {
664 : /* Error: can't include a comment character, inside parens
665 : or not. */
666 3 : RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
667 3 : goto error;
668 421431 : } else if (nested_depth == 0 &&
669 391385 : (ch == '!' || ch == ':' || ch == '}' ||
670 302144 : ch == '=' || ch == '>' || ch == '<')) {
671 : /* See if there's a next character. */
672 96252 : if (*str+1 < end) {
673 89444 : char next = *(*str+1);
674 :
675 : /* For "!=". since '=' is not an allowed conversion character,
676 : nothing is lost in this test. */
677 89444 : if ((ch == '!' && next == '=') || /* != */
678 89426 : (ch == '=' && next == '=') || /* == */
679 89419 : (ch == '<' && next == '=') || /* <= */
680 83 : (ch == '>' && next == '=') /* >= */
681 : ) {
682 31 : *str += 1;
683 31 : continue;
684 : }
685 : }
686 : /* Don't get out of the loop for these, if they're single
687 : chars (not part of 2-char tokens). If by themselves, they
688 : don't end an expression (unlike say '!'). */
689 96221 : if (ch == '>' || ch == '<') {
690 82 : continue;
691 : }
692 :
693 : /* Normal way out of this loop. */
694 96139 : break;
695 325179 : } else if (ch == ']' || ch == '}' || ch == ')') {
696 3479 : if (!nested_depth) {
697 2 : RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
698 2 : goto error;
699 : }
700 3477 : nested_depth--;
701 3477 : int opening = (unsigned char)parenstack[nested_depth];
702 3479 : if (!((opening == '(' && ch == ')') ||
703 924 : (opening == '[' && ch == ']') ||
704 23 : (opening == '{' && ch == '}')))
705 : {
706 5 : RAISE_SYNTAX_ERROR(
707 : "f-string: closing parenthesis '%c' "
708 : "does not match opening parenthesis '%c'",
709 : ch, opening);
710 5 : goto error;
711 : }
712 : } else {
713 : /* Just consume this char and loop around. */
714 : }
715 : }
716 96154 : expr_end = *str;
717 : /* If we leave the above loop in a string or with mismatched parens, we
718 : don't really care. We'll get a syntax error when compiling the
719 : expression. But, we can produce a better error message, so let's just
720 : do that.*/
721 96154 : if (quote_char) {
722 4 : RAISE_SYNTAX_ERROR("f-string: unterminated string");
723 4 : goto error;
724 : }
725 96150 : if (nested_depth) {
726 0 : int opening = (unsigned char)parenstack[nested_depth - 1];
727 0 : RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
728 0 : goto error;
729 : }
730 :
731 96150 : if (*str >= end) {
732 11 : goto unexpected_end_of_string;
733 : }
734 :
735 : /* Compile the expression as soon as possible, so we show errors
736 : related to the expression before errors related to the
737 : conversion or format_spec. */
738 96139 : simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
739 96139 : if (!simple_expression) {
740 45 : goto error;
741 : }
742 :
743 : /* Check for =, which puts the text value of the expression in
744 : expr_text. */
745 96094 : if (**str == '=') {
746 169 : if (p->feature_version < 8) {
747 1 : RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
748 : "only supported in Python 3.8 and greater");
749 1 : goto error;
750 : }
751 168 : *str += 1;
752 :
753 : /* Skip over ASCII whitespace. No need to test for end of string
754 : here, since we know there's at least a trailing quote somewhere
755 : ahead. */
756 178 : while (Py_ISSPACE(**str)) {
757 10 : *str += 1;
758 : }
759 168 : if (*str >= end) {
760 1 : goto unexpected_end_of_string;
761 : }
762 : /* Set *expr_text to the text of the expression. */
763 167 : *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
764 167 : if (!*expr_text) {
765 0 : goto error;
766 : }
767 : }
768 :
769 : /* Check for a conversion char, if present. */
770 96092 : if (**str == '!') {
771 6945 : *str += 1;
772 6945 : const char *conv_start = *str;
773 : while (1) {
774 13900 : if (*str >= end) {
775 5 : goto unexpected_end_of_string;
776 : }
777 13895 : if (**str == '}' || **str == ':') {
778 : break;
779 : }
780 6955 : *str += 1;
781 : }
782 6940 : if (*str == conv_start) {
783 3 : RAISE_SYNTAX_ERROR(
784 : "f-string: missed conversion character");
785 3 : goto error;
786 : }
787 :
788 6937 : conversion = (unsigned char)*conv_start;
789 : /* Validate the conversion. */
790 6937 : if ((*str != conv_start + 1) ||
791 6840 : !(conversion == 's' || conversion == 'r' || conversion == 'a'))
792 : {
793 16 : PyObject *conv_obj = PyUnicode_FromStringAndSize(conv_start,
794 16 : *str-conv_start);
795 16 : if (conv_obj) {
796 16 : RAISE_SYNTAX_ERROR(
797 : "f-string: invalid conversion character %R: "
798 : "expected 's', 'r', or 'a'",
799 : conv_obj);
800 16 : Py_DECREF(conv_obj);
801 : }
802 16 : goto error;
803 : }
804 :
805 : }
806 :
807 : /* Check for the format spec, if present. */
808 96068 : assert(*str < end);
809 96068 : if (**str == ':') {
810 970 : *str += 1;
811 970 : if (*str >= end) {
812 2 : goto unexpected_end_of_string;
813 : }
814 :
815 : /* Parse the format spec. */
816 968 : format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
817 : first_token, t, last_token);
818 968 : if (!format_spec) {
819 7 : goto error;
820 : }
821 : }
822 :
823 96059 : if (*str >= end || **str != '}') {
824 1 : goto unexpected_end_of_string;
825 : }
826 :
827 : /* We're at a right brace. Consume it. */
828 96058 : assert(*str < end);
829 96058 : assert(**str == '}');
830 96058 : *str += 1;
831 :
832 : /* If we're in = mode (detected by non-NULL expr_text), and have no format
833 : spec and no explicit conversion, set the conversion to 'r'. */
834 96058 : if (*expr_text && format_spec == NULL && conversion == -1) {
835 62 : conversion = 'r';
836 : }
837 :
838 : /* And now create the FormattedValue node that represents this
839 : entire expression with the conversion and format spec. */
840 : //TODO: Fix this
841 96058 : *expression = _PyAST_FormattedValue(simple_expression, conversion,
842 : format_spec, first_token->lineno,
843 : first_token->col_offset,
844 : last_token->end_lineno,
845 : last_token->end_col_offset, p->arena);
846 96058 : if (!*expression) {
847 0 : goto error;
848 : }
849 :
850 96058 : return 0;
851 :
852 20 : unexpected_end_of_string:
853 20 : RAISE_SYNTAX_ERROR("f-string: expecting '}'");
854 : /* Falls through to error. */
855 :
856 116 : error:
857 116 : Py_XDECREF(*expr_text);
858 116 : return -1;
859 :
860 : }
861 :
862 : /* Return -1 on error.
863 :
864 : Return 0 if we have a literal (possible zero length) and an
865 : expression (zero length if at the end of the string.
866 :
867 : Return 1 if we have a literal, but no expression, and we want the
868 : caller to call us again. This is used to deal with doubled
869 : braces.
870 :
871 : When called multiple times on the string 'a{{b{0}c', this function
872 : will return:
873 :
874 : 1. the literal 'a{' with no expression, and a return value
875 : of 1. Despite the fact that there's no expression, the return
876 : value of 1 means we're not finished yet.
877 :
878 : 2. the literal 'b' and the expression '0', with a return value of
879 : 0. The fact that there's an expression means we're not finished.
880 :
881 : 3. literal 'c' with no expression and a return value of 0. The
882 : combination of the return value of 0 with no expression means
883 : we're finished.
884 : */
885 : static int
886 119840 : fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
887 : int recurse_lvl, PyObject **literal,
888 : PyObject **expr_text, expr_ty *expression,
889 : Token *first_token, Token *t, Token *last_token)
890 : {
891 : int result;
892 :
893 119840 : assert(*literal == NULL && *expression == NULL);
894 :
895 : /* Get any literal string. */
896 119840 : result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
897 119840 : if (result < 0) {
898 13 : goto error;
899 : }
900 :
901 119827 : assert(result == 0 || result == 1);
902 :
903 119827 : if (result == 1) {
904 : /* We have a literal, but don't look at the expression. */
905 518 : return 1;
906 : }
907 :
908 119309 : if (*str >= end || **str == '}') {
909 : /* We're at the end of the string or the end of a nested
910 : f-string: no expression. The top-level error case where we
911 : expect to be at the end of the string but we're at a '}' is
912 : handled later. */
913 23135 : return 0;
914 : }
915 :
916 : /* We must now be the start of an expression, on a '{'. */
917 96174 : assert(**str == '{');
918 :
919 96174 : if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
920 : expression, first_token, t, last_token) < 0) {
921 116 : goto error;
922 : }
923 :
924 96058 : return 0;
925 :
926 129 : error:
927 129 : Py_CLEAR(*literal);
928 129 : return -1;
929 : }
930 :
931 : #ifdef NDEBUG
932 : #define ExprList_check_invariants(l)
933 : #else
934 : static void
935 8590440 : ExprList_check_invariants(ExprList *l)
936 : {
937 : /* Check our invariants. Make sure this object is "live", and
938 : hasn't been deallocated. */
939 8590440 : assert(l->size >= 0);
940 8590440 : assert(l->p != NULL);
941 8590440 : if (l->size <= EXPRLIST_N_CACHED) {
942 8172420 : assert(l->data == l->p);
943 : }
944 8590440 : }
945 : #endif
946 :
947 : static void
948 1608640 : ExprList_Init(ExprList *l)
949 : {
950 1608640 : l->allocated = EXPRLIST_N_CACHED;
951 1608640 : l->size = 0;
952 :
953 : /* Until we start allocating dynamically, p points to data. */
954 1608640 : l->p = l->data;
955 :
956 1608640 : ExprList_check_invariants(l);
957 1608640 : }
958 :
959 : static int
960 196241 : ExprList_Append(ExprList *l, expr_ty exp)
961 : {
962 196241 : ExprList_check_invariants(l);
963 196241 : if (l->size >= l->allocated) {
964 : /* We need to alloc (or realloc) the memory. */
965 58 : Py_ssize_t new_size = l->allocated * 2;
966 :
967 : /* See if we've ever allocated anything dynamically. */
968 58 : if (l->p == l->data) {
969 : Py_ssize_t i;
970 : /* We're still using the cached data. Switch to
971 : alloc-ing. */
972 15 : l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
973 15 : if (!l->p) {
974 0 : return -1;
975 : }
976 : /* Copy the cached data into the new buffer. */
977 975 : for (i = 0; i < l->size; i++) {
978 960 : l->p[i] = l->data[i];
979 : }
980 : } else {
981 : /* Just realloc. */
982 43 : expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
983 43 : if (!tmp) {
984 0 : PyMem_Free(l->p);
985 0 : l->p = NULL;
986 0 : return -1;
987 : }
988 43 : l->p = tmp;
989 : }
990 :
991 58 : l->allocated = new_size;
992 58 : assert(l->allocated == 2 * l->size);
993 : }
994 :
995 196241 : l->p[l->size++] = exp;
996 :
997 196241 : ExprList_check_invariants(l);
998 196241 : return 0;
999 : }
1000 :
1001 : static void
1002 20124 : ExprList_Dealloc(ExprList *l)
1003 : {
1004 20124 : ExprList_check_invariants(l);
1005 :
1006 : /* If there's been an error, or we've never dynamically allocated,
1007 : do nothing. */
1008 20124 : if (!l->p || l->p == l->data) {
1009 : /* Do nothing. */
1010 : } else {
1011 : /* We have dynamically allocated. Free the memory. */
1012 15 : PyMem_Free(l->p);
1013 : }
1014 20124 : l->p = NULL;
1015 20124 : l->size = -1;
1016 20124 : }
1017 :
1018 : static asdl_expr_seq *
1019 19350 : ExprList_Finish(ExprList *l, PyArena *arena)
1020 : {
1021 : asdl_expr_seq *seq;
1022 :
1023 19350 : ExprList_check_invariants(l);
1024 :
1025 : /* Allocate the asdl_seq and copy the expressions in to it. */
1026 19350 : seq = _Py_asdl_expr_seq_new(l->size, arena);
1027 19350 : if (seq) {
1028 : Py_ssize_t i;
1029 215589 : for (i = 0; i < l->size; i++) {
1030 196239 : asdl_seq_SET(seq, i, l->p[i]);
1031 : }
1032 : }
1033 19350 : ExprList_Dealloc(l);
1034 19350 : return seq;
1035 : }
1036 :
1037 : #ifdef NDEBUG
1038 : #define FstringParser_check_invariants(state)
1039 : #else
1040 : static void
1041 6549840 : FstringParser_check_invariants(FstringParser *state)
1042 : {
1043 6549840 : if (state->last_str) {
1044 3161850 : assert(PyUnicode_CheckExact(state->last_str));
1045 : }
1046 6549840 : ExprList_check_invariants(&state->expr_list);
1047 6549840 : }
1048 : #endif
1049 :
1050 : void
1051 1608640 : _PyPegen_FstringParser_Init(FstringParser *state)
1052 : {
1053 1608640 : state->last_str = NULL;
1054 1608640 : state->fmode = 0;
1055 1608640 : ExprList_Init(&state->expr_list);
1056 1608640 : FstringParser_check_invariants(state);
1057 1608640 : }
1058 :
1059 : void
1060 774 : _PyPegen_FstringParser_Dealloc(FstringParser *state)
1061 : {
1062 774 : FstringParser_check_invariants(state);
1063 :
1064 774 : Py_XDECREF(state->last_str);
1065 774 : ExprList_Dealloc(&state->expr_list);
1066 774 : }
1067 :
1068 : /* Make a Constant node, but decref the PyUnicode object being added. */
1069 : static expr_ty
1070 1639960 : make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1071 : {
1072 1639960 : PyObject *s = *str;
1073 1639960 : PyObject *kind = NULL;
1074 1639960 : *str = NULL;
1075 1639960 : assert(PyUnicode_CheckExact(s));
1076 1639960 : if (_PyArena_AddPyObject(p->arena, s) < 0) {
1077 0 : Py_DECREF(s);
1078 0 : return NULL;
1079 : }
1080 1639960 : const char* the_str = PyBytes_AsString(first_token->bytes);
1081 1639960 : if (the_str && the_str[0] == 'u') {
1082 844 : kind = _PyPegen_new_identifier(p, "u");
1083 : }
1084 :
1085 1639960 : if (kind == NULL && PyErr_Occurred()) {
1086 0 : return NULL;
1087 : }
1088 :
1089 1639960 : return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1090 : last_token->end_lineno, last_token->end_col_offset,
1091 : p->arena);
1092 :
1093 : }
1094 :
1095 :
1096 : /* Add a non-f-string (that is, a regular literal string). str is
1097 : decref'd. */
1098 : int
1099 1717790 : _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1100 : {
1101 1717790 : FstringParser_check_invariants(state);
1102 :
1103 1717790 : assert(PyUnicode_CheckExact(str));
1104 :
1105 1717790 : if (PyUnicode_GET_LENGTH(str) == 0) {
1106 100678 : Py_DECREF(str);
1107 100678 : return 0;
1108 : }
1109 :
1110 1617110 : if (!state->last_str) {
1111 : /* We didn't have a string before, so just remember this one. */
1112 1539340 : state->last_str = str;
1113 : } else {
1114 : /* Concatenate this with the previous string. */
1115 77766 : PyUnicode_AppendAndDel(&state->last_str, str);
1116 77766 : if (!state->last_str) {
1117 0 : return -1;
1118 : }
1119 : }
1120 1617110 : FstringParser_check_invariants(state);
1121 1617110 : return 0;
1122 : }
1123 :
1124 : /* Parse an f-string. The f-string is in *str to end, with no
1125 : 'f' or quotes. */
1126 : int
1127 23264 : _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1128 : const char *end, int raw, int recurse_lvl,
1129 : Token *first_token, Token* t, Token *last_token)
1130 : {
1131 23264 : FstringParser_check_invariants(state);
1132 23264 : state->fmode = 1;
1133 :
1134 : /* Parse the f-string. */
1135 96576 : while (1) {
1136 119840 : PyObject *literal = NULL;
1137 119840 : PyObject *expr_text = NULL;
1138 119840 : expr_ty expression = NULL;
1139 :
1140 : /* If there's a zero length literal in front of the
1141 : expression, literal will be NULL. If we're at the end of
1142 : the f-string, expression will be NULL (unless result == 1,
1143 : see below). */
1144 119840 : int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1145 : &literal, &expr_text,
1146 : &expression, first_token, t, last_token);
1147 119840 : if (result < 0) {
1148 129 : return -1;
1149 : }
1150 :
1151 : /* Add the literal, if any. */
1152 119711 : if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1153 0 : Py_XDECREF(expr_text);
1154 0 : return -1;
1155 : }
1156 : /* Add the expr_text, if any. */
1157 119711 : if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1158 0 : return -1;
1159 : }
1160 :
1161 : /* We've dealt with the literal and expr_text, their ownership has
1162 : been transferred to the state object. Don't look at them again. */
1163 :
1164 : /* See if we should just loop around to get the next literal
1165 : and expression, while ignoring the expression this
1166 : time. This is used for un-doubling braces, as an
1167 : optimization. */
1168 119711 : if (result == 1) {
1169 518 : continue;
1170 : }
1171 :
1172 119193 : if (!expression) {
1173 : /* We're done with this f-string. */
1174 23135 : break;
1175 : }
1176 :
1177 : /* We know we have an expression. Convert any existing string
1178 : to a Constant node. */
1179 96058 : if (state->last_str) {
1180 : /* Convert the existing last_str literal to a Constant node. */
1181 89599 : expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1182 89599 : if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1183 0 : return -1;
1184 : }
1185 : }
1186 :
1187 96058 : if (ExprList_Append(&state->expr_list, expression) < 0) {
1188 0 : return -1;
1189 : }
1190 : }
1191 :
1192 : /* If recurse_lvl is zero, then we must be at the end of the
1193 : string. Otherwise, we must be at a right brace. */
1194 :
1195 23135 : if (recurse_lvl == 0 && *str < end-1) {
1196 0 : RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1197 0 : return -1;
1198 : }
1199 23135 : if (recurse_lvl != 0 && **str != '}') {
1200 2 : RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1201 2 : return -1;
1202 : }
1203 :
1204 23133 : FstringParser_check_invariants(state);
1205 23133 : return 0;
1206 : }
1207 :
1208 : /* Convert the partial state reflected in last_str and expr_list to an
1209 : expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1210 : expr_ty
1211 1559130 : _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1212 : Token *last_token)
1213 : {
1214 : asdl_expr_seq *seq;
1215 :
1216 1559130 : FstringParser_check_invariants(state);
1217 :
1218 : /* If we're just a constant string with no expressions, return
1219 : that. */
1220 1559130 : if (!state->fmode) {
1221 1539780 : assert(!state->expr_list.size);
1222 1539780 : if (!state->last_str) {
1223 : /* Create a zero length string. */
1224 100622 : state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1225 100622 : if (!state->last_str) {
1226 0 : goto error;
1227 : }
1228 : }
1229 1539780 : return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1230 : }
1231 :
1232 : /* Create a Constant node out of last_str, if needed. It will be the
1233 : last node in our expression list. */
1234 19350 : if (state->last_str) {
1235 10584 : expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1236 10584 : if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1237 0 : goto error;
1238 : }
1239 : }
1240 : /* This has already been freed. */
1241 19350 : assert(state->last_str == NULL);
1242 :
1243 19350 : seq = ExprList_Finish(&state->expr_list, p->arena);
1244 19350 : if (!seq) {
1245 0 : goto error;
1246 : }
1247 :
1248 19350 : return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1249 : last_token->end_lineno, last_token->end_col_offset,
1250 : p->arena);
1251 :
1252 0 : error:
1253 0 : _PyPegen_FstringParser_Dealloc(state);
1254 0 : return NULL;
1255 : }
1256 :
1257 : /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1258 : at end, parse it into an expr_ty. Return NULL on error. Adjust
1259 : str to point past the parsed portion. */
1260 : static expr_ty
1261 968 : fstring_parse(Parser *p, const char **str, const char *end, int raw,
1262 : int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1263 : {
1264 : FstringParser state;
1265 :
1266 968 : _PyPegen_FstringParser_Init(&state);
1267 968 : if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1268 : first_token, t, last_token) < 0) {
1269 7 : _PyPegen_FstringParser_Dealloc(&state);
1270 7 : return NULL;
1271 : }
1272 :
1273 961 : return _PyPegen_FstringParser_Finish(p, &state, t, t);
1274 : }
|