/home/mdboom/Work/builds/cpython/Parser/pegen_errors.c
Line | Count | Source (jump to first uncovered line) |
1 | #include <Python.h> |
2 | #include <errcode.h> |
3 | |
4 | #include "tokenizer.h" |
5 | #include "pegen.h" |
6 | |
7 | // TOKENIZER ERRORS |
8 | |
9 | void |
10 | _PyPegen_raise_tokenizer_init_error(PyObject *filename) |
11 | { |
12 | if (!(PyErr_ExceptionMatches(PyExc_LookupError) Branch (12:11): [True: 5, False: 21]
|
13 | || PyErr_ExceptionMatches(PyExc_SyntaxError)21 Branch (13:14): [True: 20, False: 1]
|
14 | || PyErr_ExceptionMatches(PyExc_ValueError)1 Branch (14:14): [True: 1, False: 0]
|
15 | || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)0 )) { Branch (15:14): [True: 0, False: 0]
|
16 | return; |
17 | } |
18 | PyObject *errstr = NULL; |
19 | PyObject *tuple = NULL; |
20 | PyObject *type; |
21 | PyObject *value; |
22 | PyObject *tback; |
23 | PyErr_Fetch(&type, &value, &tback); |
24 | errstr = PyObject_Str(value); |
25 | if (!errstr) { Branch (25:9): [True: 0, False: 26]
|
26 | goto error; |
27 | } |
28 | |
29 | PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
30 | if (!tmp) { Branch (30:9): [True: 0, False: 26]
|
31 | goto error; |
32 | } |
33 | |
34 | tuple = PyTuple_Pack(2, errstr, tmp); |
35 | Py_DECREF(tmp); |
36 | if (!value) { Branch (36:9): [True: 0, False: 26]
|
37 | goto error; |
38 | } |
39 | PyErr_SetObject(PyExc_SyntaxError, tuple); |
40 | |
41 | error: |
42 | Py_XDECREF(type); |
43 | Py_XDECREF(value); |
44 | Py_XDECREF(tback); |
45 | Py_XDECREF(errstr); |
46 | Py_XDECREF(tuple); |
47 | } |
48 | |
49 | static inline void |
50 | raise_unclosed_parentheses_error(Parser *p) { |
51 | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
52 | int error_col = p->tok->parencolstack[p->tok->level-1]; |
53 | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
54 | error_lineno, error_col, error_lineno, -1, |
55 | "'%c' was never closed", |
56 | p->tok->parenstack[p->tok->level-1]); |
57 | } |
58 | |
59 | int |
60 | _Pypegen_tokenizer_error(Parser *p) |
61 | { |
62 | if (PyErr_Occurred()) { Branch (62:9): [True: 198, False: 178]
|
63 | return -1; |
64 | } |
65 | |
66 | const char *msg = NULL; |
67 | PyObject* errtype = PyExc_SyntaxError; |
68 | Py_ssize_t col_offset = -1; |
69 | switch (p->tok->done) { |
70 | case E_TOKEN: Branch (70:9): [True: 0, False: 178]
|
71 | msg = "invalid token"; |
72 | break; |
73 | case E_EOF: Branch (73:9): [True: 163, False: 15]
|
74 | if (p->tok->level) { Branch (74:17): [True: 155, False: 8]
|
75 | raise_unclosed_parentheses_error(p); |
76 | } else { |
77 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
78 | } |
79 | return -1; |
80 | case E_DEDENT: Branch (80:9): [True: 6, False: 172]
|
81 | RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
82 | return -1; |
83 | case E_INTR: Branch (83:9): [True: 0, False: 178]
|
84 | if (!PyErr_Occurred()) { Branch (84:17): [True: 0, False: 0]
|
85 | PyErr_SetNone(PyExc_KeyboardInterrupt); |
86 | } |
87 | return -1; |
88 | case E_NOMEM: Branch (88:9): [True: 0, False: 178]
|
89 | PyErr_NoMemory(); |
90 | return -1; |
91 | case E_TABSPACE: Branch (91:9): [True: 2, False: 176]
|
92 | errtype = PyExc_TabError; |
93 | msg = "inconsistent use of tabs and spaces in indentation"; |
94 | break; |
95 | case E_TOODEEP: Branch (95:9): [True: 0, False: 178]
|
96 | errtype = PyExc_IndentationError; |
97 | msg = "too many levels of indentation"; |
98 | break; |
99 | case E_LINECONT: { Branch (99:9): [True: 7, False: 171]
|
100 | col_offset = p->tok->cur - p->tok->buf - 1; |
101 | msg = "unexpected character after line continuation character"; |
102 | break; |
103 | } |
104 | default: Branch (104:9): [True: 0, False: 178]
|
105 | msg = "unknown parsing error"; |
106 | } |
107 | |
108 | RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
109 | col_offset >= 0 ? col_offset7 : 02 , Branch (109:32): [True: 7, False: 2]
|
110 | p->tok->lineno, -1, msg); |
111 | return -1; |
112 | } |
113 | |
114 | int |
115 | _Pypegen_raise_decode_error(Parser *p) |
116 | { |
117 | assert(PyErr_Occurred()); |
118 | const char *errtype = NULL; |
119 | if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { Branch (119:9): [True: 40, False: 265]
|
120 | errtype = "unicode error"; |
121 | } |
122 | else if (PyErr_ExceptionMatches(PyExc_ValueError)) { Branch (122:14): [True: 2, False: 263]
|
123 | errtype = "value error"; |
124 | } |
125 | if (errtype) { Branch (125:9): [True: 42, False: 263]
|
126 | PyObject *type; |
127 | PyObject *value; |
128 | PyObject *tback; |
129 | PyObject *errstr; |
130 | PyErr_Fetch(&type, &value, &tback); |
131 | errstr = PyObject_Str(value); |
132 | if (errstr) { Branch (132:13): [True: 42, False: 0]
|
133 | RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
134 | Py_DECREF(errstr); |
135 | } |
136 | else { |
137 | PyErr_Clear(); |
138 | RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
139 | } |
140 | Py_XDECREF(type); |
141 | Py_XDECREF(value); |
142 | Py_XDECREF(tback); |
143 | } |
144 | |
145 | return -1; |
146 | } |
147 | |
148 | static int |
149 | _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { |
150 | // Tokenize the whole input to see if there are any tokenization |
151 | // errors such as mistmatching parentheses. These will get priority |
152 | // over generic syntax errors only if the line number of the error is |
153 | // before the one that we had for the generic error. |
154 | |
155 | // We don't want to tokenize to the end for interactive input |
156 | if (p->tok->prompt != NULL) { Branch (156:9): [True: 0, False: 954]
|
157 | return 0; |
158 | } |
159 | |
160 | PyObject *type, *value, *traceback; |
161 | PyErr_Fetch(&type, &value, &traceback); |
162 | |
163 | Token *current_token = p->known_err_token != NULL ? p->known_err_token0 : p->tokens[p->fill - 1]; Branch (163:28): [True: 0, False: 954]
|
164 | Py_ssize_t current_err_line = current_token->lineno; |
165 | |
166 | int ret = 0; |
167 | |
168 | for (;;) { |
169 | const char *start; |
170 | const char *end; |
171 | switch (_PyTokenizer_Get(p->tok, &start, &end)) { |
172 | case ERRORTOKEN: Branch (172:13): [True: 31, False: 4.05k]
|
173 | if (p->tok->level != 0) { Branch (173:21): [True: 31, False: 0]
|
174 | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
175 | if (current_err_line > error_lineno) { Branch (175:25): [True: 3, False: 28]
|
176 | raise_unclosed_parentheses_error(p); |
177 | ret = -1; |
178 | goto exit; |
179 | } |
180 | } |
181 | break; |
182 | case ENDMARKER: Branch (182:13): [True: 923, False: 3.16k]
|
183 | break; |
184 | default: Branch (184:13): [True: 3.13k, False: 954]
|
185 | continue; |
186 | } |
187 | break; |
188 | } |
189 | |
190 | |
191 | exit: |
192 | if (PyErr_Occurred()) { Branch (192:9): [True: 3, False: 951]
|
193 | Py_XDECREF(value); |
194 | Py_XDECREF(type); |
195 | Py_XDECREF(traceback); |
196 | } else { |
197 | PyErr_Restore(type, value, traceback); |
198 | } |
199 | return ret; |
200 | } |
201 | |
202 | // PARSER ERRORS |
203 | |
204 | void * |
205 | _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) |
206 | { |
207 | if (p->fill == 0) { Branch (207:9): [True: 0, False: 634]
|
208 | va_list va; |
209 | va_start(va, errmsg); |
210 | _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); |
211 | va_end(va); |
212 | return NULL; |
213 | } |
214 | |
215 | Token *t = p->known_err_token != NULL ? p->known_err_token4 : p->tokens[p->fill - 1]630 ; Branch (215:16): [True: 4, False: 630]
|
216 | Py_ssize_t col_offset; |
217 | Py_ssize_t end_col_offset = -1; |
218 | if (t->col_offset == -1) { Branch (218:9): [True: 538, False: 96]
|
219 | if (p->tok->cur == p->tok->buf) { Branch (219:13): [True: 3, False: 535]
|
220 | col_offset = 0; |
221 | } else { |
222 | const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf0 ; Branch (222:33): [True: 535, False: 0]
|
223 | col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); |
224 | } |
225 | } else { |
226 | col_offset = t->col_offset + 1; |
227 | } |
228 | |
229 | if (t->end_col_offset != -1) { Branch (229:9): [True: 96, False: 538]
|
230 | end_col_offset = t->end_col_offset + 1; |
231 | } |
232 | |
233 | va_list va; |
234 | va_start(va, errmsg); |
235 | _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); |
236 | va_end(va); |
237 | |
238 | return NULL; |
239 | } |
240 | |
241 | static PyObject * |
242 | get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
243 | { |
244 | /* If the file descriptor is interactive, the source lines of the current |
245 | * (multi-line) statement are stored in p->tok->interactive_src_start. |
246 | * If not, we're parsing from a string, which means that the whole source |
247 | * is stored in p->tok->str. */ |
248 | assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin); |
249 | |
250 | char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start0 : p->tok->str; Branch (250:22): [True: 0, False: 25]
|
251 | if (cur_line == NULL) { Branch (251:9): [True: 0, False: 25]
|
252 | assert(p->tok->fp_interactive); |
253 | // We can reach this point if the tokenizer buffers for interactive source have not been |
254 | // initialized because we failed to decode the original source with the given locale. |
255 | return PyUnicode_FromStringAndSize("", 0); |
256 | } |
257 | |
258 | Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 11 : lineno24 ; Branch (258:34): [True: 1, False: 24]
|
259 | const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end0 : p->tok->inp; Branch (259:27): [True: 0, False: 25]
|
260 | |
261 | for (int i = 0; i < relative_lineno - 1; i++16 ) { Branch (261:21): [True: 16, False: 25]
|
262 | char *new_line = strchr(cur_line, '\n'); |
263 | // The assert is here for debug builds but the conditional that |
264 | // follows is there so in release builds we do not crash at the cost |
265 | // to report a potentially wrong line. |
266 | assert(new_line != NULL && new_line + 1 < buf_end); |
267 | if (new_line == NULL || new_line + 1 > buf_end) { Branch (267:13): [True: 0, False: 16]
Branch (267:33): [True: 0, False: 16]
|
268 | break; |
269 | } |
270 | cur_line = new_line + 1; |
271 | } |
272 | |
273 | char *next_newline; |
274 | if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line Branch (274:9): [True: 2, False: 23]
|
275 | next_newline = cur_line + strlen(cur_line); |
276 | } |
277 | return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
278 | } |
279 | |
280 | void * |
281 | _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, |
282 | Py_ssize_t lineno, Py_ssize_t col_offset, |
283 | Py_ssize_t end_lineno, Py_ssize_t end_col_offset, |
284 | const char *errmsg, va_list va) |
285 | { |
286 | PyObject *value = NULL; |
287 | PyObject *errstr = NULL; |
288 | PyObject *error_line = NULL; |
289 | PyObject *tmp = NULL; |
290 | p->error_indicator = 1; |
291 | |
292 | if (end_lineno == CURRENT_POS) { Branch (292:9): [True: 11, False: 1.64k]
|
293 | end_lineno = p->tok->lineno; |
294 | } |
295 | if (end_col_offset == CURRENT_POS) { Branch (295:9): [True: 11, False: 1.64k]
|
296 | end_col_offset = p->tok->cur - p->tok->line_start; |
297 | } |
298 | |
299 | if (p->start_rule == Py_fstring_input) { Branch (299:9): [True: 11, False: 1.64k]
|
300 | const char *fstring_msg = "f-string: "; |
301 | Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); |
302 | |
303 | char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character |
304 | if (!new_errmsg) { Branch (304:13): [True: 0, False: 11]
|
305 | return (void *) PyErr_NoMemory(); |
306 | } |
307 | |
308 | // Copy both strings into new buffer |
309 | memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); |
310 | memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); |
311 | new_errmsg[len] = 0; |
312 | errmsg = new_errmsg; |
313 | } |
314 | errstr = PyUnicode_FromFormatV(errmsg, va); |
315 | if (!errstr) { Branch (315:9): [True: 0, False: 1.65k]
|
316 | goto error; |
317 | } |
318 | |
319 | if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL0 ) { Branch (319:9): [True: 0, False: 1.65k]
Branch (319:35): [True: 0, False: 0]
|
320 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
321 | } |
322 | else if (p->start_rule == Py_file_input) { Branch (322:14): [True: 555, False: 1.10k]
|
323 | error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, |
324 | (int) lineno, p->tok->encoding); |
325 | } |
326 | |
327 | if (!error_line) { Branch (327:9): [True: 1.63k, False: 24]
|
328 | /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, |
329 | then we need to find the error line from some other source, because |
330 | p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly |
331 | failed or we're parsing from a string or the REPL. There's a third edge case where |
332 | we're actually parsing from a file, which has an E_EOF SyntaxError and in that case |
333 | `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which |
334 | does not physically exist */ |
335 | assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); |
336 | |
337 | if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf1.61k ) { Branch (337:13): [True: 1.61k, False: 23]
Branch (337:41): [True: 1.60k, False: 2]
|
338 | Py_ssize_t size = p->tok->inp - p->tok->buf; |
339 | error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); |
340 | } |
341 | else if (p->tok->fp == NULL || p->tok->fp == stdin0 ) { Branch (341:18): [True: 25, False: 0]
Branch (341:40): [True: 0, False: 0]
|
342 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
343 | } |
344 | else { |
345 | error_line = PyUnicode_FromStringAndSize("", 0); |
346 | } |
347 | if (!error_line) { Branch (347:13): [True: 0, False: 1.63k]
|
348 | goto error; |
349 | } |
350 | } |
351 | |
352 | if (p->start_rule == Py_fstring_input) { Branch (352:9): [True: 11, False: 1.64k]
|
353 | col_offset -= p->starting_col_offset; |
354 | end_col_offset -= p->starting_col_offset; |
355 | } |
356 | |
357 | Py_ssize_t col_number = col_offset; |
358 | Py_ssize_t end_col_number = end_col_offset; |
359 | |
360 | if (p->tok->encoding != NULL) { Branch (360:9): [True: 1.61k, False: 41]
|
361 | col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); |
362 | if (col_number < 0) { Branch (362:13): [True: 0, False: 1.61k]
|
363 | goto error; |
364 | } |
365 | if (end_col_number > 0) { Branch (365:13): [True: 922, False: 695]
|
366 | Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); |
367 | if (end_col_offset < 0) { Branch (367:17): [True: 0, False: 922]
|
368 | goto error; |
369 | } else { |
370 | end_col_number = end_col_offset; |
371 | } |
372 | } |
373 | } |
374 | tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); |
375 | if (!tmp) { Branch (375:9): [True: 0, False: 1.65k]
|
376 | goto error; |
377 | } |
378 | value = PyTuple_Pack(2, errstr, tmp); |
379 | Py_DECREF(tmp); |
380 | if (!value) { Branch (380:9): [True: 0, False: 1.65k]
|
381 | goto error; |
382 | } |
383 | PyErr_SetObject(errtype, value); |
384 | |
385 | Py_DECREF(errstr); |
386 | Py_DECREF(value); |
387 | if (p->start_rule == Py_fstring_input) { Branch (387:9): [True: 11, False: 1.64k]
|
388 | PyMem_Free((void *)errmsg); |
389 | } |
390 | return NULL; |
391 | |
392 | error: |
393 | Py_XDECREF(errstr); |
394 | Py_XDECREF(error_line); |
395 | if (p->start_rule == Py_fstring_input) { Branch (395:9): [True: 0, False: 0]
|
396 | PyMem_Free((void *)errmsg); |
397 | } |
398 | return NULL; |
399 | } |
400 | |
// Decide which syntax error to report after the parser has failed:
// an already-set exception, an initialization error, an unexpected EOF,
// a tokenizer indentation error, or a generic "invalid syntax".
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}