Line data Source code
1 : #include <Python.h>
2 : #include <errcode.h>
3 :
4 : #include "tokenizer.h"
5 : #include "pegen.h"
6 :
7 : // TOKENIZER ERRORS
8 :
9 : void
10 26 : _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 : {
12 26 : if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13 21 : || PyErr_ExceptionMatches(PyExc_SyntaxError)
14 1 : || PyErr_ExceptionMatches(PyExc_ValueError)
15 0 : || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16 0 : return;
17 : }
18 26 : PyObject *errstr = NULL;
19 26 : PyObject *tuple = NULL;
20 : PyObject *type;
21 : PyObject *value;
22 : PyObject *tback;
23 26 : PyErr_Fetch(&type, &value, &tback);
24 26 : errstr = PyObject_Str(value);
25 26 : if (!errstr) {
26 0 : goto error;
27 : }
28 :
29 26 : PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30 26 : if (!tmp) {
31 0 : goto error;
32 : }
33 :
34 26 : tuple = PyTuple_Pack(2, errstr, tmp);
35 26 : Py_DECREF(tmp);
36 26 :     if (!tuple) {
37 0 : goto error;
38 : }
39 26 : PyErr_SetObject(PyExc_SyntaxError, tuple);
40 :
41 26 : error:
42 26 : Py_XDECREF(type);
43 26 : Py_XDECREF(value);
44 26 : Py_XDECREF(tback);
45 26 : Py_XDECREF(errstr);
46 26 : Py_XDECREF(tuple);
47 : }
48 :
49 : static inline void
50 170 : raise_unclosed_parentheses_error(Parser *p) {
51 170 : int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52 170 : int error_col = p->tok->parencolstack[p->tok->level-1];
53 170 : RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54 : error_lineno, error_col, error_lineno, -1,
55 : "'%c' was never closed",
56 170 : p->tok->parenstack[p->tok->level-1]);
57 170 : }
58 :
59 : int
60 394 : _Pypegen_tokenizer_error(Parser *p)
61 : {
62 394 : if (PyErr_Occurred()) {
63 202 : return -1;
64 : }
65 :
66 192 : const char *msg = NULL;
67 192 : PyObject* errtype = PyExc_SyntaxError;
68 192 : Py_ssize_t col_offset = -1;
69 192 : switch (p->tok->done) {
70 0 : case E_TOKEN:
71 0 : msg = "invalid token";
72 0 : break;
73 177 : case E_EOF:
74 177 : if (p->tok->level) {
75 167 : raise_unclosed_parentheses_error(p);
76 : } else {
77 10 : RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78 : }
79 177 : return -1;
80 6 : case E_DEDENT:
81 6 : RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82 6 : return -1;
83 0 : case E_INTR:
84 0 : if (!PyErr_Occurred()) {
85 0 : PyErr_SetNone(PyExc_KeyboardInterrupt);
86 : }
87 0 : return -1;
88 0 : case E_NOMEM:
89 0 : PyErr_NoMemory();
90 0 : return -1;
91 2 : case E_TABSPACE:
92 2 : errtype = PyExc_TabError;
93 2 : msg = "inconsistent use of tabs and spaces in indentation";
94 2 : break;
95 0 : case E_TOODEEP:
96 0 : errtype = PyExc_IndentationError;
97 0 : msg = "too many levels of indentation";
98 0 : break;
99 7 : case E_LINECONT: {
100 7 : col_offset = p->tok->cur - p->tok->buf - 1;
101 7 : msg = "unexpected character after line continuation character";
102 7 : break;
103 : }
104 0 : default:
105 0 : msg = "unknown parsing error";
106 : }
107 :
108 9 : RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109 : col_offset >= 0 ? col_offset : 0,
110 9 : p->tok->lineno, -1, msg);
111 9 : return -1;
112 : }
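/*
 * Illustrative sketch, not part of pegen_errors.c: how the E_TABSPACE branch
 * above surfaces when embedding CPython. The source text and filename are
 * made up; on recent CPython versions compiling it is expected to fail with
 * "TabError: inconsistent use of tabs and spaces in indentation".
 * Assumes <Python.h> is included and the interpreter is initialized.
 */
static void
demo_tab_error(void)
{
    /* Line 2 indents with a tab, line 3 with eight spaces, so the tokenizer
     * cannot resolve the indentation unambiguously and sets done = E_TABSPACE. */
    const char *src = "if x:\n\ty = 1\n        z = 2\n";
    if (Py_CompileString(src, "<demo>", Py_file_input) == NULL) {
        PyErr_Print();
    }
}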
113 :
114 : int
115 774 : _Pypegen_raise_decode_error(Parser *p)
116 : {
117 774 : assert(PyErr_Occurred());
118 774 : const char *errtype = NULL;
119 774 : if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120 503 : errtype = "unicode error";
121 : }
122 271 : else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123 2 : errtype = "value error";
124 : }
125 774 : if (errtype) {
126 : PyObject *type;
127 : PyObject *value;
128 : PyObject *tback;
129 : PyObject *errstr;
130 505 : PyErr_Fetch(&type, &value, &tback);
131 505 : errstr = PyObject_Str(value);
132 505 : if (errstr) {
133 505 : RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134 505 : Py_DECREF(errstr);
135 : }
136 : else {
137 0 : PyErr_Clear();
138 0 : RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139 : }
140 505 : Py_XDECREF(type);
141 505 : Py_XDECREF(value);
142 505 : Py_XDECREF(tback);
143 : }
144 :
145 774 : return -1;
146 : }
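/*
 * Illustrative sketch, not part of pegen_errors.c: a string literal that fails
 * to decode is reported through _Pypegen_raise_decode_error() with a
 * "(unicode error)" prefix. The source text and filename are made up, and the
 * exact codec message depends on the CPython version.
 * Assumes <Python.h> is included and the interpreter is initialized.
 */
static void
demo_decode_error(void)
{
    /* \N{...} with an unknown character name cannot be decoded. */
    const char *src = "s = '\\N{NO SUCH CHARACTER NAME}'\n";
    if (Py_CompileString(src, "<demo>", Py_file_input) == NULL) {
        PyErr_Print();  /* e.g. SyntaxError: (unicode error) 'unicodeescape' codec ... */
    }
}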
147 :
148 : static int
149 972 : _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150 : // Tokenize the whole input to see if there are any tokenization
151 :     // errors such as mismatching parentheses. These will get priority
152 : // over generic syntax errors only if the line number of the error is
153 : // before the one that we had for the generic error.
154 :
155 : // We don't want to tokenize to the end for interactive input
156 972 : if (p->tok->prompt != NULL) {
157 8 : return 0;
158 : }
159 :
160 : PyObject *type, *value, *traceback;
161 964 : PyErr_Fetch(&type, &value, &traceback);
162 :
163 964 : Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
164 964 : Py_ssize_t current_err_line = current_token->lineno;
165 :
166 964 : int ret = 0;
167 :
168 3142 : for (;;) {
169 : const char *start;
170 : const char *end;
171 4106 : switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172 31 : case ERRORTOKEN:
173 31 : if (p->tok->level != 0) {
174 31 : int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
175 31 : if (current_err_line > error_lineno) {
176 3 : raise_unclosed_parentheses_error(p);
177 3 : ret = -1;
178 3 : goto exit;
179 : }
180 : }
181 28 : break;
182 933 : case ENDMARKER:
183 933 : break;
184 3142 : default:
185 3142 : continue;
186 : }
187 961 : break;
188 : }
189 :
190 :
191 964 : exit:
192 964 : if (PyErr_Occurred()) {
193 3 : Py_XDECREF(value);
194 3 : Py_XDECREF(type);
195 3 : Py_XDECREF(traceback);
196 : } else {
197 961 : PyErr_Restore(type, value, traceback);
198 : }
199 964 : return ret;
200 : }
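/*
 * Illustrative sketch, not part of pegen_errors.c: the second tokenization
 * pass above lets an unclosed bracket on an earlier line take priority over
 * the generic syntax error the parser hits later. The source text and
 * filename are made up; recent CPython versions are expected to report
 * "SyntaxError: '(' was never closed" for this input rather than
 * "invalid syntax" on line 2.
 * Assumes <Python.h> is included and the interpreter is initialized.
 */
static void
demo_unclosed_paren_priority(void)
{
    const char *src = "x = (1 + 2\n"
                      "y y\n";
    if (Py_CompileString(src, "<demo>", Py_file_input) == NULL) {
        PyErr_Print();
    }
}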
201 :
202 : // PARSER ERRORS
203 :
204 : void *
205 1101 : _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
206 : {
207 1101 : if (p->fill == 0) {
208 : va_list va;
209 0 : va_start(va, errmsg);
210 0 : _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
211 0 : va_end(va);
212 0 : return NULL;
213 : }
214 :
215 1101 : Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
216 : Py_ssize_t col_offset;
217 1101 : Py_ssize_t end_col_offset = -1;
218 1101 : if (t->col_offset == -1) {
219 1002 : if (p->tok->cur == p->tok->buf) {
220 4 : col_offset = 0;
221 : } else {
222 998 : const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
223 998 : col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
224 : }
225 : } else {
226 99 : col_offset = t->col_offset + 1;
227 : }
228 :
229 1101 : if (t->end_col_offset != -1) {
230 99 : end_col_offset = t->end_col_offset + 1;
231 : }
232 :
233 : va_list va;
234 1101 : va_start(va, errmsg);
235 1101 : _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
236 1101 : va_end(va);
237 :
238 1101 : return NULL;
239 : }
240 :
241 : static PyObject *
242 35 : get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
243 : {
244 : /* If the file descriptor is interactive, the source lines of the current
245 : * (multi-line) statement are stored in p->tok->interactive_src_start.
246 : * If not, we're parsing from a string, which means that the whole source
247 : * is stored in p->tok->str. */
248 35 : assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
249 :
250 35 : char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
251 35 : if (cur_line == NULL) {
252 0 : assert(p->tok->fp_interactive);
253 : // We can reach this point if the tokenizer buffers for interactive source have not been
254 : // initialized because we failed to decode the original source with the given locale.
255 0 : return PyUnicode_FromStringAndSize("", 0);
256 : }
257 :
258 35 : Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
259 35 : const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
260 :
261 53 : for (int i = 0; i < relative_lineno - 1; i++) {
262 18 : char *new_line = strchr(cur_line, '\n');
263 : // The assert is here for debug builds but the conditional that
264 :         // follows is there so in release builds we do not crash, at the cost
265 :         // of reporting a potentially wrong line.
266 18 : assert(new_line != NULL && new_line + 1 < buf_end);
267 18 : if (new_line == NULL || new_line + 1 > buf_end) {
268 : break;
269 : }
270 18 : cur_line = new_line + 1;
271 : }
272 :
273 : char *next_newline;
274 35 : if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
275 2 : next_newline = cur_line + strlen(cur_line);
276 : }
277 35 : return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
278 : }
279 :
280 : void *
281 2151 : _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
282 : Py_ssize_t lineno, Py_ssize_t col_offset,
283 : Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
284 : const char *errmsg, va_list va)
285 : {
286 2151 : PyObject *value = NULL;
287 2151 : PyObject *errstr = NULL;
288 2151 : PyObject *error_line = NULL;
289 2151 : PyObject *tmp = NULL;
290 2151 : p->error_indicator = 1;
291 :
292 2151 : if (end_lineno == CURRENT_POS) {
293 11 : end_lineno = p->tok->lineno;
294 : }
295 2151 : if (end_col_offset == CURRENT_POS) {
296 11 : end_col_offset = p->tok->cur - p->tok->line_start;
297 : }
298 :
299 2151 : if (p->start_rule == Py_fstring_input) {
300 12 : const char *fstring_msg = "f-string: ";
301 12 : Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
302 :
303 12 : char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
304 12 : if (!new_errmsg) {
305 0 : return (void *) PyErr_NoMemory();
306 : }
307 :
308 : // Copy both strings into new buffer
309 12 : memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
310 12 : memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
311 12 : new_errmsg[len] = 0;
312 12 : errmsg = new_errmsg;
313 : }
314 2151 : errstr = PyUnicode_FromFormatV(errmsg, va);
315 2151 : if (!errstr) {
316 0 : goto error;
317 : }
318 :
319 2151 : if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
320 10 : error_line = get_error_line_from_tokenizer_buffers(p, lineno);
321 : }
322 2141 : else if (p->start_rule == Py_file_input) {
323 574 : error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
324 574 : (int) lineno, p->tok->encoding);
325 : }
326 :
327 2151 : if (!error_line) {
328 : /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
329 : then we need to find the error line from some other source, because
330 : p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
331 : failed or we're parsing from a string or the REPL. There's a third edge case where
332 : we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
333 : `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
334 : does not physically exist */
335 2101 : assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
336 :
337 4177 : if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
338 2076 : Py_ssize_t size = p->tok->inp - p->tok->buf;
339 2076 : error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
340 : }
341 25 : else if (p->tok->fp == NULL || p->tok->fp == stdin) {
342 25 : error_line = get_error_line_from_tokenizer_buffers(p, lineno);
343 : }
344 : else {
345 0 : error_line = PyUnicode_FromStringAndSize("", 0);
346 : }
347 2101 : if (!error_line) {
348 0 : goto error;
349 : }
350 : }
351 :
352 2151 : if (p->start_rule == Py_fstring_input) {
353 12 : col_offset -= p->starting_col_offset;
354 12 : end_col_offset -= p->starting_col_offset;
355 : }
356 :
357 2151 : Py_ssize_t col_number = col_offset;
358 2151 : Py_ssize_t end_col_number = end_col_offset;
359 :
360 2151 : if (p->tok->encoding != NULL) {
361 2092 : col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
362 2092 : if (col_number < 0) {
363 0 : goto error;
364 : }
365 2092 : if (end_col_number > 0) {
366 925 : Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
367 925 : if (end_col_offset < 0) {
368 0 : goto error;
369 : } else {
370 925 : end_col_number = end_col_offset;
371 : }
372 : }
373 : }
374 2151 : tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
375 2151 : if (!tmp) {
376 0 : goto error;
377 : }
378 2151 : value = PyTuple_Pack(2, errstr, tmp);
379 2151 : Py_DECREF(tmp);
380 2151 : if (!value) {
381 0 : goto error;
382 : }
383 2151 : PyErr_SetObject(errtype, value);
384 :
385 2151 : Py_DECREF(errstr);
386 2151 : Py_DECREF(value);
387 2151 : if (p->start_rule == Py_fstring_input) {
388 12 : PyMem_Free((void *)errmsg);
389 : }
390 2151 : return NULL;
391 :
392 0 : error:
393 0 : Py_XDECREF(errstr);
394 0 : Py_XDECREF(error_line);
395 0 : if (p->start_rule == Py_fstring_input) {
396 0 : PyMem_Free((void *)errmsg);
397 : }
398 0 : return NULL;
399 : }
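/*
 * Illustrative sketch, not part of pegen_errors.c: the value shape handed to
 * PyErr_SetObject() above is (message, (filename, lineno, col, text,
 * end_lineno, end_col)); SyntaxError accepts this six-element location tuple
 * since CPython 3.10. Every literal below is made up for illustration.
 * Assumes <Python.h> is included and the interpreter is initialized.
 */
static void
demo_syntax_error_value_shape(void)
{
    PyObject *msg = PyUnicode_FromString("demo message");
    PyObject *detail = Py_BuildValue("(siisii)",
                                     "<demo>",       /* filename */
                                     3, 5,           /* lineno, 1-based column */
                                     "x = (1 +\n",   /* offending source line */
                                     3, 9);          /* end_lineno, end column */
    if (msg == NULL || detail == NULL) {
        Py_XDECREF(msg);
        Py_XDECREF(detail);
        return;
    }
    PyObject *value = PyTuple_Pack(2, msg, detail);
    Py_DECREF(msg);
    Py_DECREF(detail);
    if (value == NULL) {
        return;
    }
    PyErr_SetObject(PyExc_SyntaxError, value);
    Py_DECREF(value);
}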
400 :
401 : void
402 2000 : _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
403 :     // Existing syntax error
404 2000 : if (PyErr_Occurred()) {
405 :         // Prioritize tokenizer errors over custom syntax errors raised
406 :         // in the second phase, but only if the existing error comes from the parser.
407 1499 : int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
408 1499 : if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
409 479 : _PyPegen_tokenize_full_source_to_check_for_errors(p);
410 : }
411 : // Propagate the existing syntax error.
412 1499 : return;
413 : }
414 : // Initialization error
415 501 : if (p->fill == 0) {
416 0 : RAISE_SYNTAX_ERROR("error at start before reading any input");
417 : }
418 :     // Parser encountered EOF (End of File) unexpectedly
419 501 : if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
420 0 : if (p->tok->level) {
421 0 : raise_unclosed_parentheses_error(p);
422 : } else {
423 0 : RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
424 : }
425 0 : return;
426 : }
427 : // Indentation error in the tokenizer
428 501 : if (last_token->type == INDENT || last_token->type == DEDENT) {
429 8 : RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
430 8 : return;
431 : }
432 : // Unknown error (generic case)
433 :
434 : // Use the last token we found on the first pass to avoid reporting
435 : // incorrect locations for generic syntax errors just because we reached
436 : // further away when trying to find specific syntax errors in the second
437 : // pass.
438 493 : RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
439 : // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
440 : // generic SyntaxError we just raised if errors are found.
441 493 : _PyPegen_tokenize_full_source_to_check_for_errors(p);
442 : }
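/*
 * Illustrative sketch, not part of pegen_errors.c: the INDENT/DEDENT branch of
 * _Pypegen_set_syntax_error() as seen when embedding CPython. The source text
 * and filename are made up; the expected result on recent CPython versions is
 * "IndentationError: unexpected indent" pointing at line 2.
 * Assumes <Python.h> is included and the interpreter is initialized.
 */
static void
demo_unexpected_indent(void)
{
    const char *src = "x = 1\n    y = 2\n";
    if (Py_CompileString(src, "<demo>", Py_file_input) == NULL) {
        PyErr_Print();
    }
}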