Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Parser/pegen_errors.c
Line
Count
Source (jump to first uncovered line)
1
#include <Python.h>
2
#include <errcode.h>
3
4
#include "tokenizer.h"
5
#include "pegen.h"
6
7
// TOKENIZER ERRORS
8
9
void
10
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
11
{
12
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
  Branch (12:11): [True: 5, False: 21]
13
          || 
PyErr_ExceptionMatches(PyExc_SyntaxError)21
  Branch (13:14): [True: 20, False: 1]
14
          || 
PyErr_ExceptionMatches(PyExc_ValueError)1
  Branch (14:14): [True: 1, False: 0]
15
          || 
PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)0
)) {
  Branch (15:14): [True: 0, False: 0]
16
        return;
17
    }
18
    PyObject *errstr = NULL;
19
    PyObject *tuple = NULL;
20
    PyObject *type;
21
    PyObject *value;
22
    PyObject *tback;
23
    PyErr_Fetch(&type, &value, &tback);
24
    errstr = PyObject_Str(value);
25
    if (!errstr) {
  Branch (25:9): [True: 0, False: 26]
26
        goto error;
27
    }
28
29
    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30
    if (!tmp) {
  Branch (30:9): [True: 0, False: 26]
31
        goto error;
32
    }
33
34
    tuple = PyTuple_Pack(2, errstr, tmp);
35
    Py_DECREF(tmp);
36
    if (!value) {
  Branch (36:9): [True: 0, False: 26]
37
        goto error;
38
    }
39
    PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41
error:
42
    Py_XDECREF(type);
43
    Py_XDECREF(value);
44
    Py_XDECREF(tback);
45
    Py_XDECREF(errstr);
46
    Py_XDECREF(tuple);
47
}
48
49
static inline void
50
raise_unclosed_parentheses_error(Parser *p) {
51
       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52
       int error_col = p->tok->parencolstack[p->tok->level-1];
53
       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54
                                  error_lineno, error_col, error_lineno, -1,
55
                                  "'%c' was never closed",
56
                                  p->tok->parenstack[p->tok->level-1]);
57
}
58
59
int
60
_Pypegen_tokenizer_error(Parser *p)
61
{
62
    if (PyErr_Occurred()) {
  Branch (62:9): [True: 198, False: 178]
63
        return -1;
64
    }
65
66
    const char *msg = NULL;
67
    PyObject* errtype = PyExc_SyntaxError;
68
    Py_ssize_t col_offset = -1;
69
    switch (p->tok->done) {
70
        case E_TOKEN:
  Branch (70:9): [True: 0, False: 178]
71
            msg = "invalid token";
72
            break;
73
        case E_EOF:
  Branch (73:9): [True: 163, False: 15]
74
            if (p->tok->level) {
  Branch (74:17): [True: 155, False: 8]
75
                raise_unclosed_parentheses_error(p);
76
            } else {
77
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78
            }
79
            return -1;
80
        case E_DEDENT:
  Branch (80:9): [True: 6, False: 172]
81
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82
            return -1;
83
        case E_INTR:
  Branch (83:9): [True: 0, False: 178]
84
            if (!PyErr_Occurred()) {
  Branch (84:17): [True: 0, False: 0]
85
                PyErr_SetNone(PyExc_KeyboardInterrupt);
86
            }
87
            return -1;
88
        case E_NOMEM:
  Branch (88:9): [True: 0, False: 178]
89
            PyErr_NoMemory();
90
            return -1;
91
        case E_TABSPACE:
  Branch (91:9): [True: 2, False: 176]
92
            errtype = PyExc_TabError;
93
            msg = "inconsistent use of tabs and spaces in indentation";
94
            break;
95
        case E_TOODEEP:
  Branch (95:9): [True: 0, False: 178]
96
            errtype = PyExc_IndentationError;
97
            msg = "too many levels of indentation";
98
            break;
99
        case E_LINECONT: {
  Branch (99:9): [True: 7, False: 171]
100
            col_offset = p->tok->cur - p->tok->buf - 1;
101
            msg = "unexpected character after line continuation character";
102
            break;
103
        }
104
        default:
  Branch (104:9): [True: 0, False: 178]
105
            msg = "unknown parsing error";
106
    }
107
108
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109
                               col_offset >= 0 ? 
col_offset7
:
02
,
  Branch (109:32): [True: 7, False: 2]
110
                               p->tok->lineno, -1, msg);
111
    return -1;
112
}
113
114
int
115
_Pypegen_raise_decode_error(Parser *p)
116
{
117
    assert(PyErr_Occurred());
118
    const char *errtype = NULL;
119
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
  Branch (119:9): [True: 40, False: 265]
120
        errtype = "unicode error";
121
    }
122
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
  Branch (122:14): [True: 2, False: 263]
123
        errtype = "value error";
124
    }
125
    if (errtype) {
  Branch (125:9): [True: 42, False: 263]
126
        PyObject *type;
127
        PyObject *value;
128
        PyObject *tback;
129
        PyObject *errstr;
130
        PyErr_Fetch(&type, &value, &tback);
131
        errstr = PyObject_Str(value);
132
        if (errstr) {
  Branch (132:13): [True: 42, False: 0]
133
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134
            Py_DECREF(errstr);
135
        }
136
        else {
137
            PyErr_Clear();
138
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139
        }
140
        Py_XDECREF(type);
141
        Py_XDECREF(value);
142
        Py_XDECREF(tback);
143
    }
144
145
    return -1;
146
}
147
148
static int
149
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150
    // Tokenize the whole input to see if there are any tokenization
151
    // errors such as mistmatching parentheses. These will get priority
152
    // over generic syntax errors only if the line number of the error is
153
    // before the one that we had for the generic error.
154
155
    // We don't want to tokenize to the end for interactive input
156
    if (p->tok->prompt != NULL) {
  Branch (156:9): [True: 0, False: 954]
157
        return 0;
158
    }
159
160
    PyObject *type, *value, *traceback;
161
    PyErr_Fetch(&type, &value, &traceback);
162
163
    Token *current_token = p->known_err_token != NULL ? 
p->known_err_token0
: p->tokens[p->fill - 1];
  Branch (163:28): [True: 0, False: 954]
164
    Py_ssize_t current_err_line = current_token->lineno;
165
166
    int ret = 0;
167
168
    for (;;) {
169
        const char *start;
170
        const char *end;
171
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172
            case ERRORTOKEN:
  Branch (172:13): [True: 31, False: 4.05k]
173
                if (p->tok->level != 0) {
  Branch (173:21): [True: 31, False: 0]
174
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
175
                    if (current_err_line > error_lineno) {
  Branch (175:25): [True: 3, False: 28]
176
                        raise_unclosed_parentheses_error(p);
177
                        ret = -1;
178
                        goto exit;
179
                    }
180
                }
181
                break;
182
            case ENDMARKER:
  Branch (182:13): [True: 923, False: 3.16k]
183
                break;
184
            default:
  Branch (184:13): [True: 3.13k, False: 954]
185
                continue;
186
        }
187
        break;
188
    }
189
190
191
exit:
192
    if (PyErr_Occurred()) {
  Branch (192:9): [True: 3, False: 951]
193
        Py_XDECREF(value);
194
        Py_XDECREF(type);
195
        Py_XDECREF(traceback);
196
    } else {
197
        PyErr_Restore(type, value, traceback);
198
    }
199
    return ret;
200
}
201
202
// PARSER ERRORS
203
204
void *
205
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
206
{
207
    if (p->fill == 0) {
  Branch (207:9): [True: 0, False: 634]
208
        va_list va;
209
        va_start(va, errmsg);
210
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
211
        va_end(va);
212
        return NULL;
213
    }
214
215
    Token *t = p->known_err_token != NULL ? 
p->known_err_token4
:
p->tokens[p->fill - 1]630
;
  Branch (215:16): [True: 4, False: 630]
216
    Py_ssize_t col_offset;
217
    Py_ssize_t end_col_offset = -1;
218
    if (t->col_offset == -1) {
  Branch (218:9): [True: 538, False: 96]
219
        if (p->tok->cur == p->tok->buf) {
  Branch (219:13): [True: 3, False: 535]
220
            col_offset = 0;
221
        } else {
222
            const char* start = p->tok->buf  ? p->tok->line_start : 
p->tok->buf0
;
  Branch (222:33): [True: 535, False: 0]
223
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
224
        }
225
    } else {
226
        col_offset = t->col_offset + 1;
227
    }
228
229
    if (t->end_col_offset != -1) {
  Branch (229:9): [True: 96, False: 538]
230
        end_col_offset = t->end_col_offset + 1;
231
    }
232
233
    va_list va;
234
    va_start(va, errmsg);
235
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
236
    va_end(va);
237
238
    return NULL;
239
}
240
241
static PyObject *
242
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
243
{
244
    /* If the file descriptor is interactive, the source lines of the current
245
     * (multi-line) statement are stored in p->tok->interactive_src_start.
246
     * If not, we're parsing from a string, which means that the whole source
247
     * is stored in p->tok->str. */
248
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
249
250
    char *cur_line = p->tok->fp_interactive ? 
p->tok->interactive_src_start0
: p->tok->str;
  Branch (250:22): [True: 0, False: 25]
251
    if (cur_line == NULL) {
  Branch (251:9): [True: 0, False: 25]
252
        assert(p->tok->fp_interactive);
253
        // We can reach this point if the tokenizer buffers for interactive source have not been
254
        // initialized because we failed to decode the original source with the given locale.
255
        return PyUnicode_FromStringAndSize("", 0);
256
    }
257
258
    Py_ssize_t relative_lineno = p->starting_lineno ? 
lineno - p->starting_lineno + 11
:
lineno24
;
  Branch (258:34): [True: 1, False: 24]
259
    const char* buf_end = p->tok->fp_interactive ? 
p->tok->interactive_src_end0
: p->tok->inp;
  Branch (259:27): [True: 0, False: 25]
260
261
    for (int i = 0; i < relative_lineno - 1; 
i++16
) {
  Branch (261:21): [True: 16, False: 25]
262
        char *new_line = strchr(cur_line, '\n');
263
        // The assert is here for debug builds but the conditional that
264
        // follows is there so in release builds we do not crash at the cost
265
        // to report a potentially wrong line.
266
        assert(new_line != NULL && new_line + 1 < buf_end);
267
        if (new_line == NULL || new_line + 1 > buf_end) {
  Branch (267:13): [True: 0, False: 16]
  Branch (267:33): [True: 0, False: 16]
268
            break;
269
        }
270
        cur_line = new_line + 1;
271
    }
272
273
    char *next_newline;
274
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
  Branch (274:9): [True: 2, False: 23]
275
        next_newline = cur_line + strlen(cur_line);
276
    }
277
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
278
}
279
280
void *
281
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
282
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
283
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
284
                                    const char *errmsg, va_list va)
285
{
286
    PyObject *value = NULL;
287
    PyObject *errstr = NULL;
288
    PyObject *error_line = NULL;
289
    PyObject *tmp = NULL;
290
    p->error_indicator = 1;
291
292
    if (end_lineno == CURRENT_POS) {
  Branch (292:9): [True: 11, False: 1.64k]
293
        end_lineno = p->tok->lineno;
294
    }
295
    if (end_col_offset == CURRENT_POS) {
  Branch (295:9): [True: 11, False: 1.64k]
296
        end_col_offset = p->tok->cur - p->tok->line_start;
297
    }
298
299
    if (p->start_rule == Py_fstring_input) {
  Branch (299:9): [True: 11, False: 1.64k]
300
        const char *fstring_msg = "f-string: ";
301
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
302
303
        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
304
        if (!new_errmsg) {
  Branch (304:13): [True: 0, False: 11]
305
            return (void *) PyErr_NoMemory();
306
        }
307
308
        // Copy both strings into new buffer
309
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
310
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
311
        new_errmsg[len] = 0;
312
        errmsg = new_errmsg;
313
    }
314
    errstr = PyUnicode_FromFormatV(errmsg, va);
315
    if (!errstr) {
  Branch (315:9): [True: 0, False: 1.65k]
316
        goto error;
317
    }
318
319
    if (p->tok->fp_interactive && 
p->tok->interactive_src_start != NULL0
) {
  Branch (319:9): [True: 0, False: 1.65k]
  Branch (319:35): [True: 0, False: 0]
320
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
321
    }
322
    else if (p->start_rule == Py_file_input) {
  Branch (322:14): [True: 555, False: 1.10k]
323
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
324
                                                     (int) lineno, p->tok->encoding);
325
    }
326
327
    if (!error_line) {
  Branch (327:9): [True: 1.63k, False: 24]
328
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
329
           then we need to find the error line from some other source, because
330
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
331
           failed or we're parsing from a string or the REPL. There's a third edge case where
332
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
333
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
334
           does not physically exist */
335
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
336
337
        if (p->tok->lineno <= lineno && 
p->tok->inp > p->tok->buf1.61k
) {
  Branch (337:13): [True: 1.61k, False: 23]
  Branch (337:41): [True: 1.60k, False: 2]
338
            Py_ssize_t size = p->tok->inp - p->tok->buf;
339
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
340
        }
341
        else if (p->tok->fp == NULL || 
p->tok->fp == stdin0
) {
  Branch (341:18): [True: 25, False: 0]
  Branch (341:40): [True: 0, False: 0]
342
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
343
        }
344
        else {
345
            error_line = PyUnicode_FromStringAndSize("", 0);
346
        }
347
        if (!error_line) {
  Branch (347:13): [True: 0, False: 1.63k]
348
            goto error;
349
        }
350
    }
351
352
    if (p->start_rule == Py_fstring_input) {
  Branch (352:9): [True: 11, False: 1.64k]
353
        col_offset -= p->starting_col_offset;
354
        end_col_offset -= p->starting_col_offset;
355
    }
356
357
    Py_ssize_t col_number = col_offset;
358
    Py_ssize_t end_col_number = end_col_offset;
359
360
    if (p->tok->encoding != NULL) {
  Branch (360:9): [True: 1.61k, False: 41]
361
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
362
        if (col_number < 0) {
  Branch (362:13): [True: 0, False: 1.61k]
363
            goto error;
364
        }
365
        if (end_col_number > 0) {
  Branch (365:13): [True: 922, False: 695]
366
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
367
            if (end_col_offset < 0) {
  Branch (367:17): [True: 0, False: 922]
368
                goto error;
369
            } else {
370
                end_col_number = end_col_offset;
371
            }
372
        }
373
    }
374
    tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
375
    if (!tmp) {
  Branch (375:9): [True: 0, False: 1.65k]
376
        goto error;
377
    }
378
    value = PyTuple_Pack(2, errstr, tmp);
379
    Py_DECREF(tmp);
380
    if (!value) {
  Branch (380:9): [True: 0, False: 1.65k]
381
        goto error;
382
    }
383
    PyErr_SetObject(errtype, value);
384
385
    Py_DECREF(errstr);
386
    Py_DECREF(value);
387
    if (p->start_rule == Py_fstring_input) {
  Branch (387:9): [True: 11, False: 1.64k]
388
        PyMem_Free((void *)errmsg);
389
    }
390
    return NULL;
391
392
error:
393
    Py_XDECREF(errstr);
394
    Py_XDECREF(error_line);
395
    if (p->start_rule == Py_fstring_input) {
  Branch (395:9): [True: 0, False: 0]
396
        PyMem_Free((void *)errmsg);
397
    }
398
    return NULL;
399
}
400
401
void
402
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
403
    // Existing sintax error
404
    if (PyErr_Occurred()) {
  Branch (404:9): [True: 1.00k, False: 493]
405
        // Prioritize tokenizer errors to custom syntax errors raised
406
        // on the second phase only if the errors come from the parser.
407
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
  Branch (407:26): [True: 0, False: 1.00k]
  Branch (407:52): [True: 469, False: 537]
408
        if (is_tok_ok && 
PyErr_ExceptionMatches(PyExc_SyntaxError)469
) {
  Branch (408:13): [True: 469, False: 537]
  Branch (408:26): [True: 469, False: 0]
409
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
410
        }
411
        // Propagate the existing syntax error.
412
        return;
413
    }
414
    // Initialization error
415
    if (p->fill == 0) {
  Branch (415:9): [True: 0, False: 493]
416
        RAISE_SYNTAX_ERROR("error at start before reading any input");
417
    }
418
    // Parser encountered EOF (End of File) unexpectedtly
419
    if (last_token->type == ERRORTOKEN && 
p->tok->done == 0
E_EOF0
) {
  Branch (419:9): [True: 0, False: 493]
  Branch (419:43): [True: 0, False: 0]
420
        if (p->tok->level) {
  Branch (420:13): [True: 0, False: 0]
421
            raise_unclosed_parentheses_error(p);
422
        } else {
423
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
424
        }
425
        return;
426
    }
427
    // Indentation error in the tokenizer
428
    if (last_token->type == INDENT || 
last_token->type == 485
DEDENT485
) {
  Branch (428:9): [True: 8, False: 485]
  Branch (428:39): [True: 0, False: 485]
429
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
430
        return;
431
    }
432
    // Unknown error (generic case)
433
434
    // Use the last token we found on the first pass to avoid reporting
435
    // incorrect locations for generic syntax errors just because we reached
436
    // further away when trying to find specific syntax errors in the second
437
    // pass.
438
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
439
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
440
    // generic SyntaxError we just raised if errors are found.
441
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
442
}