LCOV - code coverage report
Current view: top level - Parser - pegen_errors.c (source / functions) Hit Total Coverage
Test: CPython lcov report Lines: 199 247 80.6 %
Date: 2022-07-07 18:19:46 Functions: 9 9 100.0 %

          Line data    Source code
       1             : #include <Python.h>
       2             : #include <errcode.h>
       3             : 
       4             : #include "tokenizer.h"
       5             : #include "pegen.h"
       6             : 
       7             : // TOKENIZER ERRORS
       8             : 
       9             : void
      10          26 : _PyPegen_raise_tokenizer_init_error(PyObject *filename)
      11             : {
      12          26 :     if (!(PyErr_ExceptionMatches(PyExc_LookupError)
      13          21 :           || PyErr_ExceptionMatches(PyExc_SyntaxError)
      14           1 :           || PyErr_ExceptionMatches(PyExc_ValueError)
      15           0 :           || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
      16           0 :         return;
      17             :     }
      18          26 :     PyObject *errstr = NULL;
      19          26 :     PyObject *tuple = NULL;
      20             :     PyObject *type;
      21             :     PyObject *value;
      22             :     PyObject *tback;
      23          26 :     PyErr_Fetch(&type, &value, &tback);
      24          26 :     errstr = PyObject_Str(value);
      25          26 :     if (!errstr) {
      26           0 :         goto error;
      27             :     }
      28             : 
      29          26 :     PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
      30          26 :     if (!tmp) {
      31           0 :         goto error;
      32             :     }
      33             : 
      34          26 :     tuple = PyTuple_Pack(2, errstr, tmp);
      35          26 :     Py_DECREF(tmp);
      36          26 :     if (!value) {
      37           0 :         goto error;
      38             :     }
      39          26 :     PyErr_SetObject(PyExc_SyntaxError, tuple);
      40             : 
      41          26 : error:
      42          26 :     Py_XDECREF(type);
      43          26 :     Py_XDECREF(value);
      44          26 :     Py_XDECREF(tback);
      45          26 :     Py_XDECREF(errstr);
      46          26 :     Py_XDECREF(tuple);
      47             : }
      48             : 
      49             : static inline void
      50         170 : raise_unclosed_parentheses_error(Parser *p) {
      51         170 :        int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
      52         170 :        int error_col = p->tok->parencolstack[p->tok->level-1];
      53         170 :        RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
      54             :                                   error_lineno, error_col, error_lineno, -1,
      55             :                                   "'%c' was never closed",
      56         170 :                                   p->tok->parenstack[p->tok->level-1]);
      57         170 : }
      58             : 
// Translate the tokenizer's `done` status code into an appropriate Python
// exception (SyntaxError, TabError, IndentationError, KeyboardInterrupt or
// MemoryError).  Always returns -1 so callers can write
// `return _Pypegen_tokenizer_error(p);`.
int
_Pypegen_tokenizer_error(Parser *p)
{
    // If the tokenizer already set an exception, just propagate it.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF while a bracket is still open gets the dedicated
            // "'%c' was never closed" message; otherwise a plain EOF error.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the character that followed the backslash.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    // Cases that fall through the switch raise here, at the tokenizer's
    // current line; col_offset < 0 means "no specific column", mapped to 0.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
     113             : 
     114             : int
     115         774 : _Pypegen_raise_decode_error(Parser *p)
     116             : {
     117         774 :     assert(PyErr_Occurred());
     118         774 :     const char *errtype = NULL;
     119         774 :     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
     120         503 :         errtype = "unicode error";
     121             :     }
     122         271 :     else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
     123           2 :         errtype = "value error";
     124             :     }
     125         774 :     if (errtype) {
     126             :         PyObject *type;
     127             :         PyObject *value;
     128             :         PyObject *tback;
     129             :         PyObject *errstr;
     130         505 :         PyErr_Fetch(&type, &value, &tback);
     131         505 :         errstr = PyObject_Str(value);
     132         505 :         if (errstr) {
     133         505 :             RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
     134         505 :             Py_DECREF(errstr);
     135             :         }
     136             :         else {
     137           0 :             PyErr_Clear();
     138           0 :             RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
     139             :         }
     140         505 :         Py_XDECREF(type);
     141         505 :         Py_XDECREF(value);
     142         505 :         Py_XDECREF(tback);
     143             :     }
     144             : 
     145         774 :     return -1;
     146             : }
     147             : 
// Re-scan the remaining input with the tokenizer to surface tokenization
// errors (e.g. an unclosed bracket) that should take priority over a
// generic SyntaxError already raised by the parser.  Returns -1 if a new
// error was raised, 0 otherwise.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    PyObject *type, *value, *traceback;
    // Save any already-set exception: tokenizing below may clobber it.
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;

    for (;;) {
        const char *start;
        const char *end;
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
            case ERRORTOKEN:
                // Only prefer an unclosed-bracket error if the bracket was
                // opened before the line of the error we already know about.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    // If a new exception was raised above, keep it and drop the saved one;
    // otherwise put the original exception back.
    if (PyErr_Occurred()) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
     201             : 
     202             : // PARSER ERRORS
     203             : 
// Raise `errtype` with printf-style message `errmsg`, locating the error at
// the known-error token if one was recorded, else at the last token read.
// Always returns NULL so callers can `return _PyPegen_raise_error(...)`.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    // No tokens read yet: report at position (0, 0).
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    // The token may not carry a column (-1); fall back to the tokenizer's
    // current position within the line.
    if (t->col_offset == -1) {
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` but selects
            // `line_start` — looks like it was meant to test `line_start`;
            // confirm against upstream before changing.
            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Token columns are 0-based; error reporting is 1-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
     240             : 
// Return the source text of line `lineno` as a new Unicode object, read
// from the tokenizer's in-memory buffers (interactive source buffer or the
// whole input string).  Returns NULL with an exception set on failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // Interactive statements may not start at line 1 of the session.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Walk forward one '\n' at a time until cur_line points at the wanted line.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        // NOTE(review): the assert requires new_line + 1 < buf_end but the
        // release-mode guard only rejects new_line + 1 > buf_end (== passes);
        // confirm whether the boundary difference is intentional.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    // "replace" keeps the traceback usable even if the line is not valid UTF-8.
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
     279             : 
     280             : void *
     281        2151 : _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
     282             :                                     Py_ssize_t lineno, Py_ssize_t col_offset,
     283             :                                     Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
     284             :                                     const char *errmsg, va_list va)
     285             : {
     286        2151 :     PyObject *value = NULL;
     287        2151 :     PyObject *errstr = NULL;
     288        2151 :     PyObject *error_line = NULL;
     289        2151 :     PyObject *tmp = NULL;
     290        2151 :     p->error_indicator = 1;
     291             : 
     292        2151 :     if (end_lineno == CURRENT_POS) {
     293          11 :         end_lineno = p->tok->lineno;
     294             :     }
     295        2151 :     if (end_col_offset == CURRENT_POS) {
     296          11 :         end_col_offset = p->tok->cur - p->tok->line_start;
     297             :     }
     298             : 
     299        2151 :     if (p->start_rule == Py_fstring_input) {
     300          12 :         const char *fstring_msg = "f-string: ";
     301          12 :         Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
     302             : 
     303          12 :         char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
     304          12 :         if (!new_errmsg) {
     305           0 :             return (void *) PyErr_NoMemory();
     306             :         }
     307             : 
     308             :         // Copy both strings into new buffer
     309          12 :         memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
     310          12 :         memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
     311          12 :         new_errmsg[len] = 0;
     312          12 :         errmsg = new_errmsg;
     313             :     }
     314        2151 :     errstr = PyUnicode_FromFormatV(errmsg, va);
     315        2151 :     if (!errstr) {
     316           0 :         goto error;
     317             :     }
     318             : 
     319        2151 :     if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
     320          10 :         error_line = get_error_line_from_tokenizer_buffers(p, lineno);
     321             :     }
     322        2141 :     else if (p->start_rule == Py_file_input) {
     323         574 :         error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
     324         574 :                                                      (int) lineno, p->tok->encoding);
     325             :     }
     326             : 
     327        2151 :     if (!error_line) {
     328             :         /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
     329             :            then we need to find the error line from some other source, because
     330             :            p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
     331             :            failed or we're parsing from a string or the REPL. There's a third edge case where
     332             :            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
     333             :            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
     334             :            does not physically exist */
     335        2101 :         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
     336             : 
     337        4177 :         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
     338        2076 :             Py_ssize_t size = p->tok->inp - p->tok->buf;
     339        2076 :             error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
     340             :         }
     341          25 :         else if (p->tok->fp == NULL || p->tok->fp == stdin) {
     342          25 :             error_line = get_error_line_from_tokenizer_buffers(p, lineno);
     343             :         }
     344             :         else {
     345           0 :             error_line = PyUnicode_FromStringAndSize("", 0);
     346             :         }
     347        2101 :         if (!error_line) {
     348           0 :             goto error;
     349             :         }
     350             :     }
     351             : 
     352        2151 :     if (p->start_rule == Py_fstring_input) {
     353          12 :         col_offset -= p->starting_col_offset;
     354          12 :         end_col_offset -= p->starting_col_offset;
     355             :     }
     356             : 
     357        2151 :     Py_ssize_t col_number = col_offset;
     358        2151 :     Py_ssize_t end_col_number = end_col_offset;
     359             : 
     360        2151 :     if (p->tok->encoding != NULL) {
     361        2092 :         col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
     362        2092 :         if (col_number < 0) {
     363           0 :             goto error;
     364             :         }
     365        2092 :         if (end_col_number > 0) {
     366         925 :             Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
     367         925 :             if (end_col_offset < 0) {
     368           0 :                 goto error;
     369             :             } else {
     370         925 :                 end_col_number = end_col_offset;
     371             :             }
     372             :         }
     373             :     }
     374        2151 :     tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
     375        2151 :     if (!tmp) {
     376           0 :         goto error;
     377             :     }
     378        2151 :     value = PyTuple_Pack(2, errstr, tmp);
     379        2151 :     Py_DECREF(tmp);
     380        2151 :     if (!value) {
     381           0 :         goto error;
     382             :     }
     383        2151 :     PyErr_SetObject(errtype, value);
     384             : 
     385        2151 :     Py_DECREF(errstr);
     386        2151 :     Py_DECREF(value);
     387        2151 :     if (p->start_rule == Py_fstring_input) {
     388          12 :         PyMem_Free((void *)errmsg);
     389             :     }
     390        2151 :     return NULL;
     391             : 
     392           0 : error:
     393           0 :     Py_XDECREF(errstr);
     394           0 :     Py_XDECREF(error_line);
     395           0 :     if (p->start_rule == Py_fstring_input) {
     396           0 :         PyMem_Free((void *)errmsg);
     397             :     }
     398           0 :     return NULL;
     399             : }
     400             : 
// Entry point for reporting a parser failure: decide which SyntaxError
// (or IndentationError) best describes why parsing stopped, and raise it.
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}

Generated by: LCOV version 1.14