LCOV - CPython lcov report - Python/codecs.c

LCOV - code coverage report

Current view:	top level - Python - codecs.c (source / functions)		Hit	Total	Coverage
Test:	CPython lcov report	Lines:	664	794	83.6 %
Date:	2022-07-07 18:19:46	Functions:	46	50	92.0 %

          Line data    Source code

       1             : /* ------------------------------------------------------------------------
       2             : 
       3             :    Python Codec Registry and support functions
       4             : 
       5             : Written by Marc-Andre Lemburg (mal@lemburg.com).
       6             : 
       7             : Copyright (c) Corporation for National Research Initiatives.
       8             : 
       9             :    ------------------------------------------------------------------------ */
      10             : 
      11             : #include "Python.h"
      12             : #include "pycore_call.h"          // _PyObject_CallNoArgs()
      13             : #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
      14             : #include "pycore_pystate.h"       // _PyInterpreterState_GET()
      15             : #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
      16             : #include <ctype.h>
      17             : 
      18             : const char *Py_hexdigits = "0123456789abcdef";
      19             : 
      20             : /* --- Codec Registry ----------------------------------------------------- */
      21             : 
      22             : /* Import the standard encodings package which will register the first
      23             :    codec search function.
      24             : 
      25             :    This is done in a lazy way so that the Unicode implementation does
      26             :    not downgrade startup time of scripts not needing it.
      27             : 
      28             :    ImportErrors are silently ignored by this function. Only one try is
      29             :    made.
      30             : 
      31             : */
      32             : 
      33             : static int _PyCodecRegistry_Init(void); /* Forward */
      34             : 
      35        3412 : int PyCodec_Register(PyObject *search_function)
      36             : {
      37        3412 :     PyInterpreterState *interp = _PyInterpreterState_GET();
      38        3412 :     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
      39           0 :         goto onError;
      40        3412 :     if (search_function == NULL) {
      41           0 :         PyErr_BadArgument();
      42           0 :         goto onError;
      43             :     }
      44        3412 :     if (!PyCallable_Check(search_function)) {
      45           1 :         PyErr_SetString(PyExc_TypeError, "argument must be callable");
      46           1 :         goto onError;
      47             :     }
      48        3411 :     return PyList_Append(interp->codec_search_path, search_function);
      49             : 
      50           1 :  onError:
      51           1 :     return -1;
      52             : }
      53             : 
      54             : int
      55         280 : PyCodec_Unregister(PyObject *search_function)
      56             : {
      57         280 :     PyInterpreterState *interp = PyInterpreterState_Get();
      58         280 :     PyObject *codec_search_path = interp->codec_search_path;
      59             :     /* Do nothing if codec_search_path is not created yet or was cleared. */
      60         280 :     if (codec_search_path == NULL) {
      61           0 :         return 0;
      62             :     }
      63             : 
      64         280 :     assert(PyList_CheckExact(codec_search_path));
      65         280 :     Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
      66         560 :     for (Py_ssize_t i = 0; i < n; i++) {
      67         560 :         PyObject *item = PyList_GET_ITEM(codec_search_path, i);
      68         560 :         if (item == search_function) {
      69         280 :             if (interp->codec_search_cache != NULL) {
      70         280 :                 assert(PyDict_CheckExact(interp->codec_search_cache));
      71         280 :                 PyDict_Clear(interp->codec_search_cache);
      72             :             }
      73         280 :             return PyList_SetSlice(codec_search_path, i, i+1, NULL);
      74             :         }
      75             :     }
      76           0 :     return 0;
      77             : }
      78             : 
      79             : extern int _Py_normalize_encoding(const char *, char *, size_t);
      80             : 
      81             : /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
      82             :    converted to lower case, spaces and hyphens are replaced with underscores. */
      83             : 
      84             : static
      85     8047350 : PyObject *normalizestring(const char *string)
      86             : {
      87     8047350 :     size_t len = strlen(string);
      88             :     char *encoding;
      89             :     PyObject *v;
      90             : 
      91     8047350 :     if (len > PY_SSIZE_T_MAX) {
      92           0 :         PyErr_SetString(PyExc_OverflowError, "string is too large");
      93           0 :         return NULL;
      94             :     }
      95             : 
      96     8047350 :     encoding = PyMem_Malloc(len + 1);
      97     8047350 :     if (encoding == NULL)
      98           0 :         return PyErr_NoMemory();
      99             : 
     100     8047350 :     if (!_Py_normalize_encoding(string, encoding, len + 1))
     101             :     {
     102           0 :         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
     103           0 :         PyMem_Free(encoding);
     104           0 :         return NULL;
     105             :     }
     106             : 
     107     8047350 :     v = PyUnicode_FromString(encoding);
     108     8047350 :     PyMem_Free(encoding);
     109     8047350 :     return v;
     110             : }
     111             : 
     112             : /* Lookup the given encoding and return a tuple providing the codec
     113             :    facilities.
     114             : 
     115             :    The encoding string is looked up converted to all lower-case
     116             :    characters. This makes encodings looked up through this mechanism
     117             :    effectively case-insensitive.
     118             : 
     119             :    If no codec is found, a LookupError is set and NULL returned.
     120             : 
     121             :    As side effect, this tries to load the encodings package, if not
     122             :    yet done. This is part of the lazy load strategy for the encodings
     123             :    package.
     124             : 
     125             : */
     126             : 
     127     8047350 : PyObject *_PyCodec_Lookup(const char *encoding)
     128             : {
     129     8047350 :     if (encoding == NULL) {
     130           0 :         PyErr_BadArgument();
     131           0 :         return NULL;
     132             :     }
     133             : 
     134     8047350 :     PyInterpreterState *interp = _PyInterpreterState_GET();
     135     8047350 :     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
     136           0 :         return NULL;
     137             :     }
     138             : 
     139             :     /* Convert the encoding to a normalized Python string: all
     140             :        characters are converted to lower case, spaces and hyphens are
     141             :        replaced with underscores. */
     142     8047350 :     PyObject *v = normalizestring(encoding);
     143     8047350 :     if (v == NULL) {
     144           0 :         return NULL;
     145             :     }
     146     8047350 :     PyUnicode_InternInPlace(&v);
     147             : 
     148             :     /* First, try to lookup the name in the registry dictionary */
     149     8047350 :     PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
     150     8047350 :     if (result != NULL) {
     151     8040980 :         Py_INCREF(result);
     152     8040980 :         Py_DECREF(v);
     153     8040980 :         return result;
     154             :     }
     155        6369 :     else if (PyErr_Occurred()) {
     156           0 :         goto onError;
     157             :     }
     158             : 
     159             :     /* Next, scan the search functions in order of registration */
     160        6369 :     const Py_ssize_t len = PyList_Size(interp->codec_search_path);
     161        6369 :     if (len < 0)
     162           0 :         goto onError;
     163        6369 :     if (len == 0) {
     164           0 :         PyErr_SetString(PyExc_LookupError,
     165             :                         "no codec search functions registered: "
     166             :                         "can't find encoding");
     167           0 :         goto onError;
     168             :     }
     169             : 
     170             :     Py_ssize_t i;
     171        6460 :     for (i = 0; i < len; i++) {
     172             :         PyObject *func;
     173             : 
     174        6398 :         func = PyList_GetItem(interp->codec_search_path, i);
     175        6398 :         if (func == NULL)
     176           0 :             goto onError;
     177        6398 :         result = PyObject_CallOneArg(func, v);
     178        6398 :         if (result == NULL)
     179           0 :             goto onError;
     180        6398 :         if (result == Py_None) {
     181          91 :             Py_DECREF(result);
     182          91 :             continue;
     183             :         }
     184        6307 :         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
     185           1 :             PyErr_SetString(PyExc_TypeError,
     186             :                             "codec search functions must return 4-tuples");
     187           1 :             Py_DECREF(result);
     188           1 :             goto onError;
     189             :         }
     190        6306 :         break;
     191             :     }
     192        6368 :     if (i == len) {
     193             :         /* XXX Perhaps we should cache misses too ? */
     194          62 :         PyErr_Format(PyExc_LookupError,
     195             :                      "unknown encoding: %s", encoding);
     196          62 :         goto onError;
     197             :     }
     198             : 
     199             :     /* Cache and return the result */
     200        6306 :     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
     201           0 :         Py_DECREF(result);
     202           0 :         goto onError;
     203             :     }
     204        6306 :     Py_DECREF(v);
     205        6306 :     return result;
     206             : 
     207          63 :  onError:
     208          63 :     Py_DECREF(v);
     209          63 :     return NULL;
     210             : }
     211             : 
     212             : /* Codec registry encoding check API. */
     213             : 
     214           0 : int PyCodec_KnownEncoding(const char *encoding)
     215             : {
     216             :     PyObject *codecs;
     217             : 
     218           0 :     codecs = _PyCodec_Lookup(encoding);
     219           0 :     if (!codecs) {
     220           0 :         PyErr_Clear();
     221           0 :         return 0;
     222             :     }
     223             :     else {
     224           0 :         Py_DECREF(codecs);
     225           0 :         return 1;
     226             :     }
     227             : }
     228             : 
     229             : static
     230     1515180 : PyObject *args_tuple(PyObject *object,
     231             :                      const char *errors)
     232             : {
     233             :     PyObject *args;
     234             : 
     235     1515180 :     args = PyTuple_New(1 + (errors != NULL));
     236     1515180 :     if (args == NULL)
     237           0 :         return NULL;
     238     1515180 :     Py_INCREF(object);
     239     1515180 :     PyTuple_SET_ITEM(args,0,object);
     240     1515180 :     if (errors) {
     241             :         PyObject *v;
     242             : 
     243     1049270 :         v = PyUnicode_FromString(errors);
     244     1049270 :         if (v == NULL) {
     245           0 :             Py_DECREF(args);
     246           0 :             return NULL;
     247             :         }
     248     1049270 :         PyTuple_SET_ITEM(args, 1, v);
     249             :     }
     250     1515180 :     return args;
     251             : }
     252             : 
     253             : /* Helper function to get a codec item */
     254             : 
     255             : static
     256       10028 : PyObject *codec_getitem(const char *encoding, int index)
     257             : {
     258             :     PyObject *codecs;
     259             :     PyObject *v;
     260             : 
     261       10028 :     codecs = _PyCodec_Lookup(encoding);
     262       10028 :     if (codecs == NULL)
     263           3 :         return NULL;
     264       10025 :     v = PyTuple_GET_ITEM(codecs, index);
     265       10025 :     Py_DECREF(codecs);
     266       10025 :     Py_INCREF(v);
     267       10025 :     return v;
     268             : }
     269             : 
     270             : /* Helper functions to create an incremental codec. */
     271             : static
     272       40611 : PyObject *codec_makeincrementalcodec(PyObject *codec_info,
     273             :                                      const char *errors,
     274             :                                      const char *attrname)
     275             : {
     276             :     PyObject *ret, *inccodec;
     277             : 
     278       40611 :     inccodec = PyObject_GetAttrString(codec_info, attrname);
     279       40611 :     if (inccodec == NULL)
     280           0 :         return NULL;
     281       40611 :     if (errors)
     282       40411 :         ret = PyObject_CallFunction(inccodec, "s", errors);
     283             :     else
     284         200 :         ret = _PyObject_CallNoArgs(inccodec);
     285       40611 :     Py_DECREF(inccodec);
     286       40611 :     return ret;
     287             : }
     288             : 
     289             : static
     290         398 : PyObject *codec_getincrementalcodec(const char *encoding,
     291             :                                     const char *errors,
     292             :                                     const char *attrname)
     293             : {
     294             :     PyObject *codec_info, *ret;
     295             : 
     296         398 :     codec_info = _PyCodec_Lookup(encoding);
     297         398 :     if (codec_info == NULL)
     298           0 :         return NULL;
     299         398 :     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
     300         398 :     Py_DECREF(codec_info);
     301         398 :     return ret;
     302             : }
     303             : 
     304             : /* Helper function to create a stream codec. */
     305             : 
     306             : static
     307           0 : PyObject *codec_getstreamcodec(const char *encoding,
     308             :                                PyObject *stream,
     309             :                                const char *errors,
     310             :                                const int index)
     311             : {
     312             :     PyObject *codecs, *streamcodec, *codeccls;
     313             : 
     314           0 :     codecs = _PyCodec_Lookup(encoding);
     315           0 :     if (codecs == NULL)
     316           0 :         return NULL;
     317             : 
     318           0 :     codeccls = PyTuple_GET_ITEM(codecs, index);
     319           0 :     if (errors != NULL)
     320           0 :         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
     321             :     else
     322           0 :         streamcodec = PyObject_CallOneArg(codeccls, stream);
     323           0 :     Py_DECREF(codecs);
     324           0 :     return streamcodec;
     325             : }
     326             : 
     327             : /* Helpers to work with the result of _PyCodec_Lookup
     328             : 
     329             :  */
     330       15713 : PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
     331             :                                              const char *errors)
     332             : {
     333       15713 :     return codec_makeincrementalcodec(codec_info, errors,
     334             :                                       "incrementaldecoder");
     335             : }
     336             : 
     337       24500 : PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
     338             :                                              const char *errors)
     339             : {
     340       24500 :     return codec_makeincrementalcodec(codec_info, errors,
     341             :                                       "incrementalencoder");
     342             : }
     343             : 
     344             : 
     345             : /* Convenience APIs to query the Codec registry.
     346             : 
     347             :    All APIs return a codec object with incremented refcount.
     348             : 
     349             :  */
     350             : 
     351        9987 : PyObject *PyCodec_Encoder(const char *encoding)
     352             : {
     353        9987 :     return codec_getitem(encoding, 0);
     354             : }
     355             : 
     356          41 : PyObject *PyCodec_Decoder(const char *encoding)
     357             : {
     358          41 :     return codec_getitem(encoding, 1);
     359             : }
     360             : 
     361         199 : PyObject *PyCodec_IncrementalEncoder(const char *encoding,
     362             :                                      const char *errors)
     363             : {
     364         199 :     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
     365             : }
     366             : 
     367         199 : PyObject *PyCodec_IncrementalDecoder(const char *encoding,
     368             :                                      const char *errors)
     369             : {
     370         199 :     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
     371             : }
     372             : 
     373           0 : PyObject *PyCodec_StreamReader(const char *encoding,
     374             :                                PyObject *stream,
     375             :                                const char *errors)
     376             : {
     377           0 :     return codec_getstreamcodec(encoding, stream, errors, 2);
     378             : }
     379             : 
     380           0 : PyObject *PyCodec_StreamWriter(const char *encoding,
     381             :                                PyObject *stream,
     382             :                                const char *errors)
     383             : {
     384           0 :     return codec_getstreamcodec(encoding, stream, errors, 3);
     385             : }
     386             : 
     387             : /* Helper that tries to ensure the reported exception chain indicates the
     388             :  * codec that was invoked to trigger the failure without changing the type
     389             :  * of the exception raised.
     390             :  */
     391             : static void
     392         141 : wrap_codec_error(const char *operation,
     393             :                  const char *encoding)
     394             : {
     395             :     /* TrySetFromCause will replace the active exception with a suitably
     396             :      * updated clone if it can, otherwise it will leave the original
     397             :      * exception alone.
     398             :      */
     399         141 :     _PyErr_TrySetFromCause("%s with '%s' codec failed",
     400             :                            operation, encoding);
     401         141 : }
     402             : 
     403             : /* Encode an object (e.g. a Unicode object) using the given encoding
     404             :    and return the resulting encoded object (usually a Python string).
     405             : 
     406             :    errors is passed to the encoder factory as argument if non-NULL. */
     407             : 
     408             : static PyObject *
     409     1269880 : _PyCodec_EncodeInternal(PyObject *object,
     410             :                         PyObject *encoder,
     411             :                         const char *encoding,
     412             :                         const char *errors)
     413             : {
     414     1269880 :     PyObject *args = NULL, *result = NULL;
     415     1269880 :     PyObject *v = NULL;
     416             : 
     417     1269880 :     args = args_tuple(object, errors);
     418     1269880 :     if (args == NULL)
     419           0 :         goto onError;
     420             : 
     421     1269880 :     result = PyObject_Call(encoder, args, NULL);
     422     1269880 :     if (result == NULL) {
     423          63 :         wrap_codec_error("encoding", encoding);
     424          63 :         goto onError;
     425             :     }
     426             : 
     427     2539630 :     if (!PyTuple_Check(result) ||
     428     1269820 :         PyTuple_GET_SIZE(result) != 2) {
     429           1 :         PyErr_SetString(PyExc_TypeError,
     430             :                         "encoder must return a tuple (object, integer)");
     431           1 :         goto onError;
     432             :     }
     433     1269820 :     v = PyTuple_GET_ITEM(result,0);
     434     1269820 :     Py_INCREF(v);
     435             :     /* We don't check or use the second (integer) entry. */
     436             : 
     437     1269820 :     Py_DECREF(args);
     438     1269820 :     Py_DECREF(encoder);
     439     1269820 :     Py_DECREF(result);
     440     1269820 :     return v;
     441             : 
     442          64 :  onError:
     443          64 :     Py_XDECREF(result);
     444          64 :     Py_XDECREF(args);
     445          64 :     Py_XDECREF(encoder);
     446          64 :     return NULL;
     447             : }
     448             : 
     449             : /* Decode an object (usually a Python string) using the given encoding
     450             :    and return an equivalent object (e.g. a Unicode object).
     451             : 
     452             :    errors is passed to the decoder factory as argument if non-NULL. */
     453             : 
     454             : static PyObject *
     455      245301 : _PyCodec_DecodeInternal(PyObject *object,
     456             :                         PyObject *decoder,
     457             :                         const char *encoding,
     458             :                         const char *errors)
     459             : {
     460      245301 :     PyObject *args = NULL, *result = NULL;
     461             :     PyObject *v;
     462             : 
     463      245301 :     args = args_tuple(object, errors);
     464      245301 :     if (args == NULL)
     465           0 :         goto onError;
     466             : 
     467      245301 :     result = PyObject_Call(decoder, args, NULL);
     468      245301 :     if (result == NULL) {
     469          78 :         wrap_codec_error("decoding", encoding);
     470          78 :         goto onError;
     471             :     }
     472      490445 :     if (!PyTuple_Check(result) ||
     473      245222 :         PyTuple_GET_SIZE(result) != 2) {
     474           1 :         PyErr_SetString(PyExc_TypeError,
     475             :                         "decoder must return a tuple (object,integer)");
     476           1 :         goto onError;
     477             :     }
     478      245222 :     v = PyTuple_GET_ITEM(result,0);
     479      245222 :     Py_INCREF(v);
     480             :     /* We don't check or use the second (integer) entry. */
     481             : 
     482      245222 :     Py_DECREF(args);
     483      245222 :     Py_DECREF(decoder);
     484      245222 :     Py_DECREF(result);
     485      245222 :     return v;
     486             : 
     487          79 :  onError:
     488          79 :     Py_XDECREF(args);
     489          79 :     Py_XDECREF(decoder);
     490          79 :     Py_XDECREF(result);
     491          79 :     return NULL;
     492             : }
     493             : 
     494             : /* Generic encoding/decoding API */
     495        9987 : PyObject *PyCodec_Encode(PyObject *object,
     496             :                          const char *encoding,
     497             :                          const char *errors)
     498             : {
     499             :     PyObject *encoder;
     500             : 
     501        9987 :     encoder = PyCodec_Encoder(encoding);
     502        9987 :     if (encoder == NULL)
     503           2 :         return NULL;
     504             : 
     505        9985 :     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
     506             : }
     507             : 
     508          41 : PyObject *PyCodec_Decode(PyObject *object,
     509             :                          const char *encoding,
     510             :                          const char *errors)
     511             : {
     512             :     PyObject *decoder;
     513             : 
     514          41 :     decoder = PyCodec_Decoder(encoding);
     515          41 :     if (decoder == NULL)
     516           1 :         return NULL;
     517             : 
     518          40 :     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
     519             : }
     520             : 
     521             : /* Text encoding/decoding API */
     522     1542080 : PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
     523             :                                        const char *alternate_command)
     524             : {
     525             :     PyObject *codec;
     526             :     PyObject *attr;
     527             :     int is_text_codec;
     528             : 
     529     1542080 :     codec = _PyCodec_Lookup(encoding);
     530     1542080 :     if (codec == NULL)
     531           4 :         return NULL;
     532             : 
     533             :     /* Backwards compatibility: assume any raw tuple describes a text
     534             :      * encoding, and the same for anything lacking the private
     535             :      * attribute.
     536             :      */
     537     1542080 :     if (!PyTuple_CheckExact(codec)) {
     538     1542060 :         if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
     539           0 :             Py_DECREF(codec);
     540           0 :             return NULL;
     541             :         }
     542     1542060 :         if (attr != NULL) {
     543     1542060 :             is_text_codec = PyObject_IsTrue(attr);
     544     1542060 :             Py_DECREF(attr);
     545     1542060 :             if (is_text_codec <= 0) {
     546          22 :                 Py_DECREF(codec);
     547          22 :                 if (!is_text_codec)
     548          22 :                     PyErr_Format(PyExc_LookupError,
     549             :                                  "'%.400s' is not a text encoding; "
     550             :                                  "use %s to handle arbitrary codecs",
     551             :                                  encoding, alternate_command);
     552          22 :                 return NULL;
     553             :             }
     554             :         }
     555             :     }
     556             : 
     557             :     /* This appears to be a valid text encoding */
     558     1542050 :     return codec;
     559             : }
     560             : 
     561             : 
     562             : static
     563     1505180 : PyObject *codec_getitem_checked(const char *encoding,
     564             :                                 const char *alternate_command,
     565             :                                 int index)
     566             : {
     567             :     PyObject *codec;
     568             :     PyObject *v;
     569             : 
     570     1505180 :     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
     571     1505180 :     if (codec == NULL)
     572          21 :         return NULL;
     573             : 
     574     1505160 :     v = PyTuple_GET_ITEM(codec, index);
     575     1505160 :     Py_INCREF(v);
     576     1505160 :     Py_DECREF(codec);
     577     1505160 :     return v;
     578             : }
     579             : 
     580     1259900 : static PyObject * _PyCodec_TextEncoder(const char *encoding)
     581             : {
     582     1259900 :     return codec_getitem_checked(encoding, "codecs.encode()", 0);
     583             : }
     584             : 
     585      245275 : static PyObject * _PyCodec_TextDecoder(const char *encoding)
     586             : {
     587      245275 :     return codec_getitem_checked(encoding, "codecs.decode()", 1);
     588             : }
     589             : 
     590     1259900 : PyObject *_PyCodec_EncodeText(PyObject *object,
     591             :                               const char *encoding,
     592             :                               const char *errors)
     593             : {
     594             :     PyObject *encoder;
     595             : 
     596     1259900 :     encoder = _PyCodec_TextEncoder(encoding);
     597     1259900 :     if (encoder == NULL)
     598           7 :         return NULL;
     599             : 
     600     1259900 :     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
     601             : }
     602             : 
     603      245275 : PyObject *_PyCodec_DecodeText(PyObject *object,
     604             :                               const char *encoding,
     605             :                               const char *errors)
     606             : {
     607             :     PyObject *decoder;
     608             : 
     609      245275 :     decoder = _PyCodec_TextDecoder(encoding);
     610      245275 :     if (decoder == NULL)
     611          14 :         return NULL;
     612             : 
     613      245261 :     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
     614             : }
     615             : 
     616             : /* Register the error handling callback function error under the name
     617             :    name. This function will be called by the codec when it encounters
     618             :    an unencodable characters/undecodable bytes and doesn't know the
     619             :    callback name, when name is specified as the error parameter
     620             :    in the call to the encode/decode function.
     621             :    Return 0 on success, -1 on error */
     622       25275 : int PyCodec_RegisterError(const char *name, PyObject *error)
     623             : {
     624       25275 :     PyInterpreterState *interp = _PyInterpreterState_GET();
     625       25275 :     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     626           0 :         return -1;
     627       25275 :     if (!PyCallable_Check(error)) {
     628           1 :         PyErr_SetString(PyExc_TypeError, "handler must be callable");
     629           1 :         return -1;
     630             :     }
     631       25274 :     return PyDict_SetItemString(interp->codec_error_registry,
     632             :                                 name, error);
     633             : }
     634             : 
     635             : /* Look up the error handling callback function registered under
     636             :    name. As a special case NULL can be passed, in which case the
     637             :    error handling callback for "strict" will be returned. */
     638       51396 : PyObject *PyCodec_LookupError(const char *name)
     639             : {
     640       51396 :     PyObject *handler = NULL;
     641             : 
     642       51396 :     PyInterpreterState *interp = _PyInterpreterState_GET();
     643       51396 :     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     644           0 :         return NULL;
     645             : 
     646       51396 :     if (name==NULL)
     647        3871 :         name = "strict";
     648       51396 :     handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
     649       51396 :     if (handler) {
     650       51361 :         Py_INCREF(handler);
     651             :     }
     652          35 :     else if (!PyErr_Occurred()) {
     653          35 :         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
     654             :     }
     655       51396 :     return handler;
     656             : }
     657             : 
     658          62 : static void wrong_exception_type(PyObject *exc)
     659             : {
     660          62 :     PyErr_Format(PyExc_TypeError,
     661             :                  "don't know how to handle %.200s in error callback",
     662          62 :                  Py_TYPE(exc)->tp_name);
     663          62 : }
     664             : 
     665        5693 : PyObject *PyCodec_StrictErrors(PyObject *exc)
     666             : {
     667        5693 :     if (PyExceptionInstance_Check(exc))
     668        5689 :         PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
     669             :     else
     670           4 :         PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
     671        5693 :     return NULL;
     672             : }
     673             : 
     674             : 
     675        4012 : PyObject *PyCodec_IgnoreErrors(PyObject *exc)
     676             : {
     677             :     Py_ssize_t end;
     678             : 
     679        4012 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     680        3423 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
     681           0 :             return NULL;
     682             :     }
     683         589 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     684         580 :         if (PyUnicodeDecodeError_GetEnd(exc, &end))
     685           0 :             return NULL;
     686             :     }
     687           9 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     688           1 :         if (PyUnicodeTranslateError_GetEnd(exc, &end))
     689           0 :             return NULL;
     690             :     }
     691             :     else {
     692           8 :         wrong_exception_type(exc);
     693           8 :         return NULL;
     694             :     }
     695        4004 :     return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
     696             : }
     697             : 
     698             : 
     699        1966 : PyObject *PyCodec_ReplaceErrors(PyObject *exc)
     700             : {
     701             :     Py_ssize_t start, end, i, len;
     702             : 
     703        1966 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     704             :         PyObject *res;
     705             :         Py_UCS1 *outp;
     706        1024 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
     707           1 :             return NULL;
     708        1023 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
     709           0 :             return NULL;
     710        1023 :         len = end - start;
     711        1023 :         res = PyUnicode_New(len, '?');
     712        1023 :         if (res == NULL)
     713           0 :             return NULL;
     714        1023 :         assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
     715        1023 :         outp = PyUnicode_1BYTE_DATA(res);
     716        6042 :         for (i = 0; i < len; ++i)
     717        5019 :             outp[i] = '?';
     718        1023 :         assert(_PyUnicode_CheckConsistency(res, 1));
     719        1023 :         return Py_BuildValue("(Nn)", res, end);
     720             :     }
     721         942 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     722         933 :         if (PyUnicodeDecodeError_GetEnd(exc, &end))
     723           1 :             return NULL;
     724         932 :         return Py_BuildValue("(Cn)",
     725             :                              (int)Py_UNICODE_REPLACEMENT_CHARACTER,
     726             :                              end);
     727             :     }
     728           9 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     729             :         PyObject *res;
     730             :         Py_UCS2 *outp;
     731           1 :         if (PyUnicodeTranslateError_GetStart(exc, &start))
     732           0 :             return NULL;
     733           1 :         if (PyUnicodeTranslateError_GetEnd(exc, &end))
     734           0 :             return NULL;
     735           1 :         len = end - start;
     736           1 :         res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
     737           1 :         if (res == NULL)
     738           0 :             return NULL;
     739           1 :         assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
     740           1 :         outp = PyUnicode_2BYTE_DATA(res);
     741           2 :         for (i = 0; i < len; i++)
     742           1 :             outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
     743           1 :         assert(_PyUnicode_CheckConsistency(res, 1));
     744           1 :         return Py_BuildValue("(Nn)", res, end);
     745             :     }
     746             :     else {
     747           8 :         wrong_exception_type(exc);
     748           8 :         return NULL;
     749             :     }
     750             : }
     751             : 
     752        1077 : PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     753             : {
     754        1077 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     755             :         PyObject *restuple;
     756             :         PyObject *object;
     757             :         Py_ssize_t i;
     758             :         Py_ssize_t start;
     759             :         Py_ssize_t end;
     760             :         PyObject *res;
     761             :         Py_UCS1 *outp;
     762             :         Py_ssize_t ressize;
     763             :         Py_UCS4 ch;
     764        1067 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
     765           0 :             return NULL;
     766        1067 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
     767           0 :             return NULL;
     768        1067 :         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     769           0 :             return NULL;
     770        1067 :         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
     771           0 :             end = start + PY_SSIZE_T_MAX / (2+7+1);
     772        6159 :         for (i = start, ressize = 0; i < end; ++i) {
     773             :             /* object is guaranteed to be "ready" */
     774        5092 :             ch = PyUnicode_READ_CHAR(object, i);
     775        5092 :             if (ch<10)
     776           3 :                 ressize += 2+1+1;
     777        5089 :             else if (ch<100)
     778           2 :                 ressize += 2+2+1;
     779        5087 :             else if (ch<1000)
     780        1004 :                 ressize += 2+3+1;
     781        4083 :             else if (ch<10000)
     782          60 :                 ressize += 2+4+1;
     783        4023 :             else if (ch<100000)
     784        4014 :                 ressize += 2+5+1;
     785           9 :             else if (ch<1000000)
     786           6 :                 ressize += 2+6+1;
     787             :             else
     788           3 :                 ressize += 2+7+1;
     789             :         }
     790             :         /* allocate replacement */
     791        1067 :         res = PyUnicode_New(ressize, 127);
     792        1067 :         if (res == NULL) {
     793           0 :             Py_DECREF(object);
     794           0 :             return NULL;
     795             :         }
     796        1067 :         outp = PyUnicode_1BYTE_DATA(res);
     797             :         /* generate replacement */
     798        6159 :         for (i = start; i < end; ++i) {
     799             :             int digits;
     800             :             int base;
     801        5092 :             ch = PyUnicode_READ_CHAR(object, i);
     802        5092 :             *outp++ = '&';
     803        5092 :             *outp++ = '#';
     804        5092 :             if (ch<10) {
     805           3 :                 digits = 1;
     806           3 :                 base = 1;
     807             :             }
     808        5089 :             else if (ch<100) {
     809           2 :                 digits = 2;
     810           2 :                 base = 10;
     811             :             }
     812        5087 :             else if (ch<1000) {
     813        1004 :                 digits = 3;
     814        1004 :                 base = 100;
     815             :             }
     816        4083 :             else if (ch<10000) {
     817          60 :                 digits = 4;
     818          60 :                 base = 1000;
     819             :             }
     820        4023 :             else if (ch<100000) {
     821        4014 :                 digits = 5;
     822        4014 :                 base = 10000;
     823             :             }
     824           9 :             else if (ch<1000000) {
     825           6 :                 digits = 6;
     826           6 :                 base = 100000;
     827             :             }
     828             :             else {
     829           3 :                 digits = 7;
     830           3 :                 base = 1000000;
     831             :             }
     832       28478 :             while (digits-->0) {
     833       23386 :                 *outp++ = '0' + ch/base;
     834       23386 :                 ch %= base;
     835       23386 :                 base /= 10;
     836             :             }
     837        5092 :             *outp++ = ';';
     838             :         }
     839        1067 :         assert(_PyUnicode_CheckConsistency(res, 1));
     840        1067 :         restuple = Py_BuildValue("(Nn)", res, end);
     841        1067 :         Py_DECREF(object);
     842        1067 :         return restuple;
     843             :     }
     844             :     else {
     845          10 :         wrong_exception_type(exc);
     846          10 :         return NULL;
     847             :     }
     848             : }
     849             : 
     850        1121 : PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     851             : {
     852             :     PyObject *object;
     853             :     Py_ssize_t i;
     854             :     Py_ssize_t start;
     855             :     Py_ssize_t end;
     856             :     PyObject *res;
     857             :     Py_UCS1 *outp;
     858             :     int ressize;
     859             :     Py_UCS4 c;
     860             : 
     861        1121 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     862             :         const unsigned char *p;
     863          60 :         if (PyUnicodeDecodeError_GetStart(exc, &start))
     864           0 :             return NULL;
     865          60 :         if (PyUnicodeDecodeError_GetEnd(exc, &end))
     866           0 :             return NULL;
     867          60 :         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
     868           0 :             return NULL;
     869          60 :         p = (const unsigned char*)PyBytes_AS_STRING(object);
     870          60 :         res = PyUnicode_New(4 * (end - start), 127);
     871          60 :         if (res == NULL) {
     872           0 :             Py_DECREF(object);
     873           0 :             return NULL;
     874             :         }
     875          60 :         outp = PyUnicode_1BYTE_DATA(res);
     876         174 :         for (i = start; i < end; i++, outp += 4) {
     877         114 :             unsigned char c = p[i];
     878         114 :             outp[0] = '\\';
     879         114 :             outp[1] = 'x';
     880         114 :             outp[2] = Py_hexdigits[(c>>4)&0xf];
     881         114 :             outp[3] = Py_hexdigits[c&0xf];
     882             :         }
     883             : 
     884          60 :         assert(_PyUnicode_CheckConsistency(res, 1));
     885          60 :         Py_DECREF(object);
     886          60 :         return Py_BuildValue("(Nn)", res, end);
     887             :     }
     888        1061 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     889        1041 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
     890           0 :             return NULL;
     891        1041 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
     892           0 :             return NULL;
     893        1041 :         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     894           0 :             return NULL;
     895             :     }
     896          20 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     897          12 :         if (PyUnicodeTranslateError_GetStart(exc, &start))
     898           0 :             return NULL;
     899          12 :         if (PyUnicodeTranslateError_GetEnd(exc, &end))
     900           0 :             return NULL;
     901          12 :         if (!(object = PyUnicodeTranslateError_GetObject(exc)))
     902           0 :             return NULL;
     903             :     }
     904             :     else {
     905           8 :         wrong_exception_type(exc);
     906           8 :         return NULL;
     907             :     }
     908             : 
     909        1053 :     if (end - start > PY_SSIZE_T_MAX / (1+1+8))
     910           0 :         end = start + PY_SSIZE_T_MAX / (1+1+8);
     911        7108 :     for (i = start, ressize = 0; i < end; ++i) {
     912             :         /* object is guaranteed to be "ready" */
     913        6055 :         c = PyUnicode_READ_CHAR(object, i);
     914        6055 :         if (c >= 0x10000) {
     915           7 :             ressize += 1+1+8;
     916             :         }
     917        6048 :         else if (c >= 0x100) {
     918        5030 :             ressize += 1+1+4;
     919             :         }
     920             :         else
     921        1018 :             ressize += 1+1+2;
     922             :     }
     923        1053 :     res = PyUnicode_New(ressize, 127);
     924        1053 :     if (res == NULL) {
     925           0 :         Py_DECREF(object);
     926           0 :         return NULL;
     927             :     }
     928        1053 :     outp = PyUnicode_1BYTE_DATA(res);
     929        7108 :     for (i = start; i < end; ++i) {
     930        6055 :         c = PyUnicode_READ_CHAR(object, i);
     931        6055 :         *outp++ = '\\';
     932        6055 :         if (c >= 0x00010000) {
     933           7 :             *outp++ = 'U';
     934           7 :             *outp++ = Py_hexdigits[(c>>28)&0xf];
     935           7 :             *outp++ = Py_hexdigits[(c>>24)&0xf];
     936           7 :             *outp++ = Py_hexdigits[(c>>20)&0xf];
     937           7 :             *outp++ = Py_hexdigits[(c>>16)&0xf];
     938           7 :             *outp++ = Py_hexdigits[(c>>12)&0xf];
     939           7 :             *outp++ = Py_hexdigits[(c>>8)&0xf];
     940             :         }
     941        6048 :         else if (c >= 0x100) {
     942        5030 :             *outp++ = 'u';
     943        5030 :             *outp++ = Py_hexdigits[(c>>12)&0xf];
     944        5030 :             *outp++ = Py_hexdigits[(c>>8)&0xf];
     945             :         }
     946             :         else
     947        1018 :             *outp++ = 'x';
     948        6055 :         *outp++ = Py_hexdigits[(c>>4)&0xf];
     949        6055 :         *outp++ = Py_hexdigits[c&0xf];
     950             :     }
     951             : 
     952        1053 :     assert(_PyUnicode_CheckConsistency(res, 1));
     953        1053 :     Py_DECREF(object);
     954        1053 :     return Py_BuildValue("(Nn)", res, end);
     955             : }
     956             : 
     957             : static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
     958             : 
     959        2039 : PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
     960             : {
     961        2039 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     962             :         PyObject *restuple;
     963             :         PyObject *object;
     964             :         Py_ssize_t i;
     965             :         Py_ssize_t start;
     966             :         Py_ssize_t end;
     967             :         PyObject *res;
     968             :         Py_UCS1 *outp;
     969             :         Py_ssize_t ressize;
     970             :         int replsize;
     971             :         Py_UCS4 c;
     972             :         char buffer[256]; /* NAME_MAXLEN */
     973        2029 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
     974           0 :             return NULL;
     975        2029 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
     976           0 :             return NULL;
     977        2029 :         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     978           0 :             return NULL;
     979        2029 :         if (!ucnhash_capi) {
     980             :             /* load the unicode data module */
     981           2 :             ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
     982             :                                             PyUnicodeData_CAPSULE_NAME, 1);
     983           2 :             if (!ucnhash_capi) {
     984           0 :                 return NULL;
     985             :             }
     986             :         }
     987       12059 :         for (i = start, ressize = 0; i < end; ++i) {
     988             :             /* object is guaranteed to be "ready" */
     989       10030 :             c = PyUnicode_READ_CHAR(object, i);
     990       10030 :             if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
     991       10013 :                 replsize = 1+1+1+(int)strlen(buffer)+1;
     992             :             }
     993          17 :             else if (c >= 0x10000) {
     994           4 :                 replsize = 1+1+8;
     995             :             }
     996          13 :             else if (c >= 0x100) {
     997          12 :                 replsize = 1+1+4;
     998             :             }
     999             :             else
    1000           1 :                 replsize = 1+1+2;
    1001       10030 :             if (ressize > PY_SSIZE_T_MAX - replsize)
    1002           0 :                 break;
    1003       10030 :             ressize += replsize;
    1004             :         }
    1005        2029 :         end = i;
    1006        2029 :         res = PyUnicode_New(ressize, 127);
    1007        2029 :         if (res==NULL)
    1008           0 :             return NULL;
    1009        2029 :         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
    1010       12059 :             i < end; ++i) {
    1011       10030 :             c = PyUnicode_READ_CHAR(object, i);
    1012       10030 :             *outp++ = '\\';
    1013       10030 :             if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
    1014       10013 :                 *outp++ = 'N';
    1015       10013 :                 *outp++ = '{';
    1016       10013 :                 strcpy((char *)outp, buffer);
    1017       10013 :                 outp += strlen(buffer);
    1018       10013 :                 *outp++ = '}';
    1019       10013 :                 continue;
    1020             :             }
    1021          17 :             if (c >= 0x00010000) {
    1022           4 :                 *outp++ = 'U';
    1023           4 :                 *outp++ = Py_hexdigits[(c>>28)&0xf];
    1024           4 :                 *outp++ = Py_hexdigits[(c>>24)&0xf];
    1025           4 :                 *outp++ = Py_hexdigits[(c>>20)&0xf];
    1026           4 :                 *outp++ = Py_hexdigits[(c>>16)&0xf];
    1027           4 :                 *outp++ = Py_hexdigits[(c>>12)&0xf];
    1028           4 :                 *outp++ = Py_hexdigits[(c>>8)&0xf];
    1029             :             }
    1030          13 :             else if (c >= 0x100) {
    1031          12 :                 *outp++ = 'u';
    1032          12 :                 *outp++ = Py_hexdigits[(c>>12)&0xf];
    1033          12 :                 *outp++ = Py_hexdigits[(c>>8)&0xf];
    1034             :             }
    1035             :             else
    1036           1 :                 *outp++ = 'x';
    1037          17 :             *outp++ = Py_hexdigits[(c>>4)&0xf];
    1038          17 :             *outp++ = Py_hexdigits[c&0xf];
    1039             :         }
    1040             : 
    1041        2029 :         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
    1042        2029 :         assert(_PyUnicode_CheckConsistency(res, 1));
    1043        2029 :         restuple = Py_BuildValue("(Nn)", res, end);
    1044        2029 :         Py_DECREF(object);
    1045        2029 :         return restuple;
    1046             :     }
    1047             :     else {
    1048          10 :         wrong_exception_type(exc);
    1049          10 :         return NULL;
    1050             :     }
    1051             : }
    1052             : 
    1053             : #define ENC_UNKNOWN     -1
    1054             : #define ENC_UTF8        0
    1055             : #define ENC_UTF16BE     1
    1056             : #define ENC_UTF16LE     2
    1057             : #define ENC_UTF32BE     3
    1058             : #define ENC_UTF32LE     4
    1059             : 
    1060             : static int
    1061        1493 : get_standard_encoding(const char *encoding, int *bytelength)
    1062             : {
    1063        1493 :     if (Py_TOLOWER(encoding[0]) == 'u' &&
    1064        1490 :         Py_TOLOWER(encoding[1]) == 't' &&
    1065        1490 :         Py_TOLOWER(encoding[2]) == 'f') {
    1066        1490 :         encoding += 3;
    1067        1490 :         if (*encoding == '-' || *encoding == '_' )
    1068        1490 :             encoding++;
    1069        1490 :         if (encoding[0] == '8' && encoding[1] == '\0') {
    1070        1374 :             *bytelength = 3;
    1071        1374 :             return ENC_UTF8;
    1072             :         }
    1073         116 :         else if (encoding[0] == '1' && encoding[1] == '6') {
    1074          50 :             encoding += 2;
    1075          50 :             *bytelength = 2;
    1076          50 :             if (*encoding == '\0') {
    1077             : #ifdef WORDS_BIGENDIAN
    1078             :                 return ENC_UTF16BE;
    1079             : #else
    1080           5 :                 return ENC_UTF16LE;
    1081             : #endif
    1082             :             }
    1083          45 :             if (*encoding == '-' || *encoding == '_' )
    1084          29 :                 encoding++;
    1085          45 :             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
    1086          45 :                 if (Py_TOLOWER(encoding[0]) == 'b')
    1087          18 :                     return ENC_UTF16BE;
    1088          27 :                 if (Py_TOLOWER(encoding[0]) == 'l')
    1089          27 :                     return ENC_UTF16LE;
    1090             :             }
    1091             :         }
    1092          66 :         else if (encoding[0] == '3' && encoding[1] == '2') {
    1093          66 :             encoding += 2;
    1094          66 :             *bytelength = 4;
    1095          66 :             if (*encoding == '\0') {
    1096             : #ifdef WORDS_BIGENDIAN
    1097             :                 return ENC_UTF32BE;
    1098             : #else
    1099           5 :                 return ENC_UTF32LE;
    1100             : #endif
    1101             :             }
    1102          61 :             if (*encoding == '-' || *encoding == '_' )
    1103          45 :                 encoding++;
    1104          61 :             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
    1105          61 :                 if (Py_TOLOWER(encoding[0]) == 'b')
    1106          22 :                     return ENC_UTF32BE;
    1107          39 :                 if (Py_TOLOWER(encoding[0]) == 'l')
    1108          39 :                     return ENC_UTF32LE;
    1109             :             }
    1110             :         }
    1111             :     }
    1112           3 :     else if (strcmp(encoding, "CP_UTF8") == 0) {
    1113           0 :         *bytelength = 3;
    1114           0 :         return ENC_UTF8;
    1115             :     }
    1116           3 :     return ENC_UNKNOWN;
    1117             : }
    1118             : 
    1119             : /* This handler is declared static until someone demonstrates
    1120             :    a need to call it directly. */
    1121             : static PyObject *
    1122        1502 : PyCodec_SurrogatePassErrors(PyObject *exc)
    1123             : {
    1124             :     PyObject *restuple;
    1125             :     PyObject *object;
    1126             :     PyObject *encode;
    1127             :     const char *encoding;
    1128             :     int code;
    1129             :     int bytelength;
    1130             :     Py_ssize_t i;
    1131             :     Py_ssize_t start;
    1132             :     Py_ssize_t end;
    1133             :     PyObject *res;
    1134             : 
    1135        1502 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    1136             :         unsigned char *outp;
    1137          53 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
    1138           0 :             return NULL;
    1139          53 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    1140           0 :             return NULL;
    1141          53 :         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    1142           0 :             return NULL;
    1143          53 :         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
    1144           0 :             Py_DECREF(object);
    1145           0 :             return NULL;
    1146             :         }
    1147          53 :         if (!(encoding = PyUnicode_AsUTF8(encode))) {
    1148           0 :             Py_DECREF(object);
    1149           0 :             Py_DECREF(encode);
    1150           0 :             return NULL;
    1151             :         }
    1152          53 :         code = get_standard_encoding(encoding, &bytelength);
    1153          53 :         Py_DECREF(encode);
    1154          53 :         if (code == ENC_UNKNOWN) {
    1155             :             /* Not supported, fail with original exception */
    1156           3 :             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1157           3 :             Py_DECREF(object);
    1158           3 :             return NULL;
    1159             :         }
    1160             : 
    1161          50 :         if (end - start > PY_SSIZE_T_MAX / bytelength)
    1162           0 :             end = start + PY_SSIZE_T_MAX / bytelength;
    1163          50 :         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
    1164          50 :         if (!res) {
    1165           0 :             Py_DECREF(object);
    1166           0 :             return NULL;
    1167             :         }
    1168          50 :         outp = (unsigned char*)PyBytes_AsString(res);
    1169         100 :         for (i = start; i < end; i++) {
    1170             :             /* object is guaranteed to be "ready" */
    1171          55 :             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
    1172          55 :             if (!Py_UNICODE_IS_SURROGATE(ch)) {
    1173             :                 /* Not a surrogate, fail with original exception */
    1174           5 :                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1175           5 :                 Py_DECREF(res);
    1176           5 :                 Py_DECREF(object);
    1177           5 :                 return NULL;
    1178             :             }
    1179          50 :             switch (code) {
    1180           4 :             case ENC_UTF8:
    1181           4 :                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
    1182           4 :                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
    1183           4 :                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
    1184           4 :                 break;
    1185          14 :             case ENC_UTF16LE:
    1186          14 :                 *outp++ = (unsigned char) ch;
    1187          14 :                 *outp++ = (unsigned char)(ch >> 8);
    1188          14 :                 break;
    1189           9 :             case ENC_UTF16BE:
    1190           9 :                 *outp++ = (unsigned char)(ch >> 8);
    1191           9 :                 *outp++ = (unsigned char) ch;
    1192           9 :                 break;
    1193          14 :             case ENC_UTF32LE:
    1194          14 :                 *outp++ = (unsigned char) ch;
    1195          14 :                 *outp++ = (unsigned char)(ch >> 8);
    1196          14 :                 *outp++ = (unsigned char)(ch >> 16);
    1197          14 :                 *outp++ = (unsigned char)(ch >> 24);
    1198          14 :                 break;
    1199           9 :             case ENC_UTF32BE:
    1200           9 :                 *outp++ = (unsigned char)(ch >> 24);
    1201           9 :                 *outp++ = (unsigned char)(ch >> 16);
    1202           9 :                 *outp++ = (unsigned char)(ch >> 8);
    1203           9 :                 *outp++ = (unsigned char) ch;
    1204           9 :                 break;
    1205             :             }
    1206          50 :         }
    1207          45 :         restuple = Py_BuildValue("(On)", res, end);
    1208          45 :         Py_DECREF(res);
    1209          45 :         Py_DECREF(object);
    1210          45 :         return restuple;
    1211             :     }
    1212        1449 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    1213             :         const unsigned char *p;
    1214        1440 :         Py_UCS4 ch = 0;
    1215        1440 :         if (PyUnicodeDecodeError_GetStart(exc, &start))
    1216           0 :             return NULL;
    1217        1440 :         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    1218           0 :             return NULL;
    1219        1440 :         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
    1220           0 :             return NULL;
    1221        1440 :         p = (const unsigned char*)PyBytes_AS_STRING(object);
    1222        1440 :         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
    1223           0 :             Py_DECREF(object);
    1224           0 :             return NULL;
    1225             :         }
    1226        1440 :         if (!(encoding = PyUnicode_AsUTF8(encode))) {
    1227           0 :             Py_DECREF(object);
    1228           0 :             Py_DECREF(encode);
    1229           0 :             return NULL;
    1230             :         }
    1231        1440 :         code = get_standard_encoding(encoding, &bytelength);
    1232        1440 :         Py_DECREF(encode);
    1233        1440 :         if (code == ENC_UNKNOWN) {
    1234             :             /* Not supported, fail with original exception */
    1235           0 :             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1236           0 :             Py_DECREF(object);
    1237           0 :             return NULL;
    1238             :         }
    1239             : 
    1240             :         /* Try decoding a single surrogate character. If
    1241             :            there are more, let the codec call us again. */
    1242        1440 :         p += start;
    1243        1440 :         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
    1244        1437 :             switch (code) {
    1245        1367 :             case ENC_UTF8:
    1246        1367 :                 if ((p[0] & 0xf0) == 0xe0 &&
    1247        1365 :                     (p[1] & 0xc0) == 0x80 &&
    1248        1365 :                     (p[2] & 0xc0) == 0x80) {
    1249             :                     /* it's a three-byte code */
    1250        1363 :                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
    1251             :                 }
    1252        1367 :                 break;
    1253          18 :             case ENC_UTF16LE:
    1254          18 :                 ch = p[1] << 8 | p[0];
    1255          18 :                 break;
    1256           9 :             case ENC_UTF16BE:
    1257           9 :                 ch = p[0] << 8 | p[1];
    1258           9 :                 break;
    1259          30 :             case ENC_UTF32LE:
    1260          30 :                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
    1261          30 :                 break;
    1262          13 :             case ENC_UTF32BE:
    1263          13 :                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
    1264          13 :                 break;
    1265             :             }
    1266           3 :         }
    1267             : 
    1268        1440 :         Py_DECREF(object);
    1269        1440 :         if (!Py_UNICODE_IS_SURROGATE(ch)) {
    1270             :             /* it's not a surrogate - fail */
    1271          11 :             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1272          11 :             return NULL;
    1273             :         }
    1274        1429 :         res = PyUnicode_FromOrdinal(ch);
    1275        1429 :         if (res == NULL)
    1276           0 :             return NULL;
    1277        1429 :         return Py_BuildValue("(Nn)", res, start + bytelength);
    1278             :     }
    1279             :     else {
    1280           9 :         wrong_exception_type(exc);
    1281           9 :         return NULL;
    1282             :     }
    1283             : }
    1284             : 
    1285             : static PyObject *  /* 'surrogateescape' handler: maps U+DC80..U+DCFF <-> raw bytes 0x80..0xFF */
    1286          75 : PyCodec_SurrogateEscapeErrors(PyObject *exc)  /* exc: UnicodeEncodeError or UnicodeDecodeError instance */
    1287             : { /* Returns a new (replacement, position) tuple, or NULL with an exception set. */
    1288             :     PyObject *restuple;  /* result tuple (encode branch) */
    1289             :     PyObject *object;    /* the str/bytes being processed, fetched from exc */
    1290             :     Py_ssize_t i;        /* index into object (encode branch) */
    1291             :     Py_ssize_t start;    /* first offending position, read from exc */
    1292             :     Py_ssize_t end;      /* one past the last offending position, read from exc */
    1293             :     PyObject *res;       /* bytes replacement built in the encode branch */
    1294             : 
    1295          75 :     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {  /* encoding: str -> bytes */
    1296             :         char *outp;  /* write cursor into res's buffer */
    1297          63 :         if (PyUnicodeEncodeError_GetStart(exc, &start))
    1298           0 :             return NULL;
    1299          63 :         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    1300           0 :             return NULL;
    1301          63 :         if (!(object = PyUnicodeEncodeError_GetObject(exc)))  /* reference is released on every path below */
    1302           0 :             return NULL;
    1303          63 :         res = PyBytes_FromStringAndSize(NULL, end-start);  /* one output byte per character in [start, end) */
    1304          63 :         if (!res) {
    1305           0 :             Py_DECREF(object);
    1306           0 :             return NULL;
    1307             :         }
    1308          63 :         outp = PyBytes_AsString(res);
    1309          65 :         for (i = start; i < end; i++) {
    1310             :             /* object is guaranteed to be "ready" */
    1311          63 :             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
    1312          63 :             if (ch < 0xdc80 || ch > 0xdcff) {
    1313             :                 /* Not a UTF-8b surrogate, fail with original exception */
    1314          61 :                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);  /* re-raise exc itself */
    1315          61 :                 Py_DECREF(res);
    1316          61 :                 Py_DECREF(object);
    1317          61 :                 return NULL;
    1318             :             }
    1319           2 :             *outp++ = ch - 0xdc00;  /* U+DC80..U+DCFF -> byte 0x80..0xFF */
    1320             :         }
    1321           2 :         restuple = Py_BuildValue("(On)", res, end);  /* "O" takes its own reference to res, so ours is dropped next */
    1322           2 :         Py_DECREF(res);
    1323           2 :         Py_DECREF(object);
    1324           2 :         return restuple;
    1325             :     }
    1326          12 :     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {  /* decoding: bytes -> str */
    1327             :         PyObject *str;  /* replacement string made of escaped bytes */
    1328             :         const unsigned char *p;  /* raw bytes of object */
    1329             :         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
    1330           3 :         int consumed = 0;  /* number of bytes escaped so far */
    1331           3 :         if (PyUnicodeDecodeError_GetStart(exc, &start))
    1332           0 :             return NULL;
    1333           3 :         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    1334           0 :             return NULL;
    1335           3 :         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
    1336           0 :             return NULL;
    1337           3 :         p = (const unsigned char*)PyBytes_AS_STRING(object);  /* only dereferenced before the Py_DECREF(object) below */
    1338           5 :         while (consumed < 4 && consumed < end-start) {  /* stop after 4 bytes or at the end of the error range */
    1339             :             /* Refuse to escape ASCII bytes. */
    1340           3 :             if (p[start+consumed] < 128)
    1341           1 :                 break;
    1342           2 :             ch[consumed] = 0xdc00 + p[start+consumed];  /* byte 0x80..0xFF -> U+DC80..U+DCFF */
    1343           2 :             consumed++;
    1344             :         }
    1345           3 :         Py_DECREF(object);
    1346           3 :         if (!consumed) {
    1347             :             /* codec complained about ASCII byte. */
    1348           1 :             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);  /* re-raise exc itself */
    1349           1 :             return NULL;
    1350             :         }
    1351           2 :         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);  /* str of the `consumed` code units in ch */
    1352           2 :         if (str == NULL)
    1353           0 :             return NULL;
    1354           2 :         return Py_BuildValue("(Nn)", str, start+consumed);  /* "N" hands our reference to str over to the tuple */
    1355             :     }
    1356             :     else {
    1357           9 :         wrong_exception_type(exc);  /* helper defined outside this view; presumably sets a TypeError -- confirm */
    1358           9 :         return NULL;
    1359             :     }
    1360             : }
    1361             : 
    1362             : 
    1363        4405 : static PyObject *strict_errors(PyObject *self, PyObject *exc)
    1364             : { /* METH_O adapter registered as "strict": `self` is unused, `exc` is forwarded unchanged. */
    1365        4405 :     return PyCodec_StrictErrors(exc);
    1366             : }
    1367             : 
    1368             : 
    1369        4012 : static PyObject *ignore_errors(PyObject *self, PyObject *exc)
    1370             : { /* METH_O adapter registered as "ignore": `self` is unused, `exc` is forwarded unchanged. */
    1371        4012 :     return PyCodec_IgnoreErrors(exc);
    1372             : }
    1373             : 
    1374             : 
    1375        1966 : static PyObject *replace_errors(PyObject *self, PyObject *exc)
    1376             : { /* METH_O adapter registered as "replace": `self` is unused, `exc` is forwarded unchanged. */
    1377        1966 :     return PyCodec_ReplaceErrors(exc);
    1378             : }
    1379             : 
    1380             : 
    1381        1077 : static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
    1382             : { /* METH_O adapter registered as "xmlcharrefreplace": `self` is unused, `exc` is forwarded unchanged. */
    1383        1077 :     return PyCodec_XMLCharRefReplaceErrors(exc);
    1384             : }
    1385             : 
    1386             : 
    1387        1121 : static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
    1388             : { /* METH_O adapter registered as "backslashreplace": `self` is unused, `exc` is forwarded unchanged. */
    1389        1121 :     return PyCodec_BackslashReplaceErrors(exc);
    1390             : }
    1391             : 
    1392        2039 : static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
    1393             : { /* METH_O adapter registered as "namereplace": `self` is unused, `exc` is forwarded unchanged. */
    1394        2039 :     return PyCodec_NameReplaceErrors(exc);
    1395             : }
    1396             : 
    1397        1502 : static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
    1398             : { /* METH_O adapter registered as "surrogatepass": `self` is unused, `exc` is forwarded unchanged. */
    1399        1502 :     return PyCodec_SurrogatePassErrors(exc);
    1400             : }
    1401             : 
    1402          75 : static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
    1403             : { /* METH_O adapter registered as "surrogateescape": `self` is unused, `exc` is forwarded unchanged. */
    1404          75 :     return PyCodec_SurrogateEscapeErrors(exc);
    1405             : }
    1406             : 
    1407        3130 : static int _PyCodecRegistry_Init(void)  /* Builds the current interpreter's codec registry state. */
    1408             : { /* Returns 0 on success or when the search path already exists, -1 on any failure. */
    1409             :     static struct {  /* table of the built-in error handlers */
    1410             :         const char *name;  /* key passed to PyCodec_RegisterError() */
    1411             :         PyMethodDef def;   /* method definition wrapped by PyCFunction_NewEx() */
    1412             :     } methods[] =
    1413             :     {
    1414             :         {
    1415             :             "strict",
    1416             :             {
    1417             :                 "strict_errors",
    1418             :                 strict_errors,
    1419             :                 METH_O,
    1420             :                 PyDoc_STR("Implements the 'strict' error handling, which "
    1421             :                           "raises a UnicodeError on coding errors.")
    1422             :             }
    1423             :         },
    1424             :         {
    1425             :             "ignore",
    1426             :             {
    1427             :                 "ignore_errors",
    1428             :                 ignore_errors,
    1429             :                 METH_O,
    1430             :                 PyDoc_STR("Implements the 'ignore' error handling, which "
    1431             :                           "ignores malformed data and continues.")
    1432             :             }
    1433             :         },
    1434             :         {
    1435             :             "replace",
    1436             :             {
    1437             :                 "replace_errors",
    1438             :                 replace_errors,
    1439             :                 METH_O,
    1440             :                 PyDoc_STR("Implements the 'replace' error handling, which "
    1441             :                           "replaces malformed data with a replacement marker.")
    1442             :             }
    1443             :         },
    1444             :         {
    1445             :             "xmlcharrefreplace",
    1446             :             {
    1447             :                 "xmlcharrefreplace_errors",
    1448             :                 xmlcharrefreplace_errors,
    1449             :                 METH_O,
    1450             :                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
    1451             :                           "which replaces an unencodable character with the "
    1452             :                           "appropriate XML character reference.")
    1453             :             }
    1454             :         },
    1455             :         {
    1456             :             "backslashreplace",
    1457             :             {
    1458             :                 "backslashreplace_errors",
    1459             :                 backslashreplace_errors,
    1460             :                 METH_O,
    1461             :                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
    1462             :                           "which replaces malformed data with a backslashed "
    1463             :                           "escape sequence.")
    1464             :             }
    1465             :         },
    1466             :         {
    1467             :             "namereplace",
    1468             :             {
    1469             :                 "namereplace_errors",
    1470             :                 namereplace_errors,
    1471             :                 METH_O,
    1472             :                 PyDoc_STR("Implements the 'namereplace' error handling, "
    1473             :                           "which replaces an unencodable character with a "
    1474             :                           "\\N{...} escape sequence.")
    1475             :             }
    1476             :         },
    1477             :         {
    1478             :             "surrogatepass",
    1479             :             {
    1480             :                 "surrogatepass",  /* unlike the entries above, the method name has no "_errors" suffix */
    1481             :                 surrogatepass_errors,
    1482             :                 METH_O  /* no docstring initializer: the doc field is zero-initialized (NULL) */
    1483             :             }
    1484             :         },
    1485             :         {
    1486             :             "surrogateescape",
    1487             :             {
    1488             :                 "surrogateescape",  /* unlike the entries above, the method name has no "_errors" suffix */
    1489             :                 surrogateescape_errors,
    1490             :                 METH_O  /* no docstring initializer: the doc field is zero-initialized (NULL) */
    1491             :             }
    1492             :         }
    1493             :     };
    1494             : 
    1495        3130 :     PyInterpreterState *interp = _PyInterpreterState_GET();
    1496             :     PyObject *mod;  /* the imported "encodings" module */
    1497             : 
    1498        3130 :     if (interp->codec_search_path != NULL)  /* search path already created for this interpreter */
    1499           0 :         return 0;
    1500             : 
    1501        3130 :     interp->codec_search_path = PyList_New(0);  /* starts empty */
    1502        3130 :     if (interp->codec_search_path == NULL) {
    1503           0 :         return -1;
    1504             :     }
    1505             : 
    1506        3130 :     interp->codec_search_cache = PyDict_New();
    1507        3130 :     if (interp->codec_search_cache == NULL) {
    1508           0 :         return -1;  /* NOTE(review): codec_search_path stays set, so a retry would hit the early "return 0" -- confirm callers treat -1 as fatal */
    1509             :     }
    1510             : 
    1511        3130 :     interp->codec_error_registry = PyDict_New();
    1512        3130 :     if (interp->codec_error_registry == NULL) {
    1513           0 :         return -1;  /* NOTE(review): same partial-initialization concern as the previous failure path */
    1514             :     }
    1515             : 
    1516       28170 :     for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {  /* register every entry of the table above */
    1517       25040 :         PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);  /* callable with no bound self and no module */
    1518       25040 :         if (!func) {
    1519           0 :             return -1;
    1520             :         }
    1521             : 
    1522       25040 :         int res = PyCodec_RegisterError(methods[i].name, func);
    1523       25040 :         Py_DECREF(func);  /* our reference is dropped whether or not registration succeeded */
    1524       25040 :         if (res) {
    1525           0 :             return -1;
    1526             :         }
    1527             :     }
    1528             : 
    1529        3130 :     mod = PyImport_ImportModule("encodings");  /* per this file's registry comment, the package registers the first codec search function */
    1530        3130 :     if (mod == NULL) {
    1531           0 :         return -1;
    1532             :     }
    1533        3130 :     Py_DECREF(mod);  /* the module object itself is not used here */
    1534        3130 :     interp->codecs_initialized = 1;  /* set only after every step above succeeded */
    1535        3130 :     return 0;
    1536             : }

Generated by: LCOV version 1.14