LCOV - CPython lcov report - Objects/unicodeobject.c

LCOV - code coverage report

Current view:	top level - Objects - unicodeobject.c (source / functions)		Hit	Total	Coverage
Test:	CPython lcov report	Lines:	5789	6951	83.3 %
Date:	2022-07-07 18:19:46	Functions:	293	321	91.3 %

          Line data    Source code

       1             : /*
       2             : 
       3             : Unicode implementation based on original code by Fredrik Lundh,
       4             : modified by Marc-Andre Lemburg <mal@lemburg.com>.
       5             : 
       6             : Major speed upgrades to the method implementations at the Reykjavik
       7             : NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
       8             : 
       9             : Copyright (c) Corporation for National Research Initiatives.
      10             : 
      11             : --------------------------------------------------------------------
      12             : The original string type implementation is:
      13             : 
      14             :   Copyright (c) 1999 by Secret Labs AB
      15             :   Copyright (c) 1999 by Fredrik Lundh
      16             : 
      17             : By obtaining, using, and/or copying this software and/or its
      18             : associated documentation, you agree that you have read, understood,
      19             : and will comply with the following terms and conditions:
      20             : 
      21             : Permission to use, copy, modify, and distribute this software and its
      22             : associated documentation for any purpose and without fee is hereby
      23             : granted, provided that the above copyright notice appears in all
      24             : copies, and that both that copyright notice and this permission notice
      25             : appear in supporting documentation, and that the name of Secret Labs
      26             : AB or the author not be used in advertising or publicity pertaining to
      27             : distribution of the software without specific, written prior
      28             : permission.
      29             : 
      30             : SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
      31             : THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      32             : FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
      33             : ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      34             : WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      35             : ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
      36             : OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      37             : --------------------------------------------------------------------
      38             : 
      39             : */
      40             : 
      41             : #define PY_SSIZE_T_CLEAN
      42             : #include "Python.h"
      43             : #include "pycore_abstract.h"      // _PyIndex_Check()
      44             : #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
      45             : #include "pycore_bytesobject.h"   // _PyBytes_Repeat()
      46             : #include "pycore_bytes_methods.h" // _Py_bytes_lower()
      47             : #include "pycore_format.h"        // F_LJUST
      48             : #include "pycore_initconfig.h"    // _PyStatus_OK()
      49             : #include "pycore_interp.h"        // PyInterpreterState.fs_codec
      50             : #include "pycore_long.h"          // _PyLong_FormatWriter()
      51             : #include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
      52             : #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
      53             : #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
      54             : #include "pycore_pystate.h"       // _PyInterpreterState_GET()
      55             : #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
      56             : #include "pycore_unicodeobject.h" // struct _Py_unicode_state
      57             : #include "stringlib/eq.h"         // unicode_eq()
      58             : 
      59             : #ifdef MS_WINDOWS
      60             : #include <windows.h>
      61             : #endif
      62             : 
      63             : #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
      64             : #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
      65             : #endif
      66             : 
      67             : /* Uncomment to display statistics on interned strings at exit
      68             :    in _PyUnicode_ClearInterned(). */
      69             : /* #define INTERNED_STATS 1 */
      70             : 
      71             : 
      72             : /*[clinic input]
      73             : class str "PyObject *" "&PyUnicode_Type"
      74             : [clinic start generated code]*/
      75             : /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
      76             : 
      77             : /*[python input]
      78             : class Py_UCS4_converter(CConverter):
      79             :     type = 'Py_UCS4'
      80             :     converter = 'convert_uc'
      81             : 
      82             :     def converter_init(self):
      83             :         if self.default is not unspecified:
      84             :             self.c_default = ascii(self.default)
      85             :             if len(self.c_default) > 4 or self.c_default[0] != "'":
      86             :                 self.c_default = hex(ord(self.default))
      87             : 
      88             : [python start generated code]*/
      89             : /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
      90             : 
      91             : /* --- Globals ------------------------------------------------------------
      92             : 
      93             : NOTE: In the interpreter's initialization phase, some globals are currently
      94             :       initialized dynamically as needed. In the process Unicode objects may
      95             :       be created before the Unicode type is ready.
      96             : 
      97             : */
      98             : 
      99             : 
     100             : #ifdef __cplusplus
     101             : extern "C" {
     102             : #endif
     103             : 
     104             : // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
     105             : // The value must be the same in fileutils.c.
     106             : #define MAX_UNICODE 0x10ffff
     107             : 
     108             : #ifdef Py_DEBUG
     109             : #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
     110             : #else
     111             : #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
     112             : #endif
     113             : 
     114             : #define _PyUnicode_UTF8(op)                             \
     115             :     (_PyCompactUnicodeObject_CAST(op)->utf8)
     116             : #define PyUnicode_UTF8(op)                              \
     117             :     (assert(_PyUnicode_CHECK(op)),                      \
     118             :      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     119             :          ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
     120             :          _PyUnicode_UTF8(op))
     121             : #define _PyUnicode_UTF8_LENGTH(op)                      \
     122             :     (_PyCompactUnicodeObject_CAST(op)->utf8_length)
     123             : #define PyUnicode_UTF8_LENGTH(op)                       \
     124             :     (assert(_PyUnicode_CHECK(op)),                      \
     125             :      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     126             :          _PyASCIIObject_CAST(op)->length :              \
     127             :          _PyUnicode_UTF8_LENGTH(op))
     128             : 
     129             : #define _PyUnicode_LENGTH(op)                           \
     130             :     (_PyASCIIObject_CAST(op)->length)
     131             : #define _PyUnicode_STATE(op)                            \
     132             :     (_PyASCIIObject_CAST(op)->state)
     133             : #define _PyUnicode_HASH(op)                             \
     134             :     (_PyASCIIObject_CAST(op)->hash)
     135             : #define _PyUnicode_KIND(op)                             \
     136             :     (assert(_PyUnicode_CHECK(op)),                      \
     137             :      _PyASCIIObject_CAST(op)->state.kind)
     138             : #define _PyUnicode_GET_LENGTH(op)                       \
     139             :     (assert(_PyUnicode_CHECK(op)),                      \
     140             :      _PyASCIIObject_CAST(op)->length)
     141             : #define _PyUnicode_DATA_ANY(op)                         \
     142             :     (_PyUnicodeObject_CAST(op)->data.any)
     143             : 
     144             : #define _PyUnicode_SHARE_UTF8(op)                       \
     145             :     (assert(_PyUnicode_CHECK(op)),                      \
     146             :      assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     147             :      (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
     148             : 
     149             : /* true if the Unicode object has an allocated UTF-8 memory block
     150             :    (not shared with other data) */
     151             : #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
     152             :     ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
     153             :       && _PyUnicode_UTF8(op)                            \
     154             :       && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
     155             : 
     156             : /* Generic helper macro to convert characters of different types.
     157             :    from_type and to_type have to be valid type names, begin and end
     158             :    are pointers to the source characters which should be of type
     159             :    "from_type *".  to is a pointer of type "to_type *" and points to the
     160             :    buffer where the result characters are written to. */
     161             : #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
     162             :     do {                                                \
     163             :         to_type *_to = (to_type *)(to);                 \
     164             :         const from_type *_iter = (const from_type *)(begin);\
     165             :         const from_type *_end = (const from_type *)(end);\
     166             :         Py_ssize_t n = (_end) - (_iter);                \
     167             :         const from_type *_unrolled_end =                \
     168             :             _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
     169             :         while (_iter < (_unrolled_end)) {               \
     170             :             _to[0] = (to_type) _iter[0];                \
     171             :             _to[1] = (to_type) _iter[1];                \
     172             :             _to[2] = (to_type) _iter[2];                \
     173             :             _to[3] = (to_type) _iter[3];                \
     174             :             _iter += 4; _to += 4;                       \
     175             :         }                                               \
     176             :         while (_iter < (_end))                          \
     177             :             *_to++ = (to_type) *_iter++;                \
     178             :     } while (0)
     179             : 
     180             : #define LATIN1(ch)  \
     181             :     (ch < 128 \
     182             :      ? (PyObject*)&_Py_SINGLETON(strings).ascii[ch] \
     183             :      : (PyObject*)&_Py_SINGLETON(strings).latin1[ch - 128])
     184             : 
     185             : #ifdef MS_WINDOWS
     186             :    /* On Windows, overallocating by 50% is the best factor */
     187             : #  define OVERALLOCATE_FACTOR 2
     188             : #else
     189             :    /* On Linux, overallocating by 25% is the best factor */
     190             : #  define OVERALLOCATE_FACTOR 4
     191             : #endif
     192             : 
     193             : /* This dictionary holds all interned unicode strings.  Note that references
     194             :    to strings in this dictionary are *not* counted in the string's ob_refcnt.
     195             :    When the interned string reaches a refcnt of 0 the string deallocation
     196             :    function will delete the reference from this dictionary.
     197             : 
     198             :    Another way to look at this is to say that the actual reference
     199             :    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
     200             : */
     201             : static PyObject *interned = NULL;
     202             : 
     203             : /* Forward declaration */
     204             : static inline int
     205             : _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
     206             : static inline void
     207             : _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
     208             : static PyObject *
     209             : unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
     210             :                     const char *errors);
     211             : static PyObject *
     212             : unicode_decode_utf8(const char *s, Py_ssize_t size,
     213             :                     _Py_error_handler error_handler, const char *errors,
     214             :                     Py_ssize_t *consumed);
     215             : #ifdef Py_DEBUG
     216             : static inline int unicode_is_finalizing(void);
     217             : static int unicode_is_singleton(PyObject *unicode);
     218             : #endif
     219             : 
     220             : 
     221             : // Return a borrowed reference to the empty string singleton: the pointer is handed back as-is, without a Py_INCREF().
     222    40681700 : static inline PyObject* unicode_get_empty(void)
     223             : {
     224             :     _Py_DECLARE_STR(empty, "");
     225    40681700 :     return &_Py_STR(empty);  // address of the object named by _Py_STR(empty); no reference count change
     226             : }
     227             : 
     228             : 
     229             : // Return a strong reference to the empty string singleton: same object as unicode_get_empty(), with its reference count incremented for the caller.
     230     5934120 : static inline PyObject* unicode_new_empty(void)
     231             : {
     232     5934120 :     PyObject *empty = unicode_get_empty();
     233     5934120 :     Py_INCREF(empty);  // turns the borrowed reference into one the caller owns
     234     5934120 :     return empty;
     235             : }
     236             : 
     237             : #define _Py_RETURN_UNICODE_EMPTY()   \
     238             :     do {                             \
     239             :         return unicode_new_empty();  \
     240             :     } while (0)
     241             : 
     242             : static inline void  /* Store `value` into `length` consecutive characters of `data`, beginning at index `start`. */
     243      409637 : unicode_fill(int kind, void *data, Py_UCS4 value,  /* kind: one of the PyUnicode_*_KIND cases handled below; it selects the element type used to address data */
     244             :              Py_ssize_t start, Py_ssize_t length)
     245             : {
     246      409637 :     assert(0 <= start);
     247      409637 :     switch (kind) {
     248      409148 :     case PyUnicode_1BYTE_KIND: {
     249      409148 :         assert(value <= 0xff);  /* value must fit in a Py_UCS1 */
     250      409148 :         Py_UCS1 ch = (unsigned char)value;
     251      409148 :         Py_UCS1 *to = (Py_UCS1 *)data + start;
     252      409148 :         memset(to, ch, length);  /* 1-byte kind: memset() does the whole fill */
     253      409148 :         break;
     254             :     }
     255         461 :     case PyUnicode_2BYTE_KIND: {
     256         461 :         assert(value <= 0xffff);  /* value must fit in a Py_UCS2 */
     257         461 :         Py_UCS2 ch = (Py_UCS2)value;
     258         461 :         Py_UCS2 *to = (Py_UCS2 *)data + start;
     259         461 :         const Py_UCS2 *end = to + length;
     260       10753 :         for (; to < end; ++to) *to = ch;  /* wider kinds are filled one element at a time */
     261         461 :         break;
     262             :     }
     263          28 :     case PyUnicode_4BYTE_KIND: {
     264          28 :         assert(value <= MAX_UNICODE);  /* value must be a valid code point */
     265          28 :         Py_UCS4 ch = value;
     266          28 :         Py_UCS4 * to = (Py_UCS4 *)data + start;
     267          28 :         const Py_UCS4 *end = to + length;
     268         113 :         for (; to < end; ++to) *to = ch;
     269          28 :         break;
     270             :     }
     271           0 :     default: Py_UNREACHABLE();  /* any other kind value is treated as impossible */
     272             :     }
     273      409637 : }
     274             : 
     275             : 
     276             : /* Fast detection of the most frequent whitespace characters: 128-entry table indexed by code point (0x00..0x7F); an entry is 1 for the whitespace characters named in the comments below and 0 otherwise. */
     277             : const unsigned char _Py_ascii_whitespace[] = {
     278             :     0, 0, 0, 0, 0, 0, 0, 0,
     279             : /*     case 0x0009: * CHARACTER TABULATION */
     280             : /*     case 0x000A: * LINE FEED */
     281             : /*     case 0x000B: * LINE TABULATION */
     282             : /*     case 0x000C: * FORM FEED */
     283             : /*     case 0x000D: * CARRIAGE RETURN */
     284             :     0, 1, 1, 1, 1, 1, 0, 0,
     285             :     0, 0, 0, 0, 0, 0, 0, 0,
     286             : /*     case 0x001C: * FILE SEPARATOR */
     287             : /*     case 0x001D: * GROUP SEPARATOR */
     288             : /*     case 0x001E: * RECORD SEPARATOR */
     289             : /*     case 0x001F: * UNIT SEPARATOR */
     290             :     0, 0, 0, 0, 1, 1, 1, 1,
     291             : /*     case 0x0020: * SPACE */
     292             :     1, 0, 0, 0, 0, 0, 0, 0,
     293             :     0, 0, 0, 0, 0, 0, 0, 0,
     294             :     0, 0, 0, 0, 0, 0, 0, 0,
     295             :     0, 0, 0, 0, 0, 0, 0, 0,
     296             : 
     297             :     0, 0, 0, 0, 0, 0, 0, 0,
     298             :     0, 0, 0, 0, 0, 0, 0, 0,
     299             :     0, 0, 0, 0, 0, 0, 0, 0,
     300             :     0, 0, 0, 0, 0, 0, 0, 0,
     301             :     0, 0, 0, 0, 0, 0, 0, 0,
     302             :     0, 0, 0, 0, 0, 0, 0, 0,
     303             :     0, 0, 0, 0, 0, 0, 0, 0,
     304             :     0, 0, 0, 0, 0, 0, 0, 0
     305             : };
     306             : 
     307             : /* forward */
     308             : static PyObject* get_latin1_char(unsigned char ch);
     309             : static int unicode_modifiable(PyObject *unicode);
     310             : 
     311             : 
     312             : static PyObject *
     313             : _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
     314             : static PyObject *
     315             : _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
     316             : static PyObject *
     317             : _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
     318             : 
     319             : static PyObject *
     320             : unicode_encode_call_errorhandler(const char *errors,
     321             :        PyObject **errorHandler,const char *encoding, const char *reason,
     322             :        PyObject *unicode, PyObject **exceptionObject,
     323             :        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
     324             : 
     325             : static void
     326             : raise_encode_exception(PyObject **exceptionObject,
     327             :                        const char *encoding,
     328             :                        PyObject *unicode,
     329             :                        Py_ssize_t startpos, Py_ssize_t endpos,
     330             :                        const char *reason);
     331             : 
     332             : /* Fast detection of the most frequent line break characters: 128-entry table indexed by code point (0x00..0x7F); an entry is 1 for the line breaks named in the comments below and 0 otherwise. */
     333             : static const unsigned char ascii_linebreak[] = {
     334             :     0, 0, 0, 0, 0, 0, 0, 0,
     335             : /*         0x000A, * LINE FEED */
     336             : /*         0x000B, * LINE TABULATION */
     337             : /*         0x000C, * FORM FEED */
     338             : /*         0x000D, * CARRIAGE RETURN */
     339             :     0, 0, 1, 1, 1, 1, 0, 0,
     340             :     0, 0, 0, 0, 0, 0, 0, 0,
     341             : /*         0x001C, * FILE SEPARATOR */
     342             : /*         0x001D, * GROUP SEPARATOR */
     343             : /*         0x001E, * RECORD SEPARATOR */
     344             :     0, 0, 0, 0, 1, 1, 1, 0,
     345             :     0, 0, 0, 0, 0, 0, 0, 0,
     346             :     0, 0, 0, 0, 0, 0, 0, 0,
     347             :     0, 0, 0, 0, 0, 0, 0, 0,
     348             :     0, 0, 0, 0, 0, 0, 0, 0,
     349             : 
     350             :     0, 0, 0, 0, 0, 0, 0, 0,
     351             :     0, 0, 0, 0, 0, 0, 0, 0,
     352             :     0, 0, 0, 0, 0, 0, 0, 0,
     353             :     0, 0, 0, 0, 0, 0, 0, 0,
     354             :     0, 0, 0, 0, 0, 0, 0, 0,
     355             :     0, 0, 0, 0, 0, 0, 0, 0,
     356             :     0, 0, 0, 0, 0, 0, 0, 0,
     357             :     0, 0, 0, 0, 0, 0, 0, 0
     358             : };
     359             : 
     360             : static int convert_uc(PyObject *obj, void *addr);
     361             : 
     362             : struct encoding_map;
     363             : #include "clinic/unicodeobject.c.h"
     364             : 
     365             : _Py_error_handler  /* Map an error-handler name to its _Py_error_handler constant by exact strcmp() match. */
     366     1422150 : _Py_GetErrorHandler(const char *errors)
     367             : {
     368     1422150 :     if (errors == NULL || strcmp(errors, "strict") == 0) {  /* NULL is treated the same as "strict" */
     369       23405 :         return _Py_ERROR_STRICT;
     370             :     }
     371     1398750 :     if (strcmp(errors, "surrogateescape") == 0) {
     372     1389540 :         return _Py_ERROR_SURROGATEESCAPE;
     373             :     }
     374        9205 :     if (strcmp(errors, "replace") == 0) {
     375         865 :         return _Py_ERROR_REPLACE;
     376             :     }
     377        8340 :     if (strcmp(errors, "ignore") == 0) {
     378         613 :         return _Py_ERROR_IGNORE;
     379             :     }
     380        7727 :     if (strcmp(errors, "backslashreplace") == 0) {
     381        3883 :         return _Py_ERROR_BACKSLASHREPLACE;
     382             :     }
     383        3844 :     if (strcmp(errors, "surrogatepass") == 0) {
     384        3502 :         return _Py_ERROR_SURROGATEPASS;
     385             :     }
     386         342 :     if (strcmp(errors, "xmlcharrefreplace") == 0) {
     387         190 :         return _Py_ERROR_XMLCHARREFREPLACE;
     388             :     }
     389         152 :     return _Py_ERROR_OTHER;  /* name is none of the ones tested above */
     390             : }
     391             : 
     392             : 
     393             : static _Py_error_handler  /* Same name-to-constant mapping as _Py_GetErrorHandler(), for a wchar_t* name compared with wcscmp(). */
     394     1119110 : get_error_handler_wide(const wchar_t *errors)
     395             : {
     396     1119110 :     if (errors == NULL || wcscmp(errors, L"strict") == 0) {  /* NULL is treated the same as L"strict" */
     397           0 :         return _Py_ERROR_STRICT;
     398             :     }
     399     1119110 :     if (wcscmp(errors, L"surrogateescape") == 0) {
     400     1119110 :         return _Py_ERROR_SURROGATEESCAPE;
     401             :     }
     402           0 :     if (wcscmp(errors, L"replace") == 0) {
     403           0 :         return _Py_ERROR_REPLACE;
     404             :     }
     405           0 :     if (wcscmp(errors, L"ignore") == 0) {
     406           0 :         return _Py_ERROR_IGNORE;
     407             :     }
     408           0 :     if (wcscmp(errors, L"backslashreplace") == 0) {
     409           0 :         return _Py_ERROR_BACKSLASHREPLACE;
     410             :     }
     411           0 :     if (wcscmp(errors, L"surrogatepass") == 0) {
     412           0 :         return _Py_ERROR_SURROGATEPASS;
     413             :     }
     414           0 :     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
     415           0 :         return _Py_ERROR_XMLCHARREFREPLACE;
     416             :     }
     417           0 :     return _Py_ERROR_OTHER;  /* name is none of the ones tested above */
     418             : }
     419             : 
     420             : 
     421             : static inline int  /* Validate the `encoding` and `errors` names by looking them up. Returns 0 when both are accepted or the check is skipped, -1 when a lookup returns NULL. */
     422    13795400 : unicode_check_encoding_errors(const char *encoding, const char *errors)
     423             : {
     424    13795400 :     if (encoding == NULL && errors == NULL) {  /* nothing to validate */
     425       98604 :         return 0;
     426             :     }
     427             : 
     428    13696800 :     PyInterpreterState *interp = _PyInterpreterState_GET();
     429             : #ifndef Py_DEBUG
     430             :     /* In release mode, only check in development mode (-X dev) */
     431             :     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
     432             :         return 0;
     433             :     }
     434             : #else
     435             :     /* Always check in debug mode */
     436             : #endif
     437             : 
     438             :     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
     439             :        codec registry is ready: before _PyUnicode_InitEncodings() is called. */
     440    13696800 :     if (!interp->unicode.fs_codec.encoding) {
     441          10 :         return 0;
     442             :     }
     443             : 
     444             :     /* Disable checks during Python finalization. For example, it allows to
     445             :        call _PyObject_Dump() during finalization for debugging purpose. */
     446    13696800 :     if (interp->finalizing) {
     447           0 :         return 0;
     448             :     }
     449             : 
     450    13696800 :     if (encoding != NULL
     451             :         // Fast path for the most common built-in encodings. Even if the codec
     452             :         // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
     453             :         // create a temporary Unicode string (the key in the cache).
     454    13696800 :         && strcmp(encoding, "utf-8") != 0
     455     8630700 :         && strcmp(encoding, "utf8") != 0
     456     8572060 :         && strcmp(encoding, "ascii") != 0)
     457             :     {
     458     6464930 :         PyObject *handler = _PyCodec_Lookup(encoding);
     459     6464930 :         if (handler == NULL) {
     460          40 :             return -1;  /* NOTE(review): the failed lookup presumably leaves an exception set - confirm */
     461             :         }
     462     6464890 :         Py_DECREF(handler);  /* only the success of the lookup matters; the result itself is not used */
     463             :     }
     464             : 
     465    13696700 :     if (errors != NULL
     466             :         // Fast path for the most common built-in error handlers.
     467     5136510 :         && strcmp(errors, "strict") != 0
     468     4985210 :         && strcmp(errors, "ignore") != 0
     469     3934690 :         && strcmp(errors, "replace") != 0
     470     3907540 :         && strcmp(errors, "surrogateescape") != 0
     471     1466380 :         && strcmp(errors, "surrogatepass") != 0)
     472             :     {
     473       12558 :         PyObject *handler = PyCodec_LookupError(errors);
     474       12558 :         if (handler == NULL) {
     475          30 :             return -1;  /* NOTE(review): the failed lookup presumably leaves an exception set - confirm */
     476             :         }
     477       12528 :         Py_DECREF(handler);  /* only the success of the lookup matters; the result itself is not used */
     478             :     }
     479    13696700 :     return 0;
     480             : }
     481             : 
     482             : 
     483             : int  /* Self-check of a str object's internal state. Each failed CHECK() is reported through _PyObject_ASSERT_FAILED_MSG(); the function returns 1 after running the checks. A non-zero check_content additionally scans every character. */
     484   921539000 : _PyUnicode_CheckConsistency(PyObject *op, int check_content)
     485             : {
     486             : #define CHECK(expr) \
     487             :     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
     488             : 
     489   921539000 :     assert(op != NULL);
     490   921539000 :     CHECK(PyUnicode_Check(op));
     491             : 
     492   921539000 :     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
     493   921539000 :     int kind = ascii->state.kind;
     494             : 
     495   921539000 :     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {  /* compact ASCII: only the kind is checked */
     496   839935000 :         CHECK(kind == PyUnicode_1BYTE_KIND);
     497             :     }
     498             :     else {
     499    81603900 :         PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
     500             :         void *data;
     501             : 
     502    81603900 :         if (ascii->state.compact == 1) {
     503    81348800 :             data = compact + 1;  /* compact layout: data is taken to start right after the PyCompactUnicodeObject header */
     504    81348800 :             CHECK(kind == PyUnicode_1BYTE_KIND
     505             :                                  || kind == PyUnicode_2BYTE_KIND
     506             :                                  || kind == PyUnicode_4BYTE_KIND);
     507    81348800 :             CHECK(ascii->state.ascii == 0);
     508    81348800 :             CHECK(compact->utf8 != data);  /* the utf8 pointer must not alias the character data here */
     509             :         }
     510             :         else {
     511      255053 :             PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
     512             : 
     513      255053 :             data = unicode->data.any;  /* non-compact layout: data is reached through a pointer field */
     514      255053 :             CHECK(kind == PyUnicode_1BYTE_KIND
     515             :                      || kind == PyUnicode_2BYTE_KIND
     516             :                      || kind == PyUnicode_4BYTE_KIND);
     517      255053 :             CHECK(ascii->state.compact == 0);
     518      255053 :             CHECK(data != NULL);
     519      255053 :             if (ascii->state.ascii) {  /* ASCII flag set: utf8 must alias the data and report the same length */
     520      204520 :                 CHECK(compact->utf8 == data);
     521      204520 :                 CHECK(compact->utf8_length == ascii->length);
     522             :             }
     523             :             else {
     524       50533 :                 CHECK(compact->utf8 != data);
     525             :             }
     526             :         }
     527             : 
     528    81603900 :         if (compact->utf8 == NULL)  /* no UTF-8 buffer: its length field must be 0 */
     529    81380700 :             CHECK(compact->utf8_length == 0);
     530             :     }
     531             : 
     532             :     /* check that the best kind is used: O(n) operation */
     533   921539000 :     if (check_content) {
     534             :         Py_ssize_t i;
     535   169621000 :         Py_UCS4 maxchar = 0;
     536             :         const void *data;
     537             :         Py_UCS4 ch;
     538             : 
     539   169621000 :         data = PyUnicode_DATA(ascii);
     540  5332340000 :         for (i=0; i < ascii->length; i++)  /* find the largest code point actually stored */
     541             :         {
     542  5162720000 :             ch = PyUnicode_READ(kind, data, i);
     543  5162720000 :             if (ch > maxchar)
     544   469388000 :                 maxchar = ch;
     545             :         }
     546   169621000 :         if (kind == PyUnicode_1BYTE_KIND) {  /* maxchar must lie in the range that justifies the kind (and the ASCII flag) */
     547   137319000 :             if (ascii->state.ascii == 0) {
     548     1071400 :                 CHECK(maxchar >= 128);
     549     1071400 :                 CHECK(maxchar <= 255);
     550             :             }
     551             :             else
     552   136247000 :                 CHECK(maxchar < 128);
     553             :         }
     554    32302000 :         else if (kind == PyUnicode_2BYTE_KIND) {
     555     4841100 :             CHECK(maxchar >= 0x100);
     556     4841100 :             CHECK(maxchar <= 0xFFFF);
     557             :         }
     558             :         else {
     559    27460900 :             CHECK(maxchar >= 0x10000);
     560    27460900 :             CHECK(maxchar <= MAX_UNICODE);
     561             :         }
     562   169621000 :         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);  /* the element at index `length` must be 0 */
     563             :     }
     564   921539000 :     return 1;
     565             : 
     566             : #undef CHECK
     567             : }
     568             : 
     569             : static PyObject*
     570    13855900 : unicode_result(PyObject *unicode)
     571             : {
     572    13855900 :     assert(_PyUnicode_CHECK(unicode));
     573             : 
     574    13855900 :     Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
     575    13855900 :     if (length == 0) {
     576           2 :         PyObject *empty = unicode_get_empty();
     577           2 :         if (unicode != empty) {
     578           0 :             Py_DECREF(unicode);
     579           0 :             Py_INCREF(empty);
     580             :         }
     581           2 :         return empty;
     582             :     }
     583             : 
     584    13855900 :     if (length == 1) {
     585      946907 :         int kind = PyUnicode_KIND(unicode);
     586      946907 :         if (kind == PyUnicode_1BYTE_KIND) {
     587      248989 :             const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
     588      248989 :             Py_UCS1 ch = data[0];
     589      248989 :             PyObject *latin1_char = LATIN1(ch);
     590      248989 :             if (unicode != latin1_char) {
     591      244309 :                 Py_INCREF(latin1_char);
     592      244309 :                 Py_DECREF(unicode);
     593             :             }
     594      248989 :             return latin1_char;
     595             :         }
     596             :     }
     597             : 
     598    13606900 :     assert(_PyUnicode_CheckConsistency(unicode, 1));
     599    13606900 :     return unicode;
     600             : }
     601             : 
     602             : static PyObject*
     603    19182000 : unicode_result_unchanged(PyObject *unicode)
     604             : {
     605    19182000 :     if (PyUnicode_CheckExact(unicode)) {
     606    19120400 :         Py_INCREF(unicode);
     607    19120400 :         return unicode;
     608             :     }
     609             :     else
     610             :         /* Subtype -- return genuine unicode string with the same value. */
     611       61564 :         return _PyUnicode_Copy(unicode);
     612             : }
     613             : 
     614             : /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
     615             :    ASCII, Latin1, UTF-8, etc. */
     616             : static char*
     617        8392 : backslashreplace(_PyBytesWriter *writer, char *str,
     618             :                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
     619             : {
     620             :     Py_ssize_t size, i;
     621             :     Py_UCS4 ch;
     622             :     int kind;
     623             :     const void *data;
     624             : 
     625        8392 :     kind = PyUnicode_KIND(unicode);
     626        8392 :     data = PyUnicode_DATA(unicode);
     627             : 
     628        8392 :     size = 0;
     629             :     /* determine replacement size */
     630       32649 :     for (i = collstart; i < collend; ++i) {
     631             :         Py_ssize_t incr;
     632             : 
     633       24257 :         ch = PyUnicode_READ(kind, data, i);
     634       24257 :         if (ch < 0x100)
     635        4784 :             incr = 2+2;
     636       19473 :         else if (ch < 0x10000)
     637       19178 :             incr = 2+4;
     638             :         else {
     639         295 :             assert(ch <= MAX_UNICODE);
     640         295 :             incr = 2+8;
     641             :         }
     642       24257 :         if (size > PY_SSIZE_T_MAX - incr) {
     643           0 :             PyErr_SetString(PyExc_OverflowError,
     644             :                             "encoded result is too long for a Python string");
     645           0 :             return NULL;
     646             :         }
     647       24257 :         size += incr;
     648             :     }
     649             : 
     650        8392 :     str = _PyBytesWriter_Prepare(writer, str, size);
     651        8392 :     if (str == NULL)
     652           0 :         return NULL;
     653             : 
     654             :     /* generate replacement */
     655       32649 :     for (i = collstart; i < collend; ++i) {
     656       24257 :         ch = PyUnicode_READ(kind, data, i);
     657       24257 :         *str++ = '\\';
     658       24257 :         if (ch >= 0x00010000) {
     659         295 :             *str++ = 'U';
     660         295 :             *str++ = Py_hexdigits[(ch>>28)&0xf];
     661         295 :             *str++ = Py_hexdigits[(ch>>24)&0xf];
     662         295 :             *str++ = Py_hexdigits[(ch>>20)&0xf];
     663         295 :             *str++ = Py_hexdigits[(ch>>16)&0xf];
     664         295 :             *str++ = Py_hexdigits[(ch>>12)&0xf];
     665         295 :             *str++ = Py_hexdigits[(ch>>8)&0xf];
     666             :         }
     667       23962 :         else if (ch >= 0x100) {
     668       19178 :             *str++ = 'u';
     669       19178 :             *str++ = Py_hexdigits[(ch>>12)&0xf];
     670       19178 :             *str++ = Py_hexdigits[(ch>>8)&0xf];
     671             :         }
     672             :         else
     673        4784 :             *str++ = 'x';
     674       24257 :         *str++ = Py_hexdigits[(ch>>4)&0xf];
     675       24257 :         *str++ = Py_hexdigits[ch&0xf];
     676             :     }
     677        8392 :     return str;
     678             : }
     679             : 
     680             : /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
     681             :    ASCII, Latin1, UTF-8, etc. */
     682             : static char*
     683        1242 : xmlcharrefreplace(_PyBytesWriter *writer, char *str,
     684             :                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
     685             : {
     686             :     Py_ssize_t size, i;
     687             :     Py_UCS4 ch;
     688             :     int kind;
     689             :     const void *data;
     690             : 
     691        1242 :     kind = PyUnicode_KIND(unicode);
     692        1242 :     data = PyUnicode_DATA(unicode);
     693             : 
     694        1242 :     size = 0;
     695             :     /* determine replacement size */
     696        5557 :     for (i = collstart; i < collend; ++i) {
     697             :         Py_ssize_t incr;
     698             : 
     699        4315 :         ch = PyUnicode_READ(kind, data, i);
     700        4315 :         if (ch < 10)
     701           0 :             incr = 2+1+1;
     702        4315 :         else if (ch < 100)
     703           0 :             incr = 2+2+1;
     704        4315 :         else if (ch < 1000)
     705        1066 :             incr = 2+3+1;
     706        3249 :         else if (ch < 10000)
     707         193 :             incr = 2+4+1;
     708        3056 :         else if (ch < 100000)
     709        3053 :             incr = 2+5+1;
     710           3 :         else if (ch < 1000000)
     711           2 :             incr = 2+6+1;
     712             :         else {
     713           1 :             assert(ch <= MAX_UNICODE);
     714           1 :             incr = 2+7+1;
     715             :         }
     716        4315 :         if (size > PY_SSIZE_T_MAX - incr) {
     717           0 :             PyErr_SetString(PyExc_OverflowError,
     718             :                             "encoded result is too long for a Python string");
     719           0 :             return NULL;
     720             :         }
     721        4315 :         size += incr;
     722             :     }
     723             : 
     724        1242 :     str = _PyBytesWriter_Prepare(writer, str, size);
     725        1242 :     if (str == NULL)
     726           0 :         return NULL;
     727             : 
     728             :     /* generate replacement */
     729        5557 :     for (i = collstart; i < collend; ++i) {
     730        4315 :         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
     731        4315 :         if (size < 0) {
     732           0 :             return NULL;
     733             :         }
     734        4315 :         str += size;
     735             :     }
     736        1242 :     return str;
     737             : }
     738             : 
     739             : /* --- Bloom Filters ----------------------------------------------------- */
     740             : 
     741             : /* stuff to implement simple "bloom filters" for Unicode characters.
     742             :    to keep things simple, we use a single bitmask, using the low
     743             :    log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
     744             : 
     745             : /* the linebreak mask is set up by _PyUnicode_Init() below */
     746             : 
     747             : #if LONG_BIT >= 128
     748             : #define BLOOM_WIDTH 128
     749             : #elif LONG_BIT >= 64
     750             : #define BLOOM_WIDTH 64
     751             : #elif LONG_BIT >= 32
     752             : #define BLOOM_WIDTH 32
     753             : #else
     754             : #error "LONG_BIT is smaller than 32"
     755             : #endif
     756             : 
     757             : #define BLOOM_MASK unsigned long
     758             : 
     759             : static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
     760             : 
     761             : #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
     762             : 
     763             : #define BLOOM_LINEBREAK(ch)                                             \
     764             :     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     765             :      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
     766             : 
     767             : static inline BLOOM_MASK
     768     6982990 : make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
     769             : {
     770             : #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
     771             :     do {                                               \
     772             :         TYPE *data = (TYPE *)PTR;                      \
     773             :         TYPE *end = data + LEN;                        \
     774             :         Py_UCS4 ch;                                    \
     775             :         for (; data != end; data++) {                  \
     776             :             ch = *data;                                \
     777             :             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
     778             :         }                                              \
     779             :         break;                                         \
     780             :     } while (0)
     781             : 
     782             :     /* calculate simple bloom-style bitmask for a given unicode string */
     783             : 
     784             :     BLOOM_MASK mask;
     785             : 
     786     6982990 :     mask = 0;
     787     6982990 :     switch (kind) {
     788     6980030 :     case PyUnicode_1BYTE_KIND:
     789    14356700 :         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
     790     6980030 :         break;
     791        2963 :     case PyUnicode_2BYTE_KIND:
     792       26667 :         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
     793        2963 :         break;
     794           0 :     case PyUnicode_4BYTE_KIND:
     795           0 :         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
     796           0 :         break;
     797           0 :     default:
     798           0 :         Py_UNREACHABLE();
     799             :     }
     800     6982990 :     return mask;
     801             : 
     802             : #undef BLOOM_UPDATE
     803             : }
     804             : 
     805             : static int
     806    67976500 : ensure_unicode(PyObject *obj)
     807             : {
     808    67976500 :     if (!PyUnicode_Check(obj)) {
     809          15 :         PyErr_Format(PyExc_TypeError,
     810             :                      "must be str, not %.100s",
     811          15 :                      Py_TYPE(obj)->tp_name);
     812          15 :         return -1;
     813             :     }
     814    67976400 :     return 0;
     815             : }
     816             : 
     817             : /* Compilation of templated routines */
     818             : 
     819             : #define STRINGLIB_GET_EMPTY() unicode_get_empty()
     820             : 
     821             : #include "stringlib/asciilib.h"
     822             : #include "stringlib/fastsearch.h"
     823             : #include "stringlib/partition.h"
     824             : #include "stringlib/split.h"
     825             : #include "stringlib/count.h"
     826             : #include "stringlib/find.h"
     827             : #include "stringlib/find_max_char.h"
     828             : #include "stringlib/undef.h"
     829             : 
     830             : #include "stringlib/ucs1lib.h"
     831             : #include "stringlib/fastsearch.h"
     832             : #include "stringlib/partition.h"
     833             : #include "stringlib/split.h"
     834             : #include "stringlib/count.h"
     835             : #include "stringlib/find.h"
     836             : #include "stringlib/replace.h"
     837             : #include "stringlib/find_max_char.h"
     838             : #include "stringlib/undef.h"
     839             : 
     840             : #include "stringlib/ucs2lib.h"
     841             : #include "stringlib/fastsearch.h"
     842             : #include "stringlib/partition.h"
     843             : #include "stringlib/split.h"
     844             : #include "stringlib/count.h"
     845             : #include "stringlib/find.h"
     846             : #include "stringlib/replace.h"
     847             : #include "stringlib/find_max_char.h"
     848             : #include "stringlib/undef.h"
     849             : 
     850             : #include "stringlib/ucs4lib.h"
     851             : #include "stringlib/fastsearch.h"
     852             : #include "stringlib/partition.h"
     853             : #include "stringlib/split.h"
     854             : #include "stringlib/count.h"
     855             : #include "stringlib/find.h"
     856             : #include "stringlib/replace.h"
     857             : #include "stringlib/find_max_char.h"
     858             : #include "stringlib/undef.h"
     859             : 
     860             : #undef STRINGLIB_GET_EMPTY
     861             : 
     862             : /* --- Unicode Object ----------------------------------------------------- */
     863             : 
     864             : static inline Py_ssize_t
     865    35364400 : findchar(const void *s, int kind,
     866             :          Py_ssize_t size, Py_UCS4 ch,
     867             :          int direction)
     868             : {
     869    35364400 :     switch (kind) {
     870    34223200 :     case PyUnicode_1BYTE_KIND:
     871    34223200 :         if ((Py_UCS1) ch != ch)
     872           2 :             return -1;
     873    34223200 :         if (direction > 0)
     874    32980300 :             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
     875             :         else
     876     1242840 :             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
     877       90064 :     case PyUnicode_2BYTE_KIND:
     878       90064 :         if ((Py_UCS2) ch != ch)
     879           0 :             return -1;
     880       90064 :         if (direction > 0)
     881       89729 :             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
     882             :         else
     883         335 :             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
     884     1051150 :     case PyUnicode_4BYTE_KIND:
     885     1051150 :         if (direction > 0)
     886     1051140 :             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
     887             :         else
     888           9 :             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
     889           0 :     default:
     890           0 :         Py_UNREACHABLE();
     891             :     }
     892             : }
     893             : 
     894             : #ifdef Py_DEBUG
     895             : /* Fill the data of a Unicode string with invalid characters to detect bugs
     896             :    earlier.
     897             : 
     898             :    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
     899             :    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
     900             :    invalid character in Unicode 6.0. */
     901             : static void
     902   261413000 : unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
     903             : {
     904   261413000 :     int kind = PyUnicode_KIND(unicode);
     905   261413000 :     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
     906   261413000 :     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
     907   261413000 :     if (length <= old_length)
     908     9986210 :         return;
     909   251427000 :     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
     910             : }
     911             : #endif
     912             : 
     913             : static PyObject*
     914    11785200 : resize_compact(PyObject *unicode, Py_ssize_t length)
     915             : {
     916             :     Py_ssize_t char_size;
     917             :     Py_ssize_t struct_size;
     918             :     Py_ssize_t new_size;
     919             :     PyObject *new_unicode;
     920             : #ifdef Py_DEBUG
     921    11785200 :     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
     922             : #endif
     923             : 
     924    11785200 :     assert(unicode_modifiable(unicode));
     925    11785200 :     assert(PyUnicode_IS_COMPACT(unicode));
     926             : 
     927    11785200 :     char_size = PyUnicode_KIND(unicode);
     928    11785200 :     if (PyUnicode_IS_ASCII(unicode))
     929    10869400 :         struct_size = sizeof(PyASCIIObject);
     930             :     else
     931      915825 :         struct_size = sizeof(PyCompactUnicodeObject);
     932             : 
     933    11785200 :     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
     934           0 :         PyErr_NoMemory();
     935           0 :         return NULL;
     936             :     }
     937    11785200 :     new_size = (struct_size + (length + 1) * char_size);
     938             : 
     939    11785200 :     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
     940          11 :         PyObject_Free(_PyUnicode_UTF8(unicode));
     941          11 :         _PyUnicode_UTF8(unicode) = NULL;
     942          11 :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
     943             :     }
     944             : #ifdef Py_REF_DEBUG
     945    11785200 :     _Py_RefTotal--;
     946             : #endif
     947             : #ifdef Py_TRACE_REFS
     948             :     _Py_ForgetReference(unicode);
     949             : #endif
     950             : 
     951    11785200 :     new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
     952    11785200 :     if (new_unicode == NULL) {
     953           0 :         _Py_NewReference(unicode);
     954           0 :         PyErr_NoMemory();
     955           0 :         return NULL;
     956             :     }
     957    11785200 :     unicode = new_unicode;
     958    11785200 :     _Py_NewReference(unicode);
     959             : 
     960    11785200 :     _PyUnicode_LENGTH(unicode) = length;
     961             : #ifdef Py_DEBUG
     962    11785200 :     unicode_fill_invalid(unicode, old_length);
     963             : #endif
     964    11785200 :     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
     965             :                     length, 0);
     966    11785200 :     assert(_PyUnicode_CheckConsistency(unicode, 0));
     967    11785200 :     return unicode;
     968             : }
     969             : 
     970             : static int
     971           0 : resize_inplace(PyObject *unicode, Py_ssize_t length)
     972             : {
     973           0 :     assert(!PyUnicode_IS_COMPACT(unicode));
     974           0 :     assert(Py_REFCNT(unicode) == 1);
     975             : 
     976             :     Py_ssize_t new_size;
     977             :     Py_ssize_t char_size;
     978             :     int share_utf8;
     979             :     void *data;
     980             : #ifdef Py_DEBUG
     981           0 :     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
     982             : #endif
     983             : 
     984           0 :     data = _PyUnicode_DATA_ANY(unicode);
     985           0 :     char_size = PyUnicode_KIND(unicode);
     986           0 :     share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
     987             : 
     988           0 :     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
     989           0 :         PyErr_NoMemory();
     990           0 :         return -1;
     991             :     }
     992           0 :     new_size = (length + 1) * char_size;
     993             : 
     994           0 :     if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
     995             :     {
     996           0 :         PyObject_Free(_PyUnicode_UTF8(unicode));
     997           0 :         _PyUnicode_UTF8(unicode) = NULL;
     998           0 :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
     999             :     }
    1000             : 
    1001           0 :     data = (PyObject *)PyObject_Realloc(data, new_size);
    1002           0 :     if (data == NULL) {
    1003           0 :         PyErr_NoMemory();
    1004           0 :         return -1;
    1005             :     }
    1006           0 :     _PyUnicode_DATA_ANY(unicode) = data;
    1007           0 :     if (share_utf8) {
    1008           0 :         _PyUnicode_UTF8(unicode) = data;
    1009           0 :         _PyUnicode_UTF8_LENGTH(unicode) = length;
    1010             :     }
    1011           0 :     _PyUnicode_LENGTH(unicode) = length;
    1012           0 :     PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
    1013             : #ifdef Py_DEBUG
    1014           0 :     unicode_fill_invalid(unicode, old_length);
    1015             : #endif
    1016             : 
    1017             :     /* check for integer overflow */
    1018           0 :     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
    1019           0 :         PyErr_NoMemory();
    1020           0 :         return -1;
    1021             :     }
    1022           0 :     assert(_PyUnicode_CheckConsistency(unicode, 0));
    1023           0 :     return 0;
    1024             : }
    1025             : 
    1026             : static PyObject*
    1027           0 : resize_copy(PyObject *unicode, Py_ssize_t length)
    1028             : {
    1029             :     Py_ssize_t copy_length;
    1030             :     PyObject *copy;
    1031             : 
    1032           0 :     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    1033           0 :     if (copy == NULL)
    1034           0 :         return NULL;
    1035             : 
    1036           0 :     copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
    1037           0 :     _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
    1038           0 :     return copy;
    1039             : }
    1040             : 
    1041             : static const char*
    1042          12 : unicode_kind_name(PyObject *unicode)
    1043             : {
    1044             :     /* don't check consistency: unicode_kind_name() is called from
    1045             :        _PyUnicode_Dump() */
    1046          12 :     if (!PyUnicode_IS_COMPACT(unicode))
    1047             :     {
    1048           0 :         switch (PyUnicode_KIND(unicode))
    1049             :         {
    1050           0 :         case PyUnicode_1BYTE_KIND:
    1051           0 :             if (PyUnicode_IS_ASCII(unicode))
    1052           0 :                 return "legacy ascii";
    1053             :             else
    1054           0 :                 return "legacy latin1";
    1055           0 :         case PyUnicode_2BYTE_KIND:
    1056           0 :             return "legacy UCS2";
    1057           0 :         case PyUnicode_4BYTE_KIND:
    1058           0 :             return "legacy UCS4";
    1059           0 :         default:
    1060           0 :             return "<legacy invalid kind>";
    1061             :         }
    1062             :     }
    1063          12 :     switch (PyUnicode_KIND(unicode)) {
    1064           6 :     case PyUnicode_1BYTE_KIND:
    1065           6 :         if (PyUnicode_IS_ASCII(unicode))
    1066           3 :             return "ascii";
    1067             :         else
    1068           3 :             return "latin1";
    1069           3 :     case PyUnicode_2BYTE_KIND:
    1070           3 :         return "UCS2";
    1071           3 :     case PyUnicode_4BYTE_KIND:
    1072           3 :         return "UCS4";
    1073           0 :     default:
    1074           0 :         return "<invalid compact kind>";
    1075             :     }
    1076             : }
    1077             : 
    1078             : #ifdef Py_DEBUG
    1079             : /* Functions wrapping macros for use in debugger */
    1080           0 : const char *_PyUnicode_utf8(void *unicode_raw){
    1081           0 :     PyObject *unicode = _PyObject_CAST(unicode_raw);
    1082           0 :     return PyUnicode_UTF8(unicode);
    1083             : }
    1084             : 
    1085           0 : const void *_PyUnicode_compact_data(void *unicode_raw) {
    1086           0 :     PyObject *unicode = _PyObject_CAST(unicode_raw);
    1087           0 :     return _PyUnicode_COMPACT_DATA(unicode);
    1088             : }
    1089           0 : const void *_PyUnicode_data(void *unicode_raw) {
    1090           0 :     PyObject *unicode = _PyObject_CAST(unicode_raw);
    1091           0 :     printf("obj %p\n", (void*)unicode);
    1092           0 :     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    1093           0 :     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    1094           0 :     printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
    1095           0 :     printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
    1096           0 :     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    1097           0 :     return PyUnicode_DATA(unicode);
    1098             : }
    1099             : 
    1100             : void
    1101           0 : _PyUnicode_Dump(PyObject *op)
    1102             : {
    1103           0 :     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    1104           0 :     PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    1105           0 :     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
    1106             :     const void *data;
    1107             : 
    1108           0 :     if (ascii->state.compact)
    1109             :     {
    1110           0 :         if (ascii->state.ascii)
    1111           0 :             data = (ascii + 1);
    1112             :         else
    1113           0 :             data = (compact + 1);
    1114             :     }
    1115             :     else
    1116           0 :         data = unicode->data.any;
    1117           0 :     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
    1118             : 
    1119           0 :     if (!ascii->state.ascii) {
    1120           0 :         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
    1121             :     }
    1122           0 :     printf(", data=%p\n", data);
    1123           0 : }
    1124             : #endif
    1125             : 
    1126             : 
    1127             : PyObject *
    1128   250643000 : PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    1129             : {
    1130             :     /* Optimization for empty strings */
    1131   250643000 :     if (size == 0) {
    1132     1014520 :         return unicode_new_empty();
    1133             :     }
    1134             : 
    1135             :     PyObject *obj;
    1136             :     PyCompactUnicodeObject *unicode;
    1137             :     void *data;
    1138             :     int kind;
    1139             :     int is_ascii;
    1140             :     Py_ssize_t char_size;
    1141             :     Py_ssize_t struct_size;
    1142             : 
    1143   249628000 :     is_ascii = 0;
    1144   249628000 :     struct_size = sizeof(PyCompactUnicodeObject);
    1145   249628000 :     if (maxchar < 128) {
    1146   206650000 :         kind = PyUnicode_1BYTE_KIND;
    1147   206650000 :         char_size = 1;
    1148   206650000 :         is_ascii = 1;
    1149   206650000 :         struct_size = sizeof(PyASCIIObject);
    1150             :     }
    1151    42978600 :     else if (maxchar < 256) {
    1152      551499 :         kind = PyUnicode_1BYTE_KIND;
    1153      551499 :         char_size = 1;
    1154             :     }
    1155    42427100 :     else if (maxchar < 65536) {
    1156     4563050 :         kind = PyUnicode_2BYTE_KIND;
    1157     4563050 :         char_size = 2;
    1158             :     }
    1159             :     else {
    1160    37864100 :         if (maxchar > MAX_UNICODE) {
    1161           0 :             PyErr_SetString(PyExc_SystemError,
    1162             :                             "invalid maximum character passed to PyUnicode_New");
    1163           0 :             return NULL;
    1164             :         }
    1165    37864100 :         kind = PyUnicode_4BYTE_KIND;
    1166    37864100 :         char_size = 4;
    1167             :     }
    1168             : 
    1169             :     /* Ensure we won't overflow the size. */
    1170   249628000 :     if (size < 0) {
    1171           0 :         PyErr_SetString(PyExc_SystemError,
    1172             :                         "Negative size passed to PyUnicode_New");
    1173           0 :         return NULL;
    1174             :     }
    1175   249628000 :     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
    1176           8 :         return PyErr_NoMemory();
    1177             : 
    1178             :     /* Duplicated allocation code from _PyObject_New() instead of a call to
    1179             :      * PyObject_New() so we are able to allocate space for the object and
    1180             :      * it's data buffer.
    1181             :      */
    1182   249628000 :     obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
    1183   249628000 :     if (obj == NULL) {
    1184         281 :         return PyErr_NoMemory();
    1185             :     }
    1186   249628000 :     _PyObject_Init(obj, &PyUnicode_Type);
    1187             : 
    1188   249628000 :     unicode = (PyCompactUnicodeObject *)obj;
    1189   249628000 :     if (is_ascii)
    1190   206649000 :         data = ((PyASCIIObject*)obj) + 1;
    1191             :     else
    1192    42978600 :         data = unicode + 1;
    1193   249628000 :     _PyUnicode_LENGTH(unicode) = size;
    1194   249628000 :     _PyUnicode_HASH(unicode) = -1;
    1195   249628000 :     _PyUnicode_STATE(unicode).interned = 0;
    1196   249628000 :     _PyUnicode_STATE(unicode).kind = kind;
    1197   249628000 :     _PyUnicode_STATE(unicode).compact = 1;
    1198   249628000 :     _PyUnicode_STATE(unicode).ascii = is_ascii;
    1199   249628000 :     if (is_ascii) {
    1200   206649000 :         ((char*)data)[size] = 0;
    1201             :     }
    1202    42978600 :     else if (kind == PyUnicode_1BYTE_KIND) {
    1203      551497 :         ((char*)data)[size] = 0;
    1204      551497 :         unicode->utf8 = NULL;
    1205      551497 :         unicode->utf8_length = 0;
    1206             :     }
    1207             :     else {
    1208    42427100 :         unicode->utf8 = NULL;
    1209    42427100 :         unicode->utf8_length = 0;
    1210    42427100 :         if (kind == PyUnicode_2BYTE_KIND)
    1211     4563050 :             ((Py_UCS2*)data)[size] = 0;
    1212             :         else /* kind == PyUnicode_4BYTE_KIND */
    1213    37864100 :             ((Py_UCS4*)data)[size] = 0;
    1214             :     }
    1215             : #ifdef Py_DEBUG
    1216   249628000 :     unicode_fill_invalid((PyObject*)unicode, 0);
    1217             : #endif
    1218   249628000 :     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    1219   249628000 :     return obj;
    1220             : }
    1221             : 
    1222             : #if SIZEOF_WCHAR_T == 2
    1223             : /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
    1224             :    will decode surrogate pairs, the other conversions are implemented as macros
    1225             :    for efficiency.
    1226             : 
    1227             :    This function assumes that unicode can hold one more code point than wstr
    1228             :    characters for a terminating null character. */
    1229             : static void
    1230             : unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
    1231             :                               PyObject *unicode)
    1232             : {
    1233             :     const wchar_t *iter;
    1234             :     Py_UCS4 *ucs4_out;
    1235             : 
    1236             :     assert(unicode != NULL);
    1237             :     assert(_PyUnicode_CHECK(unicode));
    1238             :     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    1239             :     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
    1240             : 
    1241             :     for (iter = begin; iter < end; ) {
    1242             :         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
    1243             :                            _PyUnicode_GET_LENGTH(unicode)));
    1244             :         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
    1245             :             && (iter+1) < end
    1246             :             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
    1247             :         {
    1248             :             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
    1249             :             iter += 2;
    1250             :         }
    1251             :         else {
    1252             :             *ucs4_out++ = *iter;
    1253             :             iter++;
    1254             :         }
    1255             :     }
    1256             :     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
    1257             :                         _PyUnicode_GET_LENGTH(unicode)));
    1258             : 
    1259             : }
    1260             : #endif
    1261             : 
    1262             : static int
    1263       29335 : unicode_check_modifiable(PyObject *unicode)
    1264             : {
    1265       29335 :     if (!unicode_modifiable(unicode)) {
    1266           0 :         PyErr_SetString(PyExc_SystemError,
    1267             :                         "Cannot modify a string currently used");
    1268           0 :         return -1;
    1269             :     }
    1270       29335 :     return 0;
    1271             : }
    1272             : 
    1273             : static int
    1274   170726000 : _copy_characters(PyObject *to, Py_ssize_t to_start,
    1275             :                  PyObject *from, Py_ssize_t from_start,
    1276             :                  Py_ssize_t how_many, int check_maxchar)
    1277             : {
    1278             :     int from_kind, to_kind;
    1279             :     const void *from_data;
    1280             :     void *to_data;
    1281             : 
    1282   170726000 :     assert(0 <= how_many);
    1283   170726000 :     assert(0 <= from_start);
    1284   170726000 :     assert(0 <= to_start);
    1285   170726000 :     assert(PyUnicode_Check(from));
    1286   170726000 :     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
    1287             : 
    1288   170726000 :     assert(PyUnicode_Check(to));
    1289   170726000 :     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
    1290             : 
    1291   170726000 :     if (how_many == 0)
    1292      425657 :         return 0;
    1293             : 
    1294   170300000 :     from_kind = PyUnicode_KIND(from);
    1295   170300000 :     from_data = PyUnicode_DATA(from);
    1296   170300000 :     to_kind = PyUnicode_KIND(to);
    1297   170300000 :     to_data = PyUnicode_DATA(to);
    1298             : 
    1299             : #ifdef Py_DEBUG
    1300   170300000 :     if (!check_maxchar
    1301   170278000 :         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
    1302             :     {
    1303         155 :         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
    1304             :         Py_UCS4 ch;
    1305             :         Py_ssize_t i;
    1306        1000 :         for (i=0; i < how_many; i++) {
    1307         845 :             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
    1308         845 :             assert(ch <= to_maxchar);
    1309             :         }
    1310             :     }
    1311             : #endif
    1312             : 
    1313   170300000 :     if (from_kind == to_kind) {
    1314   132903000 :         if (check_maxchar
    1315       21937 :             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
    1316             :         {
    1317             :             /* Writing Latin-1 characters into an ASCII string requires to
    1318             :                check that all written characters are pure ASCII */
    1319             :             Py_UCS4 max_char;
    1320           1 :             max_char = ucs1lib_find_max_char(from_data,
    1321             :                                              (const Py_UCS1*)from_data + how_many);
    1322           1 :             if (max_char >= 128)
    1323           1 :                 return -1;
    1324             :         }
    1325   132903000 :         memcpy((char*)to_data + to_kind * to_start,
    1326   132903000 :                   (const char*)from_data + from_kind * from_start,
    1327   132903000 :                   to_kind * how_many);
    1328             :     }
    1329    37397300 :     else if (from_kind == PyUnicode_1BYTE_KIND
    1330    37323100 :              && to_kind == PyUnicode_2BYTE_KIND)
    1331             :     {
    1332    13734500 :         _PyUnicode_CONVERT_BYTES(
    1333             :             Py_UCS1, Py_UCS2,
    1334             :             PyUnicode_1BYTE_DATA(from) + from_start,
    1335             :             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
    1336             :             PyUnicode_2BYTE_DATA(to) + to_start
    1337             :             );
    1338             :     }
    1339    33374300 :     else if (from_kind == PyUnicode_1BYTE_KIND
    1340    33300100 :              && to_kind == PyUnicode_4BYTE_KIND)
    1341             :     {
    1342    93697300 :         _PyUnicode_CONVERT_BYTES(
    1343             :             Py_UCS1, Py_UCS4,
    1344             :             PyUnicode_1BYTE_DATA(from) + from_start,
    1345             :             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
    1346             :             PyUnicode_4BYTE_DATA(to) + to_start
    1347             :             );
    1348             :     }
    1349       74200 :     else if (from_kind == PyUnicode_2BYTE_KIND
    1350       74189 :              && to_kind == PyUnicode_4BYTE_KIND)
    1351             :     {
    1352      307281 :         _PyUnicode_CONVERT_BYTES(
    1353             :             Py_UCS2, Py_UCS4,
    1354             :             PyUnicode_2BYTE_DATA(from) + from_start,
    1355             :             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
    1356             :             PyUnicode_4BYTE_DATA(to) + to_start
    1357             :             );
    1358             :     }
    1359             :     else {
    1360         141 :         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
    1361             : 
    1362         141 :         if (!check_maxchar) {
    1363         136 :             if (from_kind == PyUnicode_2BYTE_KIND
    1364         128 :                 && to_kind == PyUnicode_1BYTE_KIND)
    1365             :             {
    1366         280 :                 _PyUnicode_CONVERT_BYTES(
    1367             :                     Py_UCS2, Py_UCS1,
    1368             :                     PyUnicode_2BYTE_DATA(from) + from_start,
    1369             :                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
    1370             :                     PyUnicode_1BYTE_DATA(to) + to_start
    1371             :                     );
    1372             :             }
    1373           8 :             else if (from_kind == PyUnicode_4BYTE_KIND
    1374           8 :                      && to_kind == PyUnicode_1BYTE_KIND)
    1375             :             {
    1376          16 :                 _PyUnicode_CONVERT_BYTES(
    1377             :                     Py_UCS4, Py_UCS1,
    1378             :                     PyUnicode_4BYTE_DATA(from) + from_start,
    1379             :                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
    1380             :                     PyUnicode_1BYTE_DATA(to) + to_start
    1381             :                     );
    1382             :             }
    1383           6 :             else if (from_kind == PyUnicode_4BYTE_KIND
    1384           6 :                      && to_kind == PyUnicode_2BYTE_KIND)
    1385             :             {
    1386          48 :                 _PyUnicode_CONVERT_BYTES(
    1387             :                     Py_UCS4, Py_UCS2,
    1388             :                     PyUnicode_4BYTE_DATA(from) + from_start,
    1389             :                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
    1390             :                     PyUnicode_2BYTE_DATA(to) + to_start
    1391             :                     );
    1392             :             }
    1393             :             else {
    1394           0 :                 Py_UNREACHABLE();
    1395             :             }
    1396             :         }
    1397             :         else {
    1398           5 :             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
    1399             :             Py_UCS4 ch;
    1400             :             Py_ssize_t i;
    1401             : 
    1402           5 :             for (i=0; i < how_many; i++) {
    1403           5 :                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
    1404           5 :                 if (ch > to_maxchar)
    1405           5 :                     return -1;
    1406           0 :                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
    1407             :             }
    1408             :         }
    1409             :     }
    1410   170300000 :     return 0;
    1411             : }
    1412             : 
    1413             : void
    1414   170704000 : _PyUnicode_FastCopyCharacters(
    1415             :     PyObject *to, Py_ssize_t to_start,
    1416             :     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
    1417             : {
    1418   170704000 :     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
    1419   170704000 : }
    1420             : 
    1421             : Py_ssize_t
    1422       21949 : PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
    1423             :                          PyObject *from, Py_ssize_t from_start,
    1424             :                          Py_ssize_t how_many)
    1425             : {
    1426             :     int err;
    1427             : 
    1428       21949 :     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
    1429           1 :         PyErr_BadInternalCall();
    1430           1 :         return -1;
    1431             :     }
    1432             : 
    1433       21948 :     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
    1434           2 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    1435           2 :         return -1;
    1436             :     }
    1437       21946 :     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
    1438           2 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    1439           2 :         return -1;
    1440             :     }
    1441       21944 :     if (how_many < 0) {
    1442           1 :         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
    1443           1 :         return -1;
    1444             :     }
    1445       21943 :     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
    1446       21943 :     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
    1447           1 :         PyErr_Format(PyExc_SystemError,
    1448             :                      "Cannot write %zi characters at %zi "
    1449             :                      "in a string of %zi characters",
    1450             :                      how_many, to_start, PyUnicode_GET_LENGTH(to));
    1451           1 :         return -1;
    1452             :     }
    1453             : 
    1454       21942 :     if (how_many == 0)
    1455           0 :         return 0;
    1456             : 
    1457       21942 :     if (unicode_check_modifiable(to))
    1458           0 :         return -1;
    1459             : 
    1460       21942 :     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
    1461       21942 :     if (err) {
    1462           6 :         PyErr_Format(PyExc_SystemError,
    1463             :                      "Cannot copy %s characters "
    1464             :                      "into a string of %s characters",
    1465             :                      unicode_kind_name(from),
    1466             :                      unicode_kind_name(to));
    1467           6 :         return -1;
    1468             :     }
    1469       21936 :     return how_many;
    1470             : }
    1471             : 
    1472             : /* Find the maximum code point and count the number of surrogate pairs so a
    1473             :    correct string length can be computed before converting a string to UCS4.
    1474             :    This function counts single surrogates as a character and not as a pair.
    1475             : 
    1476             :    Return 0 on success, or -1 on error. */
    1477             : static int
    1478     2787570 : find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
    1479             :                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
    1480             : {
    1481             :     const wchar_t *iter;
    1482             :     Py_UCS4 ch;
    1483             : 
    1484     2787570 :     assert(num_surrogates != NULL && maxchar != NULL);
    1485     2787570 :     *num_surrogates = 0;
    1486     2787570 :     *maxchar = 0;
    1487             : 
    1488    59051800 :     for (iter = begin; iter < end; ) {
    1489             : #if SIZEOF_WCHAR_T == 2
    1490             :         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
    1491             :             && (iter+1) < end
    1492             :             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
    1493             :         {
    1494             :             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
    1495             :             ++(*num_surrogates);
    1496             :             iter += 2;
    1497             :         }
    1498             :         else
    1499             : #endif
    1500             :         {
    1501    56264200 :             ch = *iter;
    1502    56264200 :             iter++;
    1503             :         }
    1504    56264200 :         if (ch > *maxchar) {
    1505    14837600 :             *maxchar = ch;
    1506    14837600 :             if (*maxchar > MAX_UNICODE) {
    1507           3 :                 PyErr_Format(PyExc_ValueError,
    1508             :                              "character U+%x is not in range [U+0000; U+%x]",
    1509             :                              ch, MAX_UNICODE);
    1510           3 :                 return -1;
    1511             :             }
    1512             :         }
    1513             :     }
    1514     2787560 :     return 0;
    1515             : }
    1516             : 
    1517             : static void
    1518   249125000 : unicode_dealloc(PyObject *unicode)
    1519             : {
    1520             : #ifdef Py_DEBUG
    1521   249125000 :     if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
    1522           0 :         _Py_FatalRefcountError("deallocating an Unicode singleton");
    1523             :     }
    1524             : #endif
    1525             : 
    1526   249125000 :     if (PyUnicode_CHECK_INTERNED(unicode)) {
    1527             :         /* Revive the dead object temporarily. PyDict_DelItem() removes two
    1528             :            references (key and value) which were ignored by
    1529             :            PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
    1530             :            to prevent calling unicode_dealloc() again. Adjust refcnt after
    1531             :            PyDict_DelItem(). */
    1532    19698300 :         assert(Py_REFCNT(unicode) == 0);
    1533    19698300 :         Py_SET_REFCNT(unicode, 3);
    1534    19698300 :         if (PyDict_DelItem(interned, unicode) != 0) {
    1535           0 :             _PyErr_WriteUnraisableMsg("deletion of interned string failed",
    1536             :                                       NULL);
    1537             :         }
    1538    19698300 :         assert(Py_REFCNT(unicode) == 1);
    1539    19698300 :         Py_SET_REFCNT(unicode, 0);
    1540             :     }
    1541             : 
    1542   249125000 :     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
    1543        4181 :         PyObject_Free(_PyUnicode_UTF8(unicode));
    1544             :     }
    1545   249125000 :     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
    1546      128543 :         PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    1547             :     }
    1548             : 
    1549   249125000 :     Py_TYPE(unicode)->tp_free(unicode);
    1550   249125000 : }
    1551             : 
    1552             : #ifdef Py_DEBUG
    1553             : static int
    1554   264137000 : unicode_is_singleton(PyObject *unicode)
    1555             : {
    1556   264137000 :     if (unicode == &_Py_STR(empty)) {
    1557           0 :         return 1;
    1558             :     }
    1559             : 
    1560   264137000 :     PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    1561   264137000 :     if (ascii->length == 1) {
    1562    24690400 :         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    1563    24690400 :         if (ch < 256 && LATIN1(ch) == unicode) {
    1564           0 :             return 1;
    1565             :         }
    1566             :     }
    1567   264137000 :     return 0;
    1568             : }
    1569             : #endif
    1570             : 
    1571             : static int
    1572    17166300 : unicode_modifiable(PyObject *unicode)
    1573             : {
    1574    17166300 :     assert(_PyUnicode_CHECK(unicode));
    1575    17166300 :     if (Py_REFCNT(unicode) != 1)
    1576     2151770 :         return 0;
    1577    15014600 :     if (_PyUnicode_HASH(unicode) != -1)
    1578         233 :         return 0;
    1579    15014300 :     if (PyUnicode_CHECK_INTERNED(unicode))
    1580           0 :         return 0;
    1581    15014300 :     if (!PyUnicode_CheckExact(unicode))
    1582           0 :         return 0;
    1583             : #ifdef Py_DEBUG
    1584             :     /* singleton refcount is greater than 1 */
    1585    15014300 :     assert(!unicode_is_singleton(unicode));
    1586             : #endif
    1587    15014300 :     return 1;
    1588             : }
    1589             : 
    1590             : static int
    1591     1402650 : unicode_resize(PyObject **p_unicode, Py_ssize_t length)
    1592             : {
    1593             :     PyObject *unicode;
    1594             :     Py_ssize_t old_length;
    1595             : 
    1596     1402650 :     assert(p_unicode != NULL);
    1597     1402650 :     unicode = *p_unicode;
    1598             : 
    1599     1402650 :     assert(unicode != NULL);
    1600     1402650 :     assert(PyUnicode_Check(unicode));
    1601     1402650 :     assert(0 <= length);
    1602             : 
    1603     1402650 :     old_length = PyUnicode_GET_LENGTH(unicode);
    1604     1402650 :     if (old_length == length)
    1605           0 :         return 0;
    1606             : 
    1607     1402650 :     if (length == 0) {
    1608           0 :         PyObject *empty = unicode_new_empty();
    1609           0 :         Py_SETREF(*p_unicode, empty);
    1610           0 :         return 0;
    1611             :     }
    1612             : 
    1613     1402650 :     if (!unicode_modifiable(unicode)) {
    1614           0 :         PyObject *copy = resize_copy(unicode, length);
    1615           0 :         if (copy == NULL)
    1616           0 :             return -1;
    1617           0 :         Py_SETREF(*p_unicode, copy);
    1618           0 :         return 0;
    1619             :     }
    1620             : 
    1621     1402650 :     if (PyUnicode_IS_COMPACT(unicode)) {
    1622     1402650 :         PyObject *new_unicode = resize_compact(unicode, length);
    1623     1402650 :         if (new_unicode == NULL)
    1624           0 :             return -1;
    1625     1402650 :         *p_unicode = new_unicode;
    1626     1402650 :         return 0;
    1627             :     }
    1628           0 :     return resize_inplace(unicode, length);
    1629             : }
    1630             : 
    1631             : int
    1632           0 : PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
    1633             : {
    1634             :     PyObject *unicode;
    1635           0 :     if (p_unicode == NULL) {
    1636           0 :         PyErr_BadInternalCall();
    1637           0 :         return -1;
    1638             :     }
    1639           0 :     unicode = *p_unicode;
    1640           0 :     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
    1641             :     {
    1642           0 :         PyErr_BadInternalCall();
    1643           0 :         return -1;
    1644             :     }
    1645           0 :     return unicode_resize(p_unicode, length);
    1646             : }
    1647             : 
    1648             : /* Copy an ASCII or latin1 char* string into a Python Unicode string.
    1649             : 
    1650             :    WARNING: The function doesn't copy the terminating null character and
    1651             :    doesn't check the maximum character (may write a latin1 character in an
    1652             :    ASCII string). */
    1653             : static void
    1654           3 : unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
    1655             :                    const char *str, Py_ssize_t len)
    1656             : {
    1657           3 :     int kind = PyUnicode_KIND(unicode);
    1658           3 :     const void *data = PyUnicode_DATA(unicode);
    1659           3 :     const char *end = str + len;
    1660             : 
    1661           3 :     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
    1662           3 :     switch (kind) {
    1663           3 :     case PyUnicode_1BYTE_KIND: {
    1664             : #ifdef Py_DEBUG
    1665           3 :         if (PyUnicode_IS_ASCII(unicode)) {
    1666           3 :             Py_UCS4 maxchar = ucs1lib_find_max_char(
    1667             :                 (const Py_UCS1*)str,
    1668             :                 (const Py_UCS1*)str + len);
    1669           3 :             assert(maxchar < 128);
    1670             :         }
    1671             : #endif
    1672           3 :         memcpy((char *) data + index, str, len);
    1673           3 :         break;
    1674             :     }
    1675           0 :     case PyUnicode_2BYTE_KIND: {
    1676           0 :         Py_UCS2 *start = (Py_UCS2 *)data + index;
    1677           0 :         Py_UCS2 *ucs2 = start;
    1678             : 
    1679           0 :         for (; str < end; ++ucs2, ++str)
    1680           0 :             *ucs2 = (Py_UCS2)*str;
    1681             : 
    1682           0 :         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
    1683           0 :         break;
    1684             :     }
    1685           0 :     case PyUnicode_4BYTE_KIND: {
    1686           0 :         Py_UCS4 *start = (Py_UCS4 *)data + index;
    1687           0 :         Py_UCS4 *ucs4 = start;
    1688             : 
    1689           0 :         for (; str < end; ++ucs4, ++str)
    1690           0 :             *ucs4 = (Py_UCS4)*str;
    1691             : 
    1692           0 :         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
    1693           0 :         break;
    1694             :     }
    1695           0 :     default:
    1696           0 :         Py_UNREACHABLE();
    1697             :     }
    1698           3 : }
    1699             : 
    1700             : static PyObject*
    1701    79829400 : get_latin1_char(Py_UCS1 ch)
    1702             : {
    1703    79829400 :     return Py_NewRef(LATIN1(ch));
    1704             : }
    1705             : 
    1706             : static PyObject*
    1707    68520600 : unicode_char(Py_UCS4 ch)
    1708             : {
    1709             :     PyObject *unicode;
    1710             : 
    1711    68520600 :     assert(ch <= MAX_UNICODE);
    1712             : 
    1713    68520600 :     if (ch < 256) {
    1714    56459500 :         return get_latin1_char(ch);
    1715             :     }
    1716             : 
    1717    12061100 :     unicode = PyUnicode_New(1, ch);
    1718    12061100 :     if (unicode == NULL)
    1719           0 :         return NULL;
    1720             : 
    1721    12061100 :     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
    1722    12061100 :     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
    1723     1558160 :         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
    1724             :     } else {
    1725    10502900 :         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    1726    10502900 :         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
    1727             :     }
    1728    12061100 :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    1729    12061100 :     return unicode;
    1730             : }
    1731             : 
    1732             : PyObject *
    1733     2805300 : PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
    1734             : {
    1735             :     PyObject *unicode;
    1736     2805300 :     Py_UCS4 maxchar = 0;
    1737             :     Py_ssize_t num_surrogates;
    1738             : 
    1739     2805300 :     if (u == NULL && size != 0) {
    1740           0 :         PyErr_BadInternalCall();
    1741           0 :         return NULL;
    1742             :     }
    1743             : 
    1744     2805300 :     if (size == -1) {
    1745      348089 :         size = wcslen(u);
    1746             :     }
    1747             : 
    1748             :     /* If the Unicode data is known at construction time, we can apply
    1749             :        some optimizations which share commonly used objects. */
    1750             : 
    1751             :     /* Optimization for empty strings */
    1752     2805300 :     if (size == 0)
    1753        3508 :         _Py_RETURN_UNICODE_EMPTY();
    1754             : 
    1755             : #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    1756             :     /* Oracle Solaris uses non-Unicode internal wchar_t form for
    1757             :        non-Unicode locales and hence needs conversion to UCS-4 first. */
    1758             :     if (_Py_LocaleUsesNonUnicodeWchar()) {
    1759             :         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
    1760             :         if (!converted) {
    1761             :             return NULL;
    1762             :         }
    1763             :         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
    1764             :         PyMem_Free(converted);
    1765             :         return unicode;
    1766             :     }
    1767             : #endif
    1768             : 
    1769             :     /* Single character Unicode objects in the Latin-1 range are
    1770             :        shared when using this constructor */
    1771     2801790 :     if (size == 1 && (Py_UCS4)*u < 256)
    1772       14222 :         return get_latin1_char((unsigned char)*u);
    1773             : 
    1774             :     /* If not empty and not single character, copy the Unicode data
    1775             :        into the new object */
    1776     2787570 :     if (find_maxchar_surrogates(u, u + size,
    1777             :                                 &maxchar, &num_surrogates) == -1)
    1778           3 :         return NULL;
    1779             : 
    1780     2787560 :     unicode = PyUnicode_New(size - num_surrogates, maxchar);
    1781     2787560 :     if (!unicode)
    1782           0 :         return NULL;
    1783             : 
    1784     2787560 :     switch (PyUnicode_KIND(unicode)) {
    1785     2787390 :     case PyUnicode_1BYTE_KIND:
    1786    20955700 :         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
    1787             :                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
    1788     2787390 :         break;
    1789         134 :     case PyUnicode_2BYTE_KIND:
    1790             : #if Py_UNICODE_SIZE == 2
    1791             :         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
    1792             : #else
    1793        1935 :         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
    1794             :                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
    1795             : #endif
    1796         134 :         break;
    1797          42 :     case PyUnicode_4BYTE_KIND:
    1798             : #if SIZEOF_WCHAR_T == 2
    1799             :         /* This is the only case which has to process surrogates, thus
    1800             :            a simple copy loop is not enough and we need a function. */
    1801             :         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
    1802             : #else
    1803          42 :         assert(num_surrogates == 0);
    1804          42 :         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
    1805             : #endif
    1806          42 :         break;
    1807           0 :     default:
    1808           0 :         Py_UNREACHABLE();
    1809             :     }
    1810             : 
    1811     2787560 :     return unicode_result(unicode);
    1812             : }
    1813             : 
                       /* Build a str object from the `size` bytes starting at `u`.
                          Outcomes, in the order the checks run:
                            - size < 0: SystemError is set and NULL is returned.
                            - u != NULL: the result of
                              PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL) is returned as-is.
                            - u == NULL and size > 0: SystemError is set and NULL is returned.
                            - u == NULL and size == 0: unicode_new_empty() is returned. */
    1814             : PyObject *
    1815     4982840 : PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
    1816             : {
    1817     4982840 :     if (size < 0) {
    1818           0 :         PyErr_SetString(PyExc_SystemError,
    1819             :                         "Negative size passed to PyUnicode_FromStringAndSize");
    1820           0 :         return NULL;
    1821             :     }
    1822     4982840 :     if (u != NULL) {
    1823     4876270 :         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
    1824             :     }
                           /* From here on u is NULL: only a zero length is acceptable. */
    1825      106568 :     if (size > 0) {
    1826           0 :         PyErr_SetString(PyExc_SystemError,
    1827             :             "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
    1828           0 :         return NULL;
    1829             :     }
    1830      106568 :     return unicode_new_empty();
    1831             : }
    1832             : 
                       /* Build a str object from a NUL-terminated C string.
                          The length is taken with strlen(u), which is called unconditionally,
                          so `u` must not be NULL. A length above PY_SSIZE_T_MAX sets
                          OverflowError and returns NULL; otherwise the result of
                          PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL) is returned as-is. */
    1833             : PyObject *
    1834    48587000 : PyUnicode_FromString(const char *u)
    1835             : {
    1836    48587000 :     size_t size = strlen(u);
                           /* strlen() yields size_t; reject values the Py_ssize_t cast below
                              could not represent. */
    1837    48587000 :     if (size > PY_SSIZE_T_MAX) {
    1838           0 :         PyErr_SetString(PyExc_OverflowError, "input too long");
    1839           0 :         return NULL;
    1840             :     }
    1841    48587000 :     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
    1842             : }
    1843             : 
    1844             : 
                       /* Return the str object for identifier `id`, creating and caching it on
                          first use.

                          Two pieces of state are involved:
                            - id->index: the slot number of this identifier. It is assigned once,
                              from the runtime-wide counter interp->runtime->unicode_ids.next_index,
                              while holding rt_ids->lock.
                            - interp->unicode.ids: the array of cached objects, indexed by that
                              slot number, for the interpreter state returned by
                              _PyInterpreterState_GET().

                          Returns a borrowed reference (the array keeps the strong one), or NULL
                          if decoding id->string fails or the array cannot be grown. */
    1845             : PyObject *
    1846      149454 : _PyUnicode_FromId(_Py_Identifier *id)
    1847             : {
    1848      149454 :     PyInterpreterState *interp = _PyInterpreterState_GET();
    1849      149454 :     struct _Py_unicode_ids *ids = &interp->unicode.ids;
    1850             : 
                           // A negative index means no slot has been assigned to this id yet.
    1851      149454 :     Py_ssize_t index = _Py_atomic_size_get(&id->index);
    1852      149454 :     if (index < 0) {
    1853        6677 :         struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
    1854             : 
    1855        6677 :         PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
    1856             :         // Check again to detect concurrent access. Another thread can have
    1857             :         // initialized the index while this thread waited for the lock.
    1858        6677 :         index = _Py_atomic_size_get(&id->index);
    1859        6677 :         if (index < 0) {
    1860        6677 :             assert(rt_ids->next_index < PY_SSIZE_T_MAX);
    1861        6677 :             index = rt_ids->next_index;
    1862        6677 :             rt_ids->next_index++;
    1863        6677 :             _Py_atomic_size_set(&id->index, index);
    1864             :         }
    1865        6677 :         PyThread_release_lock(rt_ids->lock);
    1866             :     }
    1867      149454 :     assert(index >= 0);
    1868             : 
                           // Fast path: the slot exists in this array and is already filled.
    1869             :     PyObject *obj;
    1870      149454 :     if (index < ids->size) {
    1871      148599 :         obj = ids->array[index];
    1872      148599 :         if (obj) {
    1873             :             // Return a borrowed reference
    1874      142775 :             return obj;
    1875             :         }
    1876             :     }
    1877             : 
                           // Slot is out of range or still NULL: decode id->string and intern
                           // the result.
    1878        6679 :     obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
    1879             :                                        NULL, NULL);
    1880        6679 :     if (!obj) {
    1881           0 :         return NULL;
    1882             :     }
    1883        6679 :     PyUnicode_InternInPlace(&obj);
    1884             : 
    1885        6679 :     if (index >= ids->size) {
    1886             :         // Overallocate to reduce the number of realloc
    1887         855 :         Py_ssize_t new_size = Py_MAX(index * 2, 16);
    1888         855 :         Py_ssize_t item_size = sizeof(ids->array[0]);
    1889         855 :         PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
    1890         855 :         if (new_array == NULL) {
                               // NOTE(review): obj, created and interned above, is not
                               // released on this path; this looks like a leaked
                               // reference -- confirm. The path has 0 hits in this run.
    1891           0 :             PyErr_NoMemory();
    1892           0 :             return NULL;
    1893             :         }
                           // Zero the newly added tail so unfilled slots read as NULL.
    1894         855 :         memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
    1895         855 :         ids->array = new_array;
    1896         855 :         ids->size = new_size;
    1897             :     }
    1898             : 
    1899             :     // The array stores a strong reference
    1900        6679 :     ids->array[index] = obj;
    1901             : 
    1902             :     // Return a borrowed reference
    1903        6679 :     return obj;
    1904             : }
    1905             : 
    1906             : 
                       /* Drop every cached identifier string held in state->ids and free the
                          array itself, leaving ids empty (size 0, array NULL). */
    1907             : static void
    1908        3120 : unicode_clear_identifiers(struct _Py_unicode_state *state)
    1909             : {
    1910        3120 :     struct _Py_unicode_ids *ids = &state->ids;
                           // Py_XDECREF: slots that were never filled are NULL.
    1911       16800 :     for (Py_ssize_t i=0; i < ids->size; i++) {
    1912       13680 :         Py_XDECREF(ids->array[i]);
    1913             :     }
    1914        3120 :     ids->size = 0;
    1915        3120 :     PyMem_Free(ids->array);
    1916        3120 :     ids->array = NULL;
    1917             :     // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
    1918             :     // after Py_Finalize().
    1919        3120 : }
    1920             : 
    1921             : 
    1922             : /* Internal function, doesn't check maximum character */
                       /* Build a str from the `size` bytes at `buffer`.
                          The bytes are not inspected: the object is allocated with
                          PyUnicode_New(size, 127) and the data is memcpy'd in, so the caller is
                          responsible for every byte fitting that maximum. A single byte is
                          served by get_latin1_char() instead; only Py_DEBUG builds assert that
                          it is below 128. Returns NULL if PyUnicode_New() fails. */
    1923             : 
    1924             : PyObject*
    1925    32171400 : _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
    1926             : {
    1927    32171400 :     const unsigned char *s = (const unsigned char *)buffer;
    1928             :     PyObject *unicode;
    1929    32171400 :     if (size == 1) {
    1930             : #ifdef Py_DEBUG
    1931     6940320 :         assert((unsigned char)s[0] < 128);
    1932             : #endif
    1933     6940320 :         return get_latin1_char(s[0]);
    1934             :     }
    1935    25231000 :     unicode = PyUnicode_New(size, 127);
    1936    25231000 :     if (!unicode)
    1937           0 :         return NULL;
    1938    25231000 :     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
    1939    25231000 :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    1940    25231000 :     return unicode;
    1941             : }
    1942             : 
                       /* Map a PyUnicode_*_KIND value to a code point threshold:
                          0x80 for 1-byte, 0x100 for 2-byte, 0x10000 for 4-byte.
                          NOTE(review): presumably the smallest maximum character for which a
                          string is expected to keep this kind (ASCII vs. non-ASCII for the
                          1-byte case) -- confirm against the callers, which are outside this
                          part of the listing. Any other kind hits Py_UNREACHABLE(). */
    1943             : static Py_UCS4
    1944           9 : kind_maxchar_limit(int kind)
    1945             : {
    1946           9 :     switch (kind) {
    1947           0 :     case PyUnicode_1BYTE_KIND:
    1948           0 :         return 0x80;
    1949           9 :     case PyUnicode_2BYTE_KIND:
    1950           9 :         return 0x100;
    1951           0 :     case PyUnicode_4BYTE_KIND:
    1952           0 :         return 0x10000;
    1953           0 :     default:
    1954           0 :         Py_UNREACHABLE();
    1955             :     }
    1956             : }
    1957             : 
                       /* Build a str from `size` Py_UCS1 units at `u`.
                            - size == 0: returns through _Py_RETURN_UNICODE_EMPTY().
                            - size == 1: returns get_latin1_char(u[0]).
                            - otherwise: the maximum character is computed with
                              ucs1lib_find_max_char(), a new object is allocated for it and the
                              bytes are copied unchanged.
                          A negative size is only caught by the assert. Returns NULL if
                          PyUnicode_New() fails. */
    1958             : static PyObject*
    1959    57429400 : _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
    1960             : {
    1961             :     PyObject *res;
    1962             :     unsigned char max_char;
    1963             : 
    1964    57429400 :     if (size == 0) {
    1965      128649 :         _Py_RETURN_UNICODE_EMPTY();
    1966             :     }
    1967    57300800 :     assert(size > 0);
    1968    57300800 :     if (size == 1) {
    1969     6756000 :         return get_latin1_char(u[0]);
    1970             :     }
    1971             : 
    1972    50544800 :     max_char = ucs1lib_find_max_char(u, u + size);
    1973    50544800 :     res = PyUnicode_New(size, max_char);
    1974    50544800 :     if (!res)
    1975           0 :         return NULL;
    1976    50544800 :     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
    1977    50544800 :     assert(_PyUnicode_CheckConsistency(res, 1));
    1978    50544800 :     return res;
    1979             : }
    1980             : 
                       /* Build a str from `size` Py_UCS2 units at `u`.
                            - size == 0: returns through _Py_RETURN_UNICODE_EMPTY().
                            - size == 1: delegated to unicode_char(u[0]).
                            - otherwise: the maximum character is computed with
                              ucs2lib_find_max_char() and a new object is allocated for it.
                          A negative size is only caught by the assert. Returns NULL if
                          PyUnicode_New() fails. */
    1981             : static PyObject*
    1982      320637 : _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
    1983             : {
    1984             :     PyObject *res;
    1985             :     Py_UCS2 max_char;
    1986             : 
    1987      320637 :     if (size == 0)
    1988        7952 :         _Py_RETURN_UNICODE_EMPTY();
    1989      312685 :     assert(size > 0);
    1990      312685 :     if (size == 1)
    1991       26019 :         return unicode_char(u[0]);
    1992             : 
    1993      286666 :     max_char = ucs2lib_find_max_char(u, u + size);
    1994      286666 :     res = PyUnicode_New(size, max_char);
    1995      286666 :     if (!res)
    1996           0 :         return NULL;
                           /* max_char >= 256: copy the 16-bit units as they are.
                              Otherwise every unit fits in one byte and is narrowed to Py_UCS1. */
    1997      286666 :     if (max_char >= 256)
    1998      132158 :         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
    1999             :     else {
    2000     1676490 :         _PyUnicode_CONVERT_BYTES(
    2001             :             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
    2002             :     }
    2003      286666 :     assert(_PyUnicode_CheckConsistency(res, 1));
    2004      286666 :     return res;
    2005             : }
    2006             : 
                       /* Build a str from `size` Py_UCS4 units at `u`.
                            - size == 0: returns through _Py_RETURN_UNICODE_EMPTY().
                            - size == 1: delegated to unicode_char(u[0]).
                            - otherwise: the maximum character is computed with
                              ucs4lib_find_max_char() and a new object is allocated for it.
                          A negative size is only caught by the assert. Returns NULL if
                          PyUnicode_New() fails. */
    2007             : static PyObject*
    2008      407434 : _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
    2009             : {
    2010             :     PyObject *res;
    2011             :     Py_UCS4 max_char;
    2012             : 
    2013      407434 :     if (size == 0)
    2014        7521 :         _Py_RETURN_UNICODE_EMPTY();
    2015      399913 :     assert(size > 0);
    2016      399913 :     if (size == 1)
    2017      101880 :         return unicode_char(u[0]);
    2018             : 
    2019      298033 :     max_char = ucs4lib_find_max_char(u, u + size);
    2020      298033 :     res = PyUnicode_New(size, max_char);
    2021      298033 :     if (!res)
    2022           0 :         return NULL;
                           /* Narrow to the width that matches max_char: one byte below 256,
                              two bytes below 0x10000, otherwise copy the 32-bit units as-is. */
    2023      298033 :     if (max_char < 256)
    2024    21807100 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
    2025             :                                  PyUnicode_1BYTE_DATA(res));
    2026      164243 :     else if (max_char < 0x10000)
    2027      635729 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
    2028             :                                  PyUnicode_2BYTE_DATA(res));
    2029             :     else
    2030       12276 :         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
    2031      298033 :     assert(_PyUnicode_CheckConsistency(res, 1));
    2032      298033 :     return res;
    2033             : }
    2034             : 
                       /* Build a str from `size` code units of width `kind` stored at `buffer`.
                          Dispatches to _PyUnicode_FromUCS1/2/4 according to `kind`.
                          Errors (NULL returned):
                            - size < 0: ValueError. Note that size == 0 passes this check even
                              though the message says "positive"; the helpers return the empty
                              string for it.
                            - kind is none of the three PyUnicode_*_KIND values: SystemError. */
    2035             : PyObject*
    2036    52966000 : PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
    2037             : {
    2038    52966000 :     if (size < 0) {
    2039           0 :         PyErr_SetString(PyExc_ValueError, "size must be positive");
    2040           0 :         return NULL;
    2041             :     }
    2042    52966000 :     switch (kind) {
    2043    52405400 :     case PyUnicode_1BYTE_KIND:
    2044    52405400 :         return _PyUnicode_FromUCS1(buffer, size);
    2045      159495 :     case PyUnicode_2BYTE_KIND:
    2046      159495 :         return _PyUnicode_FromUCS2(buffer, size);
    2047      401028 :     case PyUnicode_4BYTE_KIND:
    2048      401028 :         return _PyUnicode_FromUCS4(buffer, size);
    2049           0 :     default:
    2050           0 :         PyErr_SetString(PyExc_SystemError, "invalid kind");
    2051           0 :         return NULL;
    2052             :     }
    2053             : }
    2054             : 
                       /* Return a maximum-character value for the slice [start, end) of
                          `unicode`. The bounds are only checked by asserts
                          (0 <= start <= end <= length).
                            - whole string: PyUnicode_MAX_CHAR_VALUE(unicode), no scan.
                            - empty slice, or an ASCII string: 127, no scan.
                            - otherwise: whatever the ucsNlib_find_max_char() helper matching the
                              string's kind reports for the slice. */
    2055             : Py_UCS4
    2056      988732 : _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
    2057             : {
    2058             :     int kind;
    2059             :     const void *startptr, *endptr;
    2060             : 
    2061      988732 :     assert(0 <= start);
    2062      988732 :     assert(end <= PyUnicode_GET_LENGTH(unicode));
    2063      988732 :     assert(start <= end);
    2064             : 
    2065      988732 :     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
    2066      187414 :         return PyUnicode_MAX_CHAR_VALUE(unicode);
    2067             : 
    2068      801318 :     if (start == end)
    2069        3074 :         return 127;
    2070             : 
    2071      798244 :     if (PyUnicode_IS_ASCII(unicode))
    2072      797817 :         return 127;
    2073             : 
                           /* `kind` doubles as the byte width of one character in the pointer
                              arithmetic below. endptr is derived first because startptr is
                              overwritten on the following line. */
    2074         427 :     kind = PyUnicode_KIND(unicode);
    2075         427 :     startptr = PyUnicode_DATA(unicode);
    2076         427 :     endptr = (char *)startptr + end * kind;
    2077         427 :     startptr = (char *)startptr + start * kind;
    2078         427 :     switch(kind) {
    2079          33 :     case PyUnicode_1BYTE_KIND:
    2080          33 :         return ucs1lib_find_max_char(startptr, endptr);
    2081         394 :     case PyUnicode_2BYTE_KIND:
    2082         394 :         return ucs2lib_find_max_char(startptr, endptr);
    2083           0 :     case PyUnicode_4BYTE_KIND:
    2084           0 :         return ucs4lib_find_max_char(startptr, endptr);
    2085           0 :     default:
    2086           0 :         Py_UNREACHABLE();
    2087             :     }
    2088             : }
    2089             : 
    2090             : /* Ensure that a string uses the most efficient storage, if it is not the
    2091             :    case: create a new string of the right kind. Write NULL into *p_unicode
    2092             :    on error. */
                       /* When a replacement is created, the reference held in *p_unicode is
                          released and *p_unicode is overwritten with the copy. When the string
                          is already as compact as possible, *p_unicode is left untouched. */
    2093             : static void
    2094         140 : unicode_adjust_maxchar(PyObject **p_unicode)
    2095             : {
    2096             :     PyObject *unicode, *copy;
    2097             :     Py_UCS4 max_char;
    2098             :     Py_ssize_t len;
    2099             :     int kind;
    2100             : 
    2101         140 :     assert(p_unicode != NULL);
    2102         140 :     unicode = *p_unicode;
                           /* ASCII strings are returned unchanged. */
    2103         140 :     if (PyUnicode_IS_ASCII(unicode))
    2104           0 :         return;
    2105             : 
                           /* Scan the data for its real maximum character. Each branch returns
                              early when that maximum still requires the current kind. */
    2106         140 :     len = PyUnicode_GET_LENGTH(unicode);
    2107         140 :     kind = PyUnicode_KIND(unicode);
    2108         140 :     if (kind == PyUnicode_1BYTE_KIND) {
    2109           2 :         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
    2110           2 :         max_char = ucs1lib_find_max_char(u, u + len);
    2111           2 :         if (max_char >= 128)
    2112           0 :             return;
    2113             :     }
    2114         138 :     else if (kind == PyUnicode_2BYTE_KIND) {
    2115         126 :         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
    2116         126 :         max_char = ucs2lib_find_max_char(u, u + len);
    2117         126 :         if (max_char >= 256)
    2118           2 :             return;
    2119             :     }
    2120          12 :     else if (kind == PyUnicode_4BYTE_KIND) {
    2121          12 :         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
    2122          12 :         max_char = ucs4lib_find_max_char(u, u + len);
    2123          12 :         if (max_char >= 0x10000)
    2124           4 :             return;
    2125             :     }
    2126             :     else
    2127           0 :         Py_UNREACHABLE();
    2128             : 
                           /* A narrower representation is enough. `copy` may be NULL if the
                              allocation failed; the original is released either way, so
                              *p_unicode ends up NULL on error. */
    2129         134 :     copy = PyUnicode_New(len, max_char);
    2130         134 :     if (copy != NULL)
    2131         134 :         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
    2132         134 :     Py_DECREF(unicode);
    2133         134 :     *p_unicode = copy;
    2134             : }
    2135             : 
                       /* Return a newly allocated str with the same length, maximum character
                          value and character data as `unicode`.
                          If `unicode` fails PyUnicode_Check(), PyErr_BadInternalCall() is raised
                          and NULL is returned. NULL is also returned if PyUnicode_New() fails. */
    2136             : PyObject*
    2137       61705 : _PyUnicode_Copy(PyObject *unicode)
    2138             : {
    2139             :     Py_ssize_t length;
    2140             :     PyObject *copy;
    2141             : 
    2142       61705 :     if (!PyUnicode_Check(unicode)) {
    2143           0 :         PyErr_BadInternalCall();
    2144           0 :         return NULL;
    2145             :     }
    2146             : 
    2147       61705 :     length = PyUnicode_GET_LENGTH(unicode);
    2148       61705 :     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    2149       61705 :     if (!copy)
    2150           0 :         return NULL;
    2151       61705 :     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
    2152             : 
                           /* Both objects have the same kind (asserted above), so the raw data
                              can be copied as length * kind bytes. */
    2153       61705 :     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
    2154       61705 :               length * PyUnicode_KIND(unicode));
    2155       61705 :     assert(_PyUnicode_CheckConsistency(copy, 1));
    2156       61705 :     return copy;
    2157             : }
    2158             : 
    2159             : 
    2160             : /* Widen Unicode objects to larger buffers. Don't write terminating null
    2161             :    character. Return NULL on error. */
                       /* `skind`, `data` and `len` describe the source characters; `kind` is the
                          wider target kind (skind < kind is asserted). The result is a fresh
                          PyMem_New() buffer holding `len` units of the target width. On
                          allocation failure the function returns what PyErr_NoMemory() returns.
                          NOTE(review): the caller presumably releases the buffer with
                          PyMem_Free() -- callers are not visible in this part of the listing. */
    2162             : 
    2163             : static void*
    2164       34647 : unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
    2165             : {
    2166             :     void *result;
    2167             : 
    2168       34647 :     assert(skind < kind);
    2169       34647 :     switch (kind) {
    2170       31327 :     case PyUnicode_2BYTE_KIND:
    2171       31327 :         result = PyMem_New(Py_UCS2, len);
    2172       31327 :         if (!result)
    2173           0 :             return PyErr_NoMemory();
                               /* The only kind narrower than 2-byte is 1-byte. */
    2174       31327 :         assert(skind == PyUnicode_1BYTE_KIND);
    2175       65019 :         _PyUnicode_CONVERT_BYTES(
    2176             :             Py_UCS1, Py_UCS2,
    2177             :             (const Py_UCS1 *)data,
    2178             :             ((const Py_UCS1 *)data) + len,
    2179             :             result);
    2180       31327 :         return result;
    2181        3320 :     case PyUnicode_4BYTE_KIND:
    2182        3320 :         result = PyMem_New(Py_UCS4, len);
    2183        3320 :         if (!result)
    2184           0 :             return PyErr_NoMemory();
                               /* The source is either 2-byte or 1-byte data. */
    2185        3320 :         if (skind == PyUnicode_2BYTE_KIND) {
    2186         124 :             _PyUnicode_CONVERT_BYTES(
    2187             :                 Py_UCS2, Py_UCS4,
    2188             :                 (const Py_UCS2 *)data,
    2189             :                 ((const Py_UCS2 *)data) + len,
    2190             :                 result);
    2191             :         }
    2192             :         else {
    2193        3277 :             assert(skind == PyUnicode_1BYTE_KIND);
    2194        6631 :             _PyUnicode_CONVERT_BYTES(
    2195             :                 Py_UCS1, Py_UCS4,
    2196             :                 (const Py_UCS1 *)data,
    2197             :                 ((const Py_UCS1 *)data) + len,
    2198             :                 result);
    2199             :         }
    2200        3320 :         return result;
    2201           0 :     default:
    2202           0 :         Py_UNREACHABLE();
    2203             :         return NULL;
    2204             :     }
    2205             : }
    2206             : 
                       /* Copy the characters of `string` into a Py_UCS4 buffer, widening 1- and
                          2-byte data as needed.

                          target == NULL: a buffer of targetlen units is allocated with
                              PyMem_New() and `targetsize` is ignored.
                          target != NULL: it must hold at least targetlen units, otherwise
                              SystemError is set and NULL is returned.
                          targetlen is the string length, plus one when `copy_null` is true; in
                          that case a 0 is stored after the last character.

                          Returns the buffer that was written, or NULL with an exception set. */
    2207             : static Py_UCS4*
    2208      103231 : as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
    2209             :         int copy_null)
    2210             : {
    2211             :     int kind;
    2212             :     const void *data;
    2213             :     Py_ssize_t len, targetlen;
    2214      103231 :     kind = PyUnicode_KIND(string);
    2215      103231 :     data = PyUnicode_DATA(string);
    2216      103231 :     len = PyUnicode_GET_LENGTH(string);
    2217      103231 :     targetlen = len;
    2218      103231 :     if (copy_null)
    2219          70 :         targetlen++;
    2220      103231 :     if (!target) {
    2221          46 :         target = PyMem_New(Py_UCS4, targetlen);
    2222          46 :         if (!target) {
    2223           0 :             PyErr_NoMemory();
    2224           0 :             return NULL;
    2225             :         }
    2226             :     }
    2227             :     else {
    2228      103185 :         if (targetsize < targetlen) {
    2229          12 :             PyErr_Format(PyExc_SystemError,
    2230             :                          "string is longer than the buffer");
                                   /* Leave the caller's buffer as an empty 0-terminated string
                                      when a terminator was requested and there is room for it. */
    2231          12 :             if (copy_null && 0 < targetsize)
    2232           6 :                 target[0] = 0;
    2233          12 :             return NULL;
    2234             :         }
    2235             :     }
    2236      103219 :     if (kind == PyUnicode_1BYTE_KIND) {
    2237      102972 :         const Py_UCS1 *start = (const Py_UCS1 *) data;
    2238    24196800 :         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
    2239             :     }
    2240         247 :     else if (kind == PyUnicode_2BYTE_KIND) {
    2241         206 :         const Py_UCS2 *start = (const Py_UCS2 *) data;
    2242      585890 :         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
    2243             :     }
    2244          41 :     else if (kind == PyUnicode_4BYTE_KIND) {
    2245          41 :         memcpy(target, data, len * sizeof(Py_UCS4));
    2246             :     }
    2247             :     else {
    2248           0 :         Py_UNREACHABLE();
    2249             :     }
    2250      103219 :     if (copy_null)
    2251          64 :         target[len] = 0;
    2252      103219 :     return target;
    2253             : }
    2254             : 
                       /* Copy `string` into the caller-provided Py_UCS4 buffer `target` of
                          `targetsize` units, appending a 0 when `copy_null` is true.
                          A NULL target or a negative targetsize raises PyErr_BadInternalCall()
                          and returns NULL; everything else is handled by as_ucs4(). */
    2255             : Py_UCS4*
    2256      103185 : PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
    2257             :                  int copy_null)
    2258             : {
    2259      103185 :     if (target == NULL || targetsize < 0) {
    2260           0 :         PyErr_BadInternalCall();
    2261           0 :         return NULL;
    2262             :     }
    2263      103185 :     return as_ucs4(string, target, targetsize, copy_null);
    2264             : }
    2265             : 
                       /* Return the characters of `string` as a Py_UCS4 array.
                          Calls as_ucs4() with a NULL target and copy_null = 1, so the buffer is
                          allocated there with PyMem_New() and ends with a 0 unit. Returns NULL
                          if as_ucs4() fails. */
    2266             : Py_UCS4*
    2267          46 : PyUnicode_AsUCS4Copy(PyObject *string)
    2268             : {
    2269          46 :     return as_ucs4(string, NULL, 0, 1);
    2270             : }
    2271             : 
    2272             : /* maximum number of characters required for output of %lld or %p.
    2273             :    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
    2274             :    plus 1 for the sign.  53/22 is an upper bound for log10(256). */
                       /* The leading 2 is 1 for the sign plus the 1 that turns the truncating
                          integer division (n*53 - 1) / 22 into ceil(n*53 / 22). */
    2275             : #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
    2276             : 
                       /* Append `str` to `writer`, honouring a field width and a precision.
                            - precision: maximum number of characters taken from `str`;
                              -1 means no limit.
                            - width: minimum number of characters written; shorter output is
                              padded on the left with spaces.
                          Returns 0 on success and -1 on failure. When neither setting changes
                          the output, the result of _PyUnicodeWriter_WriteStr() is returned
                          directly. */
    2277             : static int
    2278    13744100 : unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
    2279             :                              Py_ssize_t width, Py_ssize_t precision)
    2280             : {
    2281             :     Py_ssize_t length, fill, arglen;
    2282             :     Py_UCS4 maxchar;
    2283             : 
                           /* Fast path: no truncation and no padding needed. */
    2284    13744100 :     length = PyUnicode_GET_LENGTH(str);
    2285    13744100 :     if ((precision == -1 || precision >= length)
    2286    13744000 :         && width <= length)
    2287    13744000 :         return _PyUnicodeWriter_WriteStr(writer, str);
    2288             : 
    2289          46 :     if (precision != -1)
    2290          37 :         length = Py_MIN(precision, length);
    2291             : 
                           /* arglen is the total number of characters to reserve. The prefix of
                              `str` is scanned for its maximum character only when `str` could
                              exceed the writer's current maxchar. */
    2292          46 :     arglen = Py_MAX(length, width);
    2293          46 :     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
    2294           5 :         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
    2295             :     else
    2296          41 :         maxchar = writer->maxchar;
    2297             : 
    2298          46 :     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
    2299           0 :         return -1;
    2300             : 
                           /* Left padding with spaces up to `width`. */
    2301          46 :     if (width > length) {
    2302          14 :         fill = width - length;
    2303          14 :         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
    2304           0 :             return -1;
    2305          14 :         writer->pos += fill;
    2306             :     }
    2307             : 
    2308          46 :     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
    2309             :                                   str, 0, length);
    2310          46 :     writer->pos += length;
    2311          46 :     return 0;
    2312             : }
    2313             : 
                       /* Append the C string `str` to `writer`.
                          The bytes are decoded with PyUnicode_DecodeUTF8Stateful() using the
                          "replace" error handler, then written through
                          unicode_fromformat_write_str().
                          `precision` limits the number of bytes read from `str`, not the number
                          of decoded characters: scanning stops at the first NUL byte or after
                          `precision` bytes. -1 means the whole string, measured with strlen().
                          Returns -1 if decoding fails, otherwise the result of
                          unicode_fromformat_write_str(). */
    2314             : static int
    2315     5103020 : unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
    2316             :                               Py_ssize_t width, Py_ssize_t precision)
    2317             : {
    2318             :     /* UTF-8 */
    2319             :     Py_ssize_t length;
    2320             :     PyObject *unicode;
    2321             :     int res;
    2322             : 
    2323     5103020 :     if (precision == -1) {
    2324      213236 :         length = strlen(str);
    2325             :     }
    2326             :     else {
                               /* Bounded scan: never reads past `precision` bytes. */
    2327     4889780 :         length = 0;
    2328    41580400 :         while (length < precision && str[length]) {
    2329    36690600 :             length++;
    2330             :         }
    2331             :     }
    2332     5103020 :     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
    2333     5103020 :     if (unicode == NULL)
    2334          32 :         return -1;
    2335             : 
                           /* The precision was already applied to the byte length above, so -1
                              (no limit) is passed on. */
    2336     5102990 :     res = unicode_fromformat_write_str(writer, unicode, width, -1);
    2337     5102990 :     Py_DECREF(unicode);
    2338     5102990 :     return res;
    2339             : }
    2340             : 
    2341             : static const char*
    2342    13898700 : unicode_fromformat_arg(_PyUnicodeWriter *writer,
    2343             :                        const char *f, va_list *vargs)
    2344             : {
    2345             :     const char *p;
    2346             :     Py_ssize_t len;
    2347             :     int zeropad;
    2348             :     Py_ssize_t width;
    2349             :     Py_ssize_t precision;
    2350             :     int longflag;
    2351             :     int longlongflag;
    2352             :     int size_tflag;
    2353             :     Py_ssize_t fill;
    2354             : 
    2355    13898700 :     p = f;
    2356    13898700 :     f++;
    2357    13898700 :     zeropad = 0;
    2358    13898700 :     if (*f == '0') {
    2359       10190 :         zeropad = 1;
    2360       10190 :         f++;
    2361             :     }
    2362             : 
    2363             :     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    2364    13898700 :     width = -1;
    2365    13898700 :     if (Py_ISDIGIT((unsigned)*f)) {
    2366       11439 :         width = *f - '0';
    2367       11439 :         f++;
    2368       11454 :         while (Py_ISDIGIT((unsigned)*f)) {
    2369          15 :             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
    2370           0 :                 PyErr_SetString(PyExc_ValueError,
    2371             :                                 "width too big");
    2372           0 :                 return NULL;
    2373             :             }
    2374          15 :             width = (width * 10) + (*f - '0');
    2375          15 :             f++;
    2376             :         }
    2377             :     }
    2378    13898700 :     precision = -1;
    2379    13898700 :     if (*f == '.') {
    2380     4896620 :         f++;
    2381     4896620 :         if (Py_ISDIGIT((unsigned)*f)) {
    2382     4896620 :             precision = (*f - '0');
    2383     4896620 :             f++;
    2384    11024900 :             while (Py_ISDIGIT((unsigned)*f)) {
    2385     6128280 :                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
    2386           0 :                     PyErr_SetString(PyExc_ValueError,
    2387             :                                     "precision too big");
    2388           0 :                     return NULL;
    2389             :                 }
    2390     6128280 :                 precision = (precision * 10) + (*f - '0');
    2391     6128280 :                 f++;
    2392             :             }
    2393             :         }
    2394     4896620 :         if (*f == '%') {
    2395             :             /* "%.3%s" => f points to "3" */
    2396           1 :             f--;
    2397             :         }
    2398             :     }
    2399    13898700 :     if (*f == '\0') {
    2400             :         /* bogus format "%.123" => go backward, f points to "3" */
    2401           1 :         f--;
    2402             :     }
    2403             : 
    2404             :     /* Handle %ld, %lu, %lld and %llu. */
    2405    13898700 :     longflag = 0;
    2406    13898700 :     longlongflag = 0;
    2407    13898700 :     size_tflag = 0;
    2408    13898700 :     if (*f == 'l') {
    2409        8267 :         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
    2410        8164 :             longflag = 1;
    2411        8164 :             ++f;
    2412             :         }
    2413         103 :         else if (f[1] == 'l' &&
    2414         103 :                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
    2415         103 :             longlongflag = 1;
    2416         103 :             f += 2;
    2417             :         }
    2418             :     }
    2419             :     /* handle the size_t flag. */
    2420    13890400 :     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
    2421       29786 :         size_tflag = 1;
    2422       29786 :         ++f;
    2423             :     }
    2424             : 
    2425    13898700 :     if (f[1] == '\0')
    2426      186073 :         writer->overallocate = 0;
    2427             : 
    2428    13898700 :     switch (*f) {
    2429       31262 :     case 'c':
    2430             :     {
    2431       31262 :         int ordinal = va_arg(*vargs, int);
    2432       31262 :         if (ordinal < 0 || ordinal > MAX_UNICODE) {
    2433           1 :             PyErr_SetString(PyExc_OverflowError,
    2434             :                             "character argument not in range(0x110000)");
    2435           1 :             return NULL;
    2436             :         }
    2437       31261 :         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
    2438           0 :             return NULL;
    2439       31261 :         break;
    2440             :     }
    2441             : 
    2442      113595 :     case 'i':
    2443             :     case 'd':
    2444             :     case 'u':
    2445             :     case 'x':
    2446             :     {
    2447             :         /* used by sprintf */
    2448             :         char buffer[MAX_LONG_LONG_CHARS];
    2449             :         Py_ssize_t arglen;
    2450             : 
    2451      113595 :         if (*f == 'u') {
    2452        6861 :             if (longflag) {
    2453        6203 :                 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
    2454             :             }
    2455         658 :             else if (longlongflag) {
    2456           6 :                 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
    2457             :             }
    2458         652 :             else if (size_tflag) {
    2459         148 :                 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
    2460             :             }
    2461             :             else {
    2462         504 :                 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
    2463             :             }
    2464             :         }
    2465      106734 :         else if (*f == 'x') {
    2466         130 :             len = sprintf(buffer, "%x", va_arg(*vargs, int));
    2467             :         }
    2468             :         else {
    2469      106604 :             if (longflag) {
    2470        1961 :                 len = sprintf(buffer, "%li", va_arg(*vargs, long));
    2471             :             }
    2472      104643 :             else if (longlongflag) {
    2473          97 :                 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
    2474             :             }
    2475      104546 :             else if (size_tflag) {
    2476       29638 :                 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
    2477             :             }
    2478             :             else {
    2479       74908 :                 len = sprintf(buffer, "%i", va_arg(*vargs, int));
    2480             :             }
    2481             :         }
    2482      113595 :         assert(len >= 0);
    2483             : 
    2484      113595 :         if (precision < len)
    2485      109931 :             precision = len;
    2486             : 
    2487      113595 :         arglen = Py_MAX(precision, width);
    2488      113595 :         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
    2489           0 :             return NULL;
    2490             : 
    2491      113595 :         if (width > precision) {
    2492             :             Py_UCS4 fillchar;
    2493        6493 :             fill = width - precision;
    2494        6493 :             fillchar = zeropad?'0':' ';
    2495        6493 :             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
    2496           0 :                 return NULL;
    2497        6493 :             writer->pos += fill;
    2498             :         }
    2499      113595 :         if (precision > len) {
    2500         821 :             fill = precision - len;
    2501         821 :             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
    2502           0 :                 return NULL;
    2503         821 :             writer->pos += fill;
    2504             :         }
    2505             : 
    2506      113595 :         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
    2507           0 :             return NULL;
    2508      113595 :         break;
    2509             :     }
    2510             : 
    2511        9045 :     case 'p':
    2512             :     {
    2513             :         char number[MAX_LONG_LONG_CHARS];
    2514             : 
    2515        9045 :         len = sprintf(number, "%p", va_arg(*vargs, void*));
    2516        9045 :         assert(len >= 0);
    2517             : 
    2518             :         /* %p is ill-defined:  ensure leading 0x. */
    2519        9045 :         if (number[1] == 'X')
    2520           0 :             number[1] = 'x';
    2521        9045 :         else if (number[1] != 'x') {
    2522           0 :             memmove(number + 2, number,
    2523           0 :                     strlen(number) + 1);
    2524           0 :             number[0] = '0';
    2525           0 :             number[1] = 'x';
    2526           0 :             len += 2;
    2527             :         }
    2528             : 
    2529        9045 :         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
    2530           0 :             return NULL;
    2531        9045 :         break;
    2532             :     }
    2533             : 
    2534     5103010 :     case 's':
    2535             :     {
    2536             :         /* UTF-8 */
    2537     5103010 :         const char *s = va_arg(*vargs, const char*);
    2538     5103010 :         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
    2539          32 :             return NULL;
    2540     5102980 :         break;
    2541             :     }
    2542             : 
    2543     8503850 :     case 'U':
    2544             :     {
    2545     8503850 :         PyObject *obj = va_arg(*vargs, PyObject *);
    2546     8503850 :         assert(obj && _PyUnicode_CHECK(obj));
    2547             : 
    2548     8503850 :         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
    2549           0 :             return NULL;
    2550     8503850 :         break;
    2551             :     }
    2552             : 
    2553        2114 :     case 'V':
    2554             :     {
    2555        2114 :         PyObject *obj = va_arg(*vargs, PyObject *);
    2556        2114 :         const char *str = va_arg(*vargs, const char *);
    2557        2114 :         if (obj) {
    2558        2106 :             assert(_PyUnicode_CHECK(obj));
    2559        2106 :             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
    2560           0 :                 return NULL;
    2561             :         }
    2562             :         else {
    2563           8 :             assert(str != NULL);
    2564           8 :             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
    2565           0 :                 return NULL;
    2566             :         }
    2567        2114 :         break;
    2568             :     }
    2569             : 
    2570       12234 :     case 'S':
    2571             :     {
    2572       12234 :         PyObject *obj = va_arg(*vargs, PyObject *);
    2573             :         PyObject *str;
    2574       12234 :         assert(obj);
    2575       12234 :         str = PyObject_Str(obj);
    2576       12234 :         if (!str)
    2577           2 :             return NULL;
    2578       12232 :         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
    2579           0 :             Py_DECREF(str);
    2580           0 :             return NULL;
    2581             :         }
    2582       12232 :         Py_DECREF(str);
    2583       12232 :         break;
    2584             :     }
    2585             : 
    2586      123404 :     case 'R':
    2587             :     {
    2588      123404 :         PyObject *obj = va_arg(*vargs, PyObject *);
    2589             :         PyObject *repr;
    2590      123404 :         assert(obj);
    2591      123404 :         repr = PyObject_Repr(obj);
    2592      123404 :         if (!repr)
    2593         501 :             return NULL;
    2594      122903 :         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
    2595           0 :             Py_DECREF(repr);
    2596           0 :             return NULL;
    2597             :         }
    2598      122903 :         Py_DECREF(repr);
    2599      122903 :         break;
    2600             :     }
    2601             : 
    2602           6 :     case 'A':
    2603             :     {
    2604           6 :         PyObject *obj = va_arg(*vargs, PyObject *);
    2605             :         PyObject *ascii;
    2606           6 :         assert(obj);
    2607           6 :         ascii = PyObject_ASCII(obj);
    2608           6 :         if (!ascii)
    2609           0 :             return NULL;
    2610           6 :         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
    2611           0 :             Py_DECREF(ascii);
    2612           0 :             return NULL;
    2613             :         }
    2614           6 :         Py_DECREF(ascii);
    2615           6 :         break;
    2616             :     }
    2617             : 
    2618         133 :     case '%':
    2619         133 :         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
    2620           0 :             return NULL;
    2621         133 :         break;
    2622             : 
    2623           3 :     default:
    2624             :         /* if we stumble upon an unknown formatting code, copy the rest
    2625             :            of the format string to the output string. (we cannot just
    2626             :            skip the code, since there's no way to know what's in the
    2627             :            argument list) */
    2628           3 :         len = strlen(p);
    2629           3 :         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
    2630           0 :             return NULL;
    2631           3 :         f = p+len;
    2632           3 :         return f;
    2633             :     }
    2634             : 
    2635    13898100 :     f++;
    2636    13898100 :     return f;
    2637             : }
    2638             : 
    2639             : PyObject *
    2640     7591900 : PyUnicode_FromFormatV(const char *format, va_list vargs)
    2641             : {
    2642             :     va_list vargs2;
    2643             :     const char *f;
    2644             :     _PyUnicodeWriter writer;
    2645             : 
    2646     7591900 :     _PyUnicodeWriter_Init(&writer);
    2647     7591900 :     writer.min_length = strlen(format) + 100;
    2648     7591900 :     writer.overallocate = 1;
    2649             : 
    2650             :     // Copy varags to be able to pass a reference to a subfunction.
    2651     7591900 :     va_copy(vargs2, vargs);
    2652             : 
    2653    42523400 :     for (f = format; *f; ) {
    2654    34932000 :         if (*f == '%') {
    2655    13898700 :             f = unicode_fromformat_arg(&writer, f, &vargs2);
    2656    13898700 :             if (f == NULL)
    2657         536 :                 goto fail;
    2658             :         }
    2659             :         else {
    2660             :             const char *p;
    2661             :             Py_ssize_t len;
    2662             : 
    2663    21033400 :             p = f;
    2664             :             do
    2665             :             {
    2666   228322000 :                 if ((unsigned char)*p > 127) {
    2667           1 :                     PyErr_Format(PyExc_ValueError,
    2668             :                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
    2669             :                         "string, got a non-ASCII byte: 0x%02x",
    2670           1 :                         (unsigned char)*p);
    2671           1 :                     goto fail;
    2672             :                 }
    2673   228322000 :                 p++;
    2674             :             }
    2675   228322000 :             while (*p != '\0' && *p != '%');
    2676    21033400 :             len = p - f;
    2677             : 
    2678    21033400 :             if (*p == '\0')
    2679     7405300 :                 writer.overallocate = 0;
    2680             : 
    2681    21033400 :             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
    2682           2 :                 goto fail;
    2683             : 
    2684    21033400 :             f = p;
    2685             :         }
    2686             :     }
    2687     7591360 :     va_end(vargs2);
    2688     7591360 :     return _PyUnicodeWriter_Finish(&writer);
    2689             : 
    2690         539 :   fail:
    2691         539 :     va_end(vargs2);
    2692         539 :     _PyUnicodeWriter_Dealloc(&writer);
    2693         539 :     return NULL;
    2694             : }
    2695             : 
    2696             : PyObject *
    2697      543485 : PyUnicode_FromFormat(const char *format, ...)
    2698             : {
    2699             :     PyObject* ret;
    2700             :     va_list vargs;
    2701             : 
    2702      543485 :     va_start(vargs, format);
    2703      543485 :     ret = PyUnicode_FromFormatV(format, vargs);
    2704      543485 :     va_end(vargs);
    2705      543485 :     return ret;
    2706             : }
    2707             : 
    2708             : static Py_ssize_t
    2709      452712 : unicode_get_widechar_size(PyObject *unicode)
    2710             : {
    2711             :     Py_ssize_t res;
    2712             : 
    2713      452712 :     assert(unicode != NULL);
    2714      452712 :     assert(_PyUnicode_CHECK(unicode));
    2715             : 
    2716      452712 :     res = _PyUnicode_LENGTH(unicode);
    2717             : #if SIZEOF_WCHAR_T == 2
    2718             :     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
    2719             :         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
    2720             :         const Py_UCS4 *end = s + res;
    2721             :         for (; s < end; ++s) {
    2722             :             if (*s > 0xFFFF) {
    2723             :                 ++res;
    2724             :             }
    2725             :         }
    2726             :     }
    2727             : #endif
    2728      452712 :     return res;
    2729             : }
    2730             : 
    2731             : static void
    2732      442696 : unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
    2733             : {
    2734      442696 :     assert(unicode != NULL);
    2735      442696 :     assert(_PyUnicode_CHECK(unicode));
    2736             : 
    2737      442696 :     if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
    2738          38 :         memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
    2739          38 :         return;
    2740             :     }
    2741             : 
    2742      442658 :     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
    2743      436248 :         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
    2744    20913200 :         for (; size--; ++s, ++w) {
    2745    20477000 :             *w = *s;
    2746             :         }
    2747             :     }
    2748             :     else {
    2749             : #if SIZEOF_WCHAR_T == 4
    2750        6410 :         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
    2751        6410 :         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
    2752       24128 :         for (; size--; ++s, ++w) {
    2753       17718 :             *w = *s;
    2754             :         }
    2755             : #else
    2756             :         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    2757             :         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
    2758             :         for (; size--; ++s, ++w) {
    2759             :             Py_UCS4 ch = *s;
    2760             :             if (ch > 0xFFFF) {
    2761             :                 assert(ch <= MAX_UNICODE);
    2762             :                 /* encode surrogate pair in this case */
    2763             :                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
    2764             :                 if (!size--)
    2765             :                     break;
    2766             :                 *w = Py_UNICODE_LOW_SURROGATE(ch);
    2767             :             }
    2768             :             else {
    2769             :                 *w = ch;
    2770             :             }
    2771             :         }
    2772             : #endif
    2773             :     }
    2774             : }
    2775             : 
    2776             : #ifdef HAVE_WCHAR_H
    2777             : 
    2778             : /* Convert a Unicode object to a wide character string.
    2779             : 
    2780             :    - If w is NULL: return the number of wide characters (including the null
    2781             :      character) required to convert the unicode object. Ignore size argument.
    2782             : 
    2783             :    - Otherwise: return the number of wide characters (excluding the null
    2784             :      character) written into w. Write at most size wide characters (including
    2785             :      the null character). */
    2786             : Py_ssize_t
    2787       20067 : PyUnicode_AsWideChar(PyObject *unicode,
    2788             :                      wchar_t *w,
    2789             :                      Py_ssize_t size)
    2790             : {
    2791             :     Py_ssize_t res;
    2792             : 
    2793       20067 :     if (unicode == NULL) {
    2794           0 :         PyErr_BadInternalCall();
    2795           0 :         return -1;
    2796             :     }
    2797       20067 :     if (!PyUnicode_Check(unicode)) {
    2798           0 :         PyErr_BadArgument();
    2799           0 :         return -1;
    2800             :     }
    2801             : 
    2802       20067 :     res = unicode_get_widechar_size(unicode);
    2803       20067 :     if (w == NULL) {
    2804       10016 :         return res + 1;
    2805             :     }
    2806             : 
    2807       10051 :     if (size > res) {
    2808          50 :         size = res + 1;
    2809             :     }
    2810             :     else {
    2811       10001 :         res = size;
    2812             :     }
    2813       10051 :     unicode_copy_as_widechar(unicode, w, size);
    2814             : 
    2815             : #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    2816             :     /* Oracle Solaris uses non-Unicode internal wchar_t form for
    2817             :        non-Unicode locales and hence needs conversion first. */
    2818             :     if (_Py_LocaleUsesNonUnicodeWchar()) {
    2819             :         if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
    2820             :             return -1;
    2821             :         }
    2822             :     }
    2823             : #endif
    2824             : 
    2825       10051 :     return res;
    2826             : }
    2827             : 
    2828             : wchar_t*
    2829      432645 : PyUnicode_AsWideCharString(PyObject *unicode,
    2830             :                            Py_ssize_t *size)
    2831             : {
    2832             :     wchar_t *buffer;
    2833             :     Py_ssize_t buflen;
    2834             : 
    2835      432645 :     if (unicode == NULL) {
    2836           0 :         PyErr_BadInternalCall();
    2837           0 :         return NULL;
    2838             :     }
    2839      432645 :     if (!PyUnicode_Check(unicode)) {
    2840           0 :         PyErr_BadArgument();
    2841           0 :         return NULL;
    2842             :     }
    2843             : 
    2844      432645 :     buflen = unicode_get_widechar_size(unicode);
    2845      432645 :     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
    2846      432645 :     if (buffer == NULL) {
    2847           0 :         PyErr_NoMemory();
    2848           0 :         return NULL;
    2849             :     }
    2850      432645 :     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
    2851             : 
    2852             : #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    2853             :     /* Oracle Solaris uses non-Unicode internal wchar_t form for
    2854             :        non-Unicode locales and hence needs conversion first. */
    2855             :     if (_Py_LocaleUsesNonUnicodeWchar()) {
    2856             :         if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
    2857             :             return NULL;
    2858             :         }
    2859             :     }
    2860             : #endif
    2861             : 
    2862      432645 :     if (size != NULL) {
    2863      298588 :         *size = buflen;
    2864             :     }
    2865      134057 :     else if (wcslen(buffer) != (size_t)buflen) {
    2866           5 :         PyMem_Free(buffer);
    2867           5 :         PyErr_SetString(PyExc_ValueError,
    2868             :                         "embedded null character");
    2869           5 :         return NULL;
    2870             :     }
    2871      432640 :     return buffer;
    2872             : }
    2873             : 
    2874             : #endif /* HAVE_WCHAR_H */
    2875             : 
    2876             : int
    2877           0 : _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
    2878             : {
    2879           0 :     wchar_t **p = (wchar_t **)ptr;
    2880           0 :     if (obj == NULL) {
    2881           0 :         PyMem_Free(*p);
    2882           0 :         *p = NULL;
    2883           0 :         return 1;
    2884             :     }
    2885           0 :     if (PyUnicode_Check(obj)) {
    2886           0 :         *p = PyUnicode_AsWideCharString(obj, NULL);
    2887           0 :         if (*p == NULL) {
    2888           0 :             return 0;
    2889             :         }
    2890           0 :         return Py_CLEANUP_SUPPORTED;
    2891             :     }
    2892           0 :     PyErr_Format(PyExc_TypeError,
    2893             :                  "argument must be str, not %.50s",
    2894           0 :                  Py_TYPE(obj)->tp_name);
    2895           0 :     return 0;
    2896             : }
    2897             : 
    2898             : int
    2899           0 : _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
    2900             : {
    2901           0 :     wchar_t **p = (wchar_t **)ptr;
    2902           0 :     if (obj == NULL) {
    2903           0 :         PyMem_Free(*p);
    2904           0 :         *p = NULL;
    2905           0 :         return 1;
    2906             :     }
    2907           0 :     if (obj == Py_None) {
    2908           0 :         *p = NULL;
    2909           0 :         return 1;
    2910             :     }
    2911           0 :     if (PyUnicode_Check(obj)) {
    2912           0 :         *p = PyUnicode_AsWideCharString(obj, NULL);
    2913           0 :         if (*p == NULL) {
    2914           0 :             return 0;
    2915             :         }
    2916           0 :         return Py_CLEANUP_SUPPORTED;
    2917             :     }
    2918           0 :     PyErr_Format(PyExc_TypeError,
    2919             :                  "argument must be str or None, not %.50s",
    2920           0 :                  Py_TYPE(obj)->tp_name);
    2921           0 :     return 0;
    2922             : }
    2923             : 
    2924             : PyObject *
    2925    12696800 : PyUnicode_FromOrdinal(int ordinal)
    2926             : {
    2927    12696800 :     if (ordinal < 0 || ordinal > MAX_UNICODE) {
    2928           6 :         PyErr_SetString(PyExc_ValueError,
    2929             :                         "chr() arg not in range(0x110000)");
    2930           6 :         return NULL;
    2931             :     }
    2932             : 
    2933    12696800 :     return unicode_char((Py_UCS4)ordinal);
    2934             : }
    2935             : 
    2936             : PyObject *
    2937     1225570 : PyUnicode_FromObject(PyObject *obj)
    2938             : {
    2939             :     /* XXX Perhaps we should make this API an alias of
    2940             :        PyObject_Str() instead ?! */
    2941     1225570 :     if (PyUnicode_CheckExact(obj)) {
    2942     1225560 :         Py_INCREF(obj);
    2943     1225560 :         return obj;
    2944             :     }
    2945           8 :     if (PyUnicode_Check(obj)) {
    2946             :         /* For a Unicode subtype that's not a Unicode object,
    2947             :            return a true Unicode object with the same data. */
    2948           8 :         return _PyUnicode_Copy(obj);
    2949             :     }
    2950           0 :     PyErr_Format(PyExc_TypeError,
    2951             :                  "Can't convert '%.100s' object to str implicitly",
    2952           0 :                  Py_TYPE(obj)->tp_name);
    2953           0 :     return NULL;
    2954             : }
    2955             : 
    2956             : PyObject *
    2957     8357010 : PyUnicode_FromEncodedObject(PyObject *obj,
    2958             :                             const char *encoding,
    2959             :                             const char *errors)
    2960             : {
    2961             :     Py_buffer buffer;
    2962             :     PyObject *v;
    2963             : 
    2964     8357010 :     if (obj == NULL) {
    2965           0 :         PyErr_BadInternalCall();
    2966           0 :         return NULL;
    2967             :     }
    2968             : 
    2969             :     /* Decoding bytes objects is the most common case and should be fast */
    2970     8357010 :     if (PyBytes_Check(obj)) {
    2971     7804760 :         if (PyBytes_GET_SIZE(obj) == 0) {
    2972       60827 :             if (unicode_check_encoding_errors(encoding, errors) < 0) {
    2973           6 :                 return NULL;
    2974             :             }
    2975       60821 :             _Py_RETURN_UNICODE_EMPTY();
    2976             :         }
    2977    15487900 :         return PyUnicode_Decode(
    2978     7743930 :                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
    2979             :                 encoding, errors);
    2980             :     }
    2981             : 
    2982      552250 :     if (PyUnicode_Check(obj)) {
    2983           1 :         PyErr_SetString(PyExc_TypeError,
    2984             :                         "decoding str is not supported");
    2985           1 :         return NULL;
    2986             :     }
    2987             : 
    2988             :     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    2989      552249 :     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
    2990           0 :         PyErr_Format(PyExc_TypeError,
    2991             :                      "decoding to str: need a bytes-like object, %.80s found",
    2992           0 :                      Py_TYPE(obj)->tp_name);
    2993           0 :         return NULL;
    2994             :     }
    2995             : 
    2996      552249 :     if (buffer.len == 0) {
    2997        2077 :         PyBuffer_Release(&buffer);
    2998        2077 :         if (unicode_check_encoding_errors(encoding, errors) < 0) {
    2999           1 :             return NULL;
    3000             :         }
    3001        2076 :         _Py_RETURN_UNICODE_EMPTY();
    3002             :     }
    3003             : 
    3004      550172 :     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
    3005      550172 :     PyBuffer_Release(&buffer);
    3006      550172 :     return v;
    3007             : }
    3008             : 
    3009             : /* Normalize an encoding name: similar to encodings.normalize_encoding(), but
    3010             :    also convert to lowercase. Return 1 on success, or 0 on error (encoding is
    3011             :    longer than lower_len-1). */
    3012             : int
    3013    21681400 : _Py_normalize_encoding(const char *encoding,
    3014             :                        char *lower,
    3015             :                        size_t lower_len)
    3016             : {
    3017             :     const char *e;
    3018             :     char *l;
    3019             :     char *l_end;
    3020             :     int punct;
    3021             : 
    3022    21681400 :     assert(encoding != NULL);
    3023             : 
    3024    21681400 :     e = encoding;
    3025    21681400 :     l = lower;
    3026    21681400 :     l_end = &lower[lower_len - 1];
    3027    21681400 :     punct = 0;
    3028   149236000 :     while (1) {
    3029   170917000 :         char c = *e;
    3030   170917000 :         if (c == 0) {
    3031    20536000 :             break;
    3032             :         }
    3033             : 
    3034   150381000 :         if (Py_ISALNUM(c) || c == '.') {
    3035   128518000 :             if (punct && l != lower) {
    3036    21862600 :                 if (l == l_end) {
    3037          51 :                     return 0;
    3038             :                 }
    3039    21862500 :                 *l++ = '_';
    3040             :             }
    3041   128518000 :             punct = 0;
    3042             : 
    3043   128518000 :             if (l == l_end) {
    3044     1145280 :                 return 0;
    3045             :             }
    3046   127373000 :             *l++ = Py_TOLOWER(c);
    3047             :         }
    3048             :         else {
    3049    21862700 :             punct = 1;
    3050             :         }
    3051             : 
    3052   149236000 :         e++;
    3053             :     }
    3054    20536000 :     *l = '\0';
    3055    20536000 :     return 1;
    3056             : }
    3057             : 
    3058             : PyObject * /* Decode the `size` bytes at `s` using codec `encoding` with error mode `errors`; returns the decoded str, or NULL on failure. */
    3059     8301320 : PyUnicode_Decode(const char *s,
    3060             :                  Py_ssize_t size,
    3061             :                  const char *encoding,
    3062             :                  const char *errors)
    3063             : {
    3064     8301320 :     PyObject *buffer = NULL, *unicode;
    3065             :     Py_buffer info;
    3066             :     char buflower[11];   /* holds "iso_8859_1" plus its NUL terminator, the longest shortcut name */
    3067             : 
    3068     8301320 :     if (unicode_check_encoding_errors(encoding, errors) < 0) {
    3069          31 :         return NULL;
    3070             :     }
    3071             : 
    3072     8301280 :     if (size == 0) { /* empty input needs no codec at all */
    3073           4 :         _Py_RETURN_UNICODE_EMPTY();
    3074             :     }
    3075             : 
    3076     8301280 :     if (encoding == NULL) { /* no encoding given: decode as UTF-8 */
    3077        5037 :         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    3078             :     }
    3079             : 
    3080             :     /* Shortcuts for common default encodings; a name that fails to normalize into buflower skips them and goes to the codec registry below */
    3081     8296240 :     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
    3082     8236680 :         char *lower = buflower;
    3083             : 
    3084             :         /* Fast paths */
    3085     8236680 :         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
    3086     2456000 :             lower += 3;
    3087     2456000 :             if (*lower == '_') {
    3088             :                 /* Match "utf8" and "utf_8" */
    3089     2455240 :                 lower++;
    3090             :             }
    3091             : 
    3092     2456000 :             if (lower[0] == '8' && lower[1] == 0) {
    3093     2451480 :                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    3094             :             }
    3095        4519 :             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
    3096        1063 :                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
    3097             :             }
    3098        3456 :             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
    3099          33 :                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
    3100             :             }
    3101             :         } /* any other "utf..." name falls through to the codec registry */
    3102             :         else {
    3103     5780680 :             if (strcmp(lower, "ascii") == 0
    3104     5081370 :                 || strcmp(lower, "us_ascii") == 0) {
    3105      699455 :                 return PyUnicode_DecodeASCII(s, size, errors);
    3106             :             }
    3107             :     #ifdef MS_WINDOWS
    3108             :             else if (strcmp(lower, "mbcs") == 0) {
    3109             :                 return PyUnicode_DecodeMBCS(s, size, errors);
    3110             :             }
    3111             :     #endif
    3112     5081230 :             else if (strcmp(lower, "latin1") == 0
    3113     5070580 :                      || strcmp(lower, "latin_1") == 0
    3114      205853 :                      || strcmp(lower, "iso_8859_1") == 0
    3115      203816 :                      || strcmp(lower, "iso8859_1") == 0) {
    3116     4898930 :                 return PyUnicode_DecodeLatin1(s, size, errors);
    3117             :             }
    3118             :         }
    3119             :     }
    3120             : 
    3121             :     /* Decode via the codec registry */
    3122      245275 :     buffer = NULL;
    3123      245275 :     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
    3124           0 :         goto onError;
    3125      245275 :     buffer = PyMemoryView_FromBuffer(&info); /* hand s[0..size) to the codec as a memoryview object */
    3126      245275 :     if (buffer == NULL)
    3127           0 :         goto onError;
    3128      245275 :     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
    3129      245275 :     if (unicode == NULL)
    3130          74 :         goto onError;
    3131      245201 :     if (!PyUnicode_Check(unicode)) { /* a codec that returns a non-str object is rejected with TypeError */
    3132           2 :         PyErr_Format(PyExc_TypeError,
    3133             :                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
    3134             :                      "use codecs.decode() to decode to arbitrary types",
    3135             :                      encoding,
    3136           2 :                      Py_TYPE(unicode)->tp_name);
    3137           2 :         Py_DECREF(unicode);
    3138           2 :         goto onError;
    3139             :     }
    3140      245199 :     Py_DECREF(buffer);
    3141      245199 :     return unicode_result(unicode);
    3142             : 
    3143          76 :   onError:
    3144          76 :     Py_XDECREF(buffer); /* buffer may still be NULL when we get here */
    3145          76 :     return NULL;
    3146             : }
    3147             : 
    3148             : PyObject * /* Deprecated: decode the str `unicode` through the codec registry after emitting a DeprecationWarning; returns whatever PyCodec_Decode() returns, or NULL on failure. */
    3149           0 : PyUnicode_AsDecodedObject(PyObject *unicode,
    3150             :                           const char *encoding,
    3151             :                           const char *errors)
    3152             : {
    3153           0 :     if (!PyUnicode_Check(unicode)) {
    3154           0 :         PyErr_BadArgument();
    3155           0 :         return NULL;
    3156             :     }
    3157             : 
    3158           0 :     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    3159             :                      "PyUnicode_AsDecodedObject() is deprecated; "
    3160             :                      "use PyCodec_Decode() to decode from str", 1) < 0)
    3161           0 :         return NULL; /* give up if issuing the warning failed */
    3162             : 
    3163           0 :     if (encoding == NULL)
    3164           0 :         encoding = PyUnicode_GetDefaultEncoding(); /* NULL selects the default encoding */
    3165             : 
    3166             :     /* Decode via the codec registry */
    3167           0 :     return PyCodec_Decode(unicode, encoding, errors);
    3168             : }
    3169             : 
    3170             : PyObject * /* Deprecated: decode the str `unicode` through the codec registry and require a str result; returns that str, or NULL on failure. */
    3171           0 : PyUnicode_AsDecodedUnicode(PyObject *unicode,
    3172             :                            const char *encoding,
    3173             :                            const char *errors)
    3174             : {
    3175             :     PyObject *v;
    3176             : 
    3177           0 :     if (!PyUnicode_Check(unicode)) {
    3178           0 :         PyErr_BadArgument();
    3179           0 :         goto onError;
    3180             :     }
    3181             : 
    3182           0 :     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    3183             :                      "PyUnicode_AsDecodedUnicode() is deprecated; "
    3184             :                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
    3185           0 :         return NULL; /* give up if issuing the warning failed */
    3186             : 
    3187           0 :     if (encoding == NULL)
    3188           0 :         encoding = PyUnicode_GetDefaultEncoding(); /* NULL selects the default encoding */
    3189             : 
    3190             :     /* Decode via the codec registry */
    3191           0 :     v = PyCodec_Decode(unicode, encoding, errors);
    3192           0 :     if (v == NULL)
    3193           0 :         goto onError;
    3194           0 :     if (!PyUnicode_Check(v)) { /* the codec produced something other than str */
    3195           0 :         PyErr_Format(PyExc_TypeError,
    3196             :                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
    3197             :                      "use codecs.decode() to decode to arbitrary types",
    3198             :                      encoding,
    3199           0 :                      Py_TYPE(v)->tp_name); /* report the decoder's result type; the input `unicode` is always a str here */
    3200           0 :         Py_DECREF(v);
    3201           0 :         goto onError;
    3202             :     }
    3203           0 :     return unicode_result(v);
    3204             : 
    3205           0 :   onError:
    3206           0 :     return NULL;
    3207             : }
    3208             : 
    3209             : PyObject * /* Deprecated: encode the str `unicode` through the codec registry after emitting a DeprecationWarning; returns whatever PyCodec_Encode() returns, or NULL on failure. */
    3210           0 : PyUnicode_AsEncodedObject(PyObject *unicode,
    3211             :                           const char *encoding,
    3212             :                           const char *errors)
    3213             : {
    3214             :     PyObject *v;
    3215             : 
    3216           0 :     if (!PyUnicode_Check(unicode)) {
    3217           0 :         PyErr_BadArgument();
    3218           0 :         goto onError;
    3219             :     }
    3220             : 
    3221           0 :     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    3222             :                      "PyUnicode_AsEncodedObject() is deprecated; "
    3223             :                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
    3224             :                      "or PyCodec_Encode() for generic encoding", 1) < 0)
    3225           0 :         return NULL; /* give up if issuing the warning failed */
    3226             : 
    3227           0 :     if (encoding == NULL)
    3228           0 :         encoding = PyUnicode_GetDefaultEncoding(); /* NULL selects the default encoding */
    3229             : 
    3230             :     /* Encode via the codec registry */
    3231           0 :     v = PyCodec_Encode(unicode, encoding, errors);
    3232           0 :     if (v == NULL)
    3233           0 :         goto onError;
    3234           0 :     return v; /* no type check here: the result is passed through as is */
    3235             : 
    3236           0 :   onError:
    3237           0 :     return NULL;
    3238             : }
    3239             : 
    3240             : 
    3241             : static PyObject * /* Encode `unicode` to a bytes object via _Py_EncodeLocaleEx(), forwarding `error_handler` and `current_locale`; returns NULL on failure. */
    3242       92845 : unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
    3243             :                       int current_locale)
    3244             : {
    3245             :     Py_ssize_t wlen;
    3246       92845 :     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    3247       92845 :     if (wstr == NULL) {
    3248           0 :         return NULL;
    3249             :     }
    3250             : 
    3251       92845 :     if ((size_t)wlen != wcslen(wstr)) { /* wcslen() stops at the first null, so a mismatch means an embedded null */
    3252           0 :         PyErr_SetString(PyExc_ValueError, "embedded null character");
    3253           0 :         PyMem_Free(wstr);
    3254           0 :         return NULL;
    3255             :     }
    3256             : 
    3257             :     char *str;
    3258             :     size_t error_pos;
    3259             :     const char *reason;
    3260       92845 :     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
    3261             :                                  current_locale, error_handler);
    3262       92845 :     PyMem_Free(wstr); /* the wide copy is not needed past this point */
    3263             : 
    3264       92845 :     if (res != 0) {
    3265           0 :         if (res == -2) { /* -2: report a UnicodeEncodeError spanning [error_pos, error_pos+1) */
    3266             :             PyObject *exc;
    3267           0 :             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
    3268             :                     "locale", unicode,
    3269             :                     (Py_ssize_t)error_pos,
    3270           0 :                     (Py_ssize_t)(error_pos+1),
    3271             :                     reason);
    3272           0 :             if (exc != NULL) {
    3273           0 :                 PyCodec_StrictErrors(exc); /* presumably raises exc as the current exception -- confirm against the codecs API */
    3274           0 :                 Py_DECREF(exc);
    3275             :             }
    3276             :         }
    3277           0 :         else if (res == -3) {
    3278           0 :             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
    3279             :         }
    3280             :         else { /* every other non-zero result is reported as out of memory */
    3281           0 :             PyErr_NoMemory();
    3282             :         }
    3283           0 :         return NULL;
    3284             :     }
    3285             : 
    3286       92845 :     PyObject *bytes = PyBytes_FromString(str);
    3287       92845 :     PyMem_RawFree(str); /* str was only an intermediate; bytes now holds the result (or is NULL) */
    3288       92845 :     return bytes;
    3289             : }
    3290             : 
    3291             : PyObject * /* Encode `unicode` with unicode_encode_locale(), mapping the `errors` name to a handler and passing current_locale=1. */
    3292        1392 : PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
    3293             : {
    3294        1392 :     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    3295        1392 :     return unicode_encode_locale(unicode, error_handler, 1);
    3296             : }
    3297             : 
    3298             : PyObject * /* Encode `unicode` with the interpreter's filesystem codec settings (interp->unicode.fs_codec); returns NULL on failure. */
    3299     2114420 : PyUnicode_EncodeFSDefault(PyObject *unicode)
    3300             : {
    3301     2114420 :     PyInterpreterState *interp = _PyInterpreterState_GET();
    3302     2114420 :     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
    3303     2114420 :     if (fs_codec->utf8) { /* UTF-8 flag set: use the UTF-8 encoder directly */
    3304     2017660 :         return unicode_encode_utf8(unicode,
    3305             :                                    fs_codec->error_handler,
    3306     2017660 :                                    fs_codec->errors);
    3307             :     }
    3308             : #ifndef _Py_FORCE_UTF8_FS_ENCODING
    3309       96758 :     else if (fs_codec->encoding) { /* a named encoding is configured: go through the generic encoder */
    3310        5305 :         return PyUnicode_AsEncodedString(unicode,
    3311        5305 :                                          fs_codec->encoding,
    3312        5305 :                                          fs_codec->errors);
    3313             :     }
    3314             : #endif
    3315             :     else {
    3316             :         /* Before _PyUnicode_InitEncodings() is called, the Python codec
    3317             :            machinery is not ready and so cannot be used:
    3318             :            use wcstombs() in this case. */
    3319       91453 :         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    3320       91453 :         const wchar_t *filesystem_errors = config->filesystem_errors;
    3321       91453 :         assert(filesystem_errors != NULL);
    3322       91453 :         _Py_error_handler errors = get_error_handler_wide(filesystem_errors); /* the handler comes from the config, not from fs_codec */
    3323       91453 :         assert(errors != _Py_ERROR_UNKNOWN);
    3324             : #ifdef _Py_FORCE_UTF8_FS_ENCODING
    3325             :         return unicode_encode_utf8(unicode, errors, NULL);
    3326             : #else
    3327       91453 :         return unicode_encode_locale(unicode, errors, 0);
    3328             : #endif
    3329             :     }
    3330             : }
    3331             : 
    3332             : PyObject * /* Encode the str `unicode` with codec `encoding` and error mode `errors`; returns a bytes object, or NULL on failure. */
    3333     5431160 : PyUnicode_AsEncodedString(PyObject *unicode,
    3334             :                           const char *encoding,
    3335             :                           const char *errors)
    3336             : {
    3337             :     PyObject *v;
    3338             :     char buflower[11];   /* holds "iso_8859_1" plus its NUL terminator, the longest shortcut name */
    3339             : 
    3340     5431160 :     if (!PyUnicode_Check(unicode)) {
    3341           0 :         PyErr_BadArgument();
    3342           0 :         return NULL;
    3343             :     }
    3344             : 
    3345     5431160 :     if (unicode_check_encoding_errors(encoding, errors) < 0) {
    3346          32 :         return NULL;
    3347             :     }
    3348             : 
    3349     5431130 :     if (encoding == NULL) { /* no encoding given: encode as UTF-8 */
    3350       93534 :         return _PyUnicode_AsUTF8String(unicode, errors);
    3351             :     }
    3352             : 
    3353             :     /* Shortcuts for common default encodings; a name that fails to normalize into buflower skips them and goes to the codec registry below */
    3354     5337600 :     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
    3355     4251830 :         char *lower = buflower;
    3356             : 
    3357             :         /* Fast paths */
    3358     4251830 :         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
    3359     2654310 :             lower += 3;
    3360     2654310 :             if (*lower == '_') {
    3361             :                 /* Match "utf8" and "utf_8" */
    3362     2596250 :                 lower++;
    3363             :             }
    3364             : 
    3365     2654310 :             if (lower[0] == '8' && lower[1] == 0) {
    3366     2644200 :                 return _PyUnicode_AsUTF8String(unicode, errors);
    3367             :             }
    3368       10115 :             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
    3369        1816 :                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
    3370             :             }
    3371        8299 :             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
    3372         767 :                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
    3373             :             }
    3374             :         } /* any other "utf..." name falls through to the codec registry */
    3375             :         else {
    3376     1597520 :             if (strcmp(lower, "ascii") == 0
    3377      187696 :                 || strcmp(lower, "us_ascii") == 0) {
    3378     1416840 :                 return _PyUnicode_AsASCIIString(unicode, errors);
    3379             :             }
    3380             : #ifdef MS_WINDOWS
    3381             :             else if (strcmp(lower, "mbcs") == 0) {
    3382             :                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
    3383             :             }
    3384             : #endif
    3385      180672 :             else if (strcmp(lower, "latin1") == 0 ||
    3386      179772 :                      strcmp(lower, "latin_1") == 0 ||
    3387      168798 :                      strcmp(lower, "iso_8859_1") == 0 ||
    3388      166648 :                      strcmp(lower, "iso8859_1") == 0) {
    3389       14070 :                 return _PyUnicode_AsLatin1String(unicode, errors);
    3390             :             }
    3391             :         }
    3392             :     }
    3393             : 
    3394             :     /* Encode via the codec registry */
    3395     1259900 :     v = _PyCodec_EncodeText(unicode, encoding, errors);
    3396     1259900 :     if (v == NULL)
    3397          55 :         return NULL;
    3398             : 
    3399             :     /* The normal path */
    3400     1259850 :     if (PyBytes_Check(v))
    3401     1259840 :         return v;
    3402             : 
    3403             :     /* If the codec returns a buffer, raise a warning and convert to bytes */
    3404           2 :     if (PyByteArray_Check(v)) {
    3405             :         int error;
    3406             :         PyObject *b;
    3407             : 
    3408           0 :         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
    3409             :             "encoder %s returned bytearray instead of bytes; "
    3410             :             "use codecs.encode() to encode to arbitrary types",
    3411             :             encoding);
    3412           0 :         if (error) { /* the warning call failed: drop the result and propagate */
    3413           0 :             Py_DECREF(v);
    3414           0 :             return NULL;
    3415             :         }
    3416             : 
    3417           0 :         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
    3418             :                                       PyByteArray_GET_SIZE(v));
    3419           0 :         Py_DECREF(v); /* b is a separate bytes object (or NULL); the bytearray is no longer needed */
    3420           0 :         return b;
    3421             :     }
    3422             : 
    3423           2 :     PyErr_Format(PyExc_TypeError, /* neither bytes nor bytearray: reject the codec's result */
    3424             :                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
    3425             :                  "use codecs.encode() to encode to arbitrary types",
    3426             :                  encoding,
    3427           2 :                  Py_TYPE(v)->tp_name);
    3428           2 :     Py_DECREF(v);
    3429           2 :     return NULL;
    3430             : }
    3431             : 
    3432             : PyObject * /* Deprecated: encode the str `unicode` through the codec registry and require a str result; returns that str, or NULL on failure. */
    3433           0 : PyUnicode_AsEncodedUnicode(PyObject *unicode,
    3434             :                            const char *encoding,
    3435             :                            const char *errors)
    3436             : {
    3437             :     PyObject *v;
    3438             : 
    3439           0 :     if (!PyUnicode_Check(unicode)) {
    3440           0 :         PyErr_BadArgument();
    3441           0 :         goto onError;
    3442             :     }
    3443             : 
    3444           0 :     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    3445             :                      "PyUnicode_AsEncodedUnicode() is deprecated; "
    3446             :                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
    3447           0 :         return NULL; /* give up if issuing the warning failed */
    3448             : 
    3449           0 :     if (encoding == NULL)
    3450           0 :         encoding = PyUnicode_GetDefaultEncoding(); /* NULL selects the default encoding */
    3451             : 
    3452             :     /* Encode via the codec registry */
    3453           0 :     v = PyCodec_Encode(unicode, encoding, errors);
    3454           0 :     if (v == NULL)
    3455           0 :         goto onError;
    3456           0 :     if (!PyUnicode_Check(v)) { /* the codec produced something other than str */
    3457           0 :         PyErr_Format(PyExc_TypeError,
    3458             :                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
    3459             :                      "use codecs.encode() to encode to arbitrary types",
    3460             :                      encoding,
    3461           0 :                      Py_TYPE(v)->tp_name);
    3462           0 :         Py_DECREF(v);
    3463           0 :         goto onError;
    3464             :     }
    3465           0 :     return v;
    3466             : 
    3467           0 :   onError:
    3468           0 :     return NULL;
    3469             : }
    3470             : 
    3471             : static PyObject* /* Decode the `len` bytes at `str` to a str object via _Py_DecodeLocaleEx(), forwarding `errors` and `current_locale`; returns NULL on failure. */
    3472     2420670 : unicode_decode_locale(const char *str, Py_ssize_t len,
    3473             :                       _Py_error_handler errors, int current_locale)
    3474             : {
    3475     2420670 :     if (str[len] != '\0' || (size_t)len != strlen(str))  { /* input must be NUL-terminated at len and contain no earlier NUL */
    3476           0 :         PyErr_SetString(PyExc_ValueError, "embedded null byte");
    3477           0 :         return NULL;
    3478             :     }
    3479             : 
    3480             :     wchar_t *wstr;
    3481             :     size_t wlen;
    3482             :     const char *reason;
    3483     2420670 :     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
    3484             :                                  current_locale, errors);
    3485     2420670 :     if (res != 0) {
    3486           0 :         if (res == -2) { /* -2: wlen is used as the error position for a UnicodeDecodeError spanning [wlen, wlen+1) */
    3487             :             PyObject *exc;
    3488           0 :             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
    3489             :                                         "locale", str, len,
    3490             :                                         (Py_ssize_t)wlen,
    3491           0 :                                         (Py_ssize_t)(wlen + 1),
    3492             :                                         reason);
    3493           0 :             if (exc != NULL) {
    3494           0 :                 PyCodec_StrictErrors(exc); /* presumably raises exc as the current exception -- confirm against the codecs API */
    3495           0 :                 Py_DECREF(exc);
    3496             :             }
    3497             :         }
    3498           0 :         else if (res == -3) {
    3499           0 :             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
    3500             :         }
    3501             :         else { /* every other non-zero result is reported as out of memory */
    3502           0 :             PyErr_NoMemory();
    3503             :         }
    3504           0 :         return NULL;
    3505             :     }
    3506             : 
    3507     2420670 :     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
    3508     2420670 :     PyMem_RawFree(wstr); /* wstr was only an intermediate; unicode now holds the result (or is NULL) */
    3509     2420670 :     return unicode;
    3510             : }
    3511             : 
    3512             : PyObject* /* Decode `len` bytes at `str` with unicode_decode_locale(), mapping the `errors` name to a handler and passing current_locale=1. */
    3513           0 : PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
    3514             :                               const char *errors)
    3515             : {
    3516           0 :     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    3517           0 :     return unicode_decode_locale(str, len, error_handler, 1);
    3518             : }
    3519             : 
    3520             : PyObject* /* Same as PyUnicode_DecodeLocaleAndSize(), but the length is taken from strlen(str), so `str` must be NUL-terminated. */
    3521     1396150 : PyUnicode_DecodeLocale(const char *str, const char *errors)
    3522             : {
    3523     1396150 :     Py_ssize_t size = (Py_ssize_t)strlen(str);
    3524     1396150 :     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    3525     1396150 :     return unicode_decode_locale(str, size, error_handler, 1);
    3526             : }
    3527             : 
    3528             : 
    3529             : PyObject* /* Decode the NUL-terminated string `s` by forwarding to PyUnicode_DecodeFSDefaultAndSize() with strlen(s) as the size. */
    3530      100002 : PyUnicode_DecodeFSDefault(const char *s) {
    3531      100002 :     Py_ssize_t size = (Py_ssize_t)strlen(s);
    3532      100002 :     return PyUnicode_DecodeFSDefaultAndSize(s, size);
    3533             : }
    3534             : 
    3535             : PyObject* /* Decode `size` bytes at `s` with the interpreter's filesystem codec settings (interp->unicode.fs_codec); returns NULL on failure. */
    3536     2441610 : PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
    3537             : {
    3538     2441610 :     PyInterpreterState *interp = _PyInterpreterState_GET();
    3539     2441610 :     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
    3540     2441610 :     if (fs_codec->utf8) { /* UTF-8 flag set: use the UTF-8 decoder directly */
    3541     1410660 :         return unicode_decode_utf8(s, size,
    3542             :                                    fs_codec->error_handler,
    3543     1410660 :                                    fs_codec->errors,
    3544             :                                    NULL);
    3545             :     }
    3546             : #ifndef _Py_FORCE_UTF8_FS_ENCODING
    3547     1030950 :     else if (fs_codec->encoding) { /* a named encoding is configured: go through the generic decoder */
    3548        6422 :         return PyUnicode_Decode(s, size,
    3549        6422 :                                 fs_codec->encoding,
    3550        6422 :                                 fs_codec->errors);
    3551             :     }
    3552             : #endif
    3553             :     else {
    3554             :         /* Before _PyUnicode_InitEncodings() is called, the Python codec
    3555             :            machinery is not ready and so cannot be used:
    3556             :            use mbstowcs() in this case. */
    3557     1024530 :         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    3558     1024530 :         const wchar_t *filesystem_errors = config->filesystem_errors;
    3559     1024530 :         assert(filesystem_errors != NULL);
    3560     1024530 :         _Py_error_handler errors = get_error_handler_wide(filesystem_errors); /* the handler comes from the config, not from fs_codec */
    3561     1024530 :         assert(errors != _Py_ERROR_UNKNOWN);
    3562             : #ifdef _Py_FORCE_UTF8_FS_ENCODING
    3563             :         return unicode_decode_utf8(s, size, errors, NULL, NULL);
    3564             : #else
    3565     1024530 :         return unicode_decode_locale(s, size, errors, 0);
    3566             : #endif
    3567             :     }
    3568             : }
    3569             : 
    3570             : 
    3571             : int /* Converter: stores a bytes path for `arg` in *(PyObject**)addr. Returns Py_CLEANUP_SUPPORTED on success, 0 on failure, and 1 for the cleanup call (arg == NULL). */
    3572     2140050 : PyUnicode_FSConverter(PyObject* arg, void* addr)
    3573             : {
    3574     2140050 :     PyObject *path = NULL;
    3575     2140050 :     PyObject *output = NULL;
    3576             :     Py_ssize_t size;
    3577             :     const char *data;
    3578     2140050 :     if (arg == NULL) { /* cleanup call: release the object stored earlier and clear the slot */
    3579           1 :         Py_DECREF(*(PyObject**)addr);
    3580           1 :         *(PyObject**)addr = NULL;
    3581           1 :         return 1;
    3582             :     }
    3583     2140050 :     path = PyOS_FSPath(arg);
    3584     2140050 :     if (path == NULL) {
    3585          11 :         return 0;
    3586             :     }
    3587     2140040 :     if (PyBytes_Check(path)) {
    3588       56616 :         output = path; /* already bytes: the reference held in path moves to output */
    3589             :     }
    3590             :     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
    3591     2083430 :         output = PyUnicode_EncodeFSDefault(path);
    3592     2083430 :         Py_DECREF(path);
    3593     2083430 :         if (!output) {
    3594          53 :             return 0;
    3595             :         }
    3596     2083370 :         assert(PyBytes_Check(output));
    3597             :     }
    3598             : 
    3599     2139990 :     size = PyBytes_GET_SIZE(output);
    3600     2139990 :     data = PyBytes_AS_STRING(output);
    3601     2139990 :     if ((size_t)size != strlen(data)) { /* strlen() stops at the first NUL, so a mismatch means an embedded null byte */
    3602          70 :         PyErr_SetString(PyExc_ValueError, "embedded null byte");
    3603          70 :         Py_DECREF(output);
    3604          70 :         return 0;
    3605             :     }
    3606     2139920 :     *(PyObject**)addr = output; /* the reference is handed to the caller's slot */
    3607     2139920 :     return Py_CLEANUP_SUPPORTED;
    3608             : }
    3609             : 
    3610             : 
    3611             : int /* Converter: stores a str path for `arg` in *(PyObject**)addr. Returns Py_CLEANUP_SUPPORTED on success, 0 on failure, and 1 for the cleanup call (arg == NULL). */
    3612       31053 : PyUnicode_FSDecoder(PyObject* arg, void* addr)
    3613             : {
    3614       31053 :     int is_buffer = 0;
    3615       31053 :     PyObject *path = NULL;
    3616       31053 :     PyObject *output = NULL;
    3617       31053 :     if (arg == NULL) { /* cleanup call: release the object stored earlier and clear the slot */
    3618           0 :         Py_DECREF(*(PyObject**)addr);
    3619           0 :         *(PyObject**)addr = NULL;
    3620           0 :         return 1;
    3621             :     }
    3622             : 
    3623       31053 :     is_buffer = PyObject_CheckBuffer(arg);
    3624       31053 :     if (!is_buffer) {
    3625       31047 :         path = PyOS_FSPath(arg);
    3626       31047 :         if (path == NULL) {
    3627           2 :             return 0;
    3628             :         }
    3629             :     }
    3630             :     else { /* buffer-supporting objects bypass PyOS_FSPath() and are used directly */
    3631           6 :         path = arg;
    3632           6 :         Py_INCREF(arg); /* take our own reference so path can be released uniformly below */
    3633             :     }
    3634             : 
    3635       31051 :     if (PyUnicode_Check(path)) {
    3636       31045 :         output = path; /* already str: the reference held in path moves to output */
    3637             :     }
    3638          12 :     else if (PyBytes_Check(path) || is_buffer) {
    3639           6 :         PyObject *path_bytes = NULL;
    3640             : 
    3641          10 :         if (!PyBytes_Check(path) && /* non-bytes buffer objects are accepted, but with a DeprecationWarning */
    3642           4 :             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
    3643             :             "path should be string, bytes, or os.PathLike, not %.200s",
    3644           4 :             Py_TYPE(arg)->tp_name)) {
    3645           0 :                 Py_DECREF(path);
    3646           0 :             return 0;
    3647             :         }
    3648           6 :         path_bytes = PyBytes_FromObject(path);
    3649           6 :         Py_DECREF(path);
    3650           6 :         if (!path_bytes) {
    3651           0 :             return 0;
    3652             :         }
    3653           6 :         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
    3654             :                                                   PyBytes_GET_SIZE(path_bytes));
    3655           6 :         Py_DECREF(path_bytes);
    3656           6 :         if (!output) {
    3657           0 :             return 0;
    3658             :         }
    3659             :     }
    3660             :     else {
    3661           0 :         PyErr_Format(PyExc_TypeError,
    3662             :                      "path should be string, bytes, or os.PathLike, not %.200s",
    3663           0 :                      Py_TYPE(arg)->tp_name);
    3664           0 :         Py_DECREF(path);
    3665           0 :         return 0;
    3666             :     }
    3667       31051 :     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), /* a non-negative result means a null character was found in the path */
    3668             :                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
    3669           0 :         PyErr_SetString(PyExc_ValueError, "embedded null character");
    3670           0 :         Py_DECREF(output);
    3671           0 :         return 0;
    3672             :     }
    3673       31051 :     *(PyObject**)addr = output; /* the reference is handed to the caller's slot */
    3674       31051 :     return Py_CLEANUP_SUPPORTED;
    3675             : }
    3676             : 
    3677             : 
    3678             : static int unicode_fill_utf8(PyObject *unicode); /* forward declaration; called by PyUnicode_AsUTF8AndSize() below, definition not in this excerpt */
    3679             : 
    3680             : const char * /* Return the UTF-8 data pointer stored on `unicode` (filling it first if absent) and, when `psize` is non-NULL, its length; returns NULL on failure. */
    3681    25905300 : PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
    3682             : {
    3683    25905300 :     if (!PyUnicode_Check(unicode)) {
    3684           0 :         PyErr_BadArgument();
    3685           0 :         return NULL;
    3686             :     }
    3687             : 
    3688    25905300 :     if (PyUnicode_UTF8(unicode) == NULL) { /* no UTF-8 representation stored yet: build it now */
    3689        4540 :         if (unicode_fill_utf8(unicode) == -1) {
    3690          70 :             return NULL;
    3691             :         }
    3692             :     }
    3693             : 
    3694    25905300 :     if (psize) /* the size output is optional */
    3695    24283500 :         *psize = PyUnicode_UTF8_LENGTH(unicode);
    3696    25905300 :     return PyUnicode_UTF8(unicode);
    3697             : }
    3698             : /* Convenience wrapper: PyUnicode_AsUTF8AndSize() with psize == NULL.
                     :  * Returns the UTF-8 buffer without reporting its length, or NULL when
                     :  * PyUnicode_AsUTF8AndSize() returns NULL. */
    3699             : const char *
    3700     1621790 : PyUnicode_AsUTF8(PyObject *unicode)
    3701             : {
    3702     1621790 :     return PyUnicode_AsUTF8AndSize(unicode, NULL);
    3703             : }
    3704             : 
    3705             : /*
    3706             : PyUnicode_GetSize() has been deprecated since Python 3.3
    3707             : because it returned the length of the Py_UNICODE representation.
    3708             : 
    3709             : The symbol is still part of the stable ABI, because its signature
    3710             : doesn't include Py_UNICODE and it was not excluded from the
    3711             : stable ABI in PEP 384.
                     : 
                     : The implementation below is only a stub: it ignores `unicode`,
                     : sets RuntimeError ("PyUnicode_GetSize has been removed.") and
                     : always returns -1.
    3712             : */
    3713             : PyAPI_FUNC(Py_ssize_t)
    3714           0 : PyUnicode_GetSize(PyObject *unicode)
    3715             : {
    3716           0 :     PyErr_SetString(PyExc_RuntimeError,
    3717             :                     "PyUnicode_GetSize has been removed.");
    3718           0 :     return -1;
    3719             : }
    3720             : /* Return the length of `unicode` as given by PyUnicode_GET_LENGTH().
                     :  * If `unicode` fails PyUnicode_Check(), PyErr_BadArgument() is called
                     :  * and -1 is returned. */
    3721             : Py_ssize_t
    3722       50414 : PyUnicode_GetLength(PyObject *unicode)
    3723             : {
    3724       50414 :     if (!PyUnicode_Check(unicode)) {
    3725           0 :         PyErr_BadArgument();
    3726           0 :         return -1;
    3727             :     }
    3728       50414 :     return PyUnicode_GET_LENGTH(unicode);
    3729             : }
    3730             : /* Read the character at position `index` of `unicode`.
                     :  *
                     :  * Returns the character obtained with PyUnicode_READ().  On failure
                     :  * (Py_UCS4)-1 is returned with an exception set:
                     :  *   - `unicode` fails PyUnicode_Check(): PyErr_BadArgument()
                     :  *   - index < 0 or index >= length: IndexError
                     :  *     ("string index out of range")
                     :  * Negative indices are not interpreted relative to the end.
                     :  */
    3731             : Py_UCS4
    3732          21 : PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
    3733             : {
    3734             :     const void *data;
    3735             :     int kind;
    3736             :     /* Validate the type before any PyUnicode_* accessor macro is applied. */
    3737          21 :     if (!PyUnicode_Check(unicode)) {
    3738           0 :         PyErr_BadArgument();
    3739           0 :         return (Py_UCS4)-1;
    3740             :     }
    3741          21 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
    3742           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    3743           0 :         return (Py_UCS4)-1;
    3744             :     }
    3745          21 :     data = PyUnicode_DATA(unicode);
    3746          21 :     kind = PyUnicode_KIND(unicode);
    3747          21 :     return PyUnicode_READ(kind, data, index);
    3748             : }
    3749             : /* Store character `ch` at position `index` of `unicode`, in place.
                     :  *
                     :  * Returns 0 on success and -1 on failure.  The checks, in order:
                     :  *   - `unicode` must pass PyUnicode_Check() and PyUnicode_IS_COMPACT(),
                     :  *     otherwise PyErr_BadArgument() is called
                     :  *   - 0 <= index < length, otherwise IndexError
                     :  *   - unicode_check_modifiable() must return zero
                     :  *   - ch must not exceed PyUnicode_MAX_CHAR_VALUE(unicode), otherwise
                     :  *     ValueError ("character out of range")
                     :  */
    3750             : int
    3751          12 : PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
    3752             : {
    3753          12 :     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
    3754           0 :         PyErr_BadArgument();
    3755           0 :         return -1;
    3756             :     }
    3757          12 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
    3758           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    3759           0 :         return -1;
    3760             :     }
    3761          12 :     if (unicode_check_modifiable(unicode))  /* NOTE(review): no exception is set here; presumably the helper sets it - confirm */
    3762           0 :         return -1;
    3763          12 :     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {  /* the string is not widened here: a character that does not fit is rejected */
    3764           0 :         PyErr_SetString(PyExc_ValueError, "character out of range");
    3765           0 :         return -1;
    3766             :     }
    3767          12 :     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
    3768             :                     index, ch);
    3769          12 :     return 0;
    3770             : }
    3771             : /* Return the name of the default encoding, which is always "utf-8".
                     :  * The result points to a string literal, so it must not be modified
                     :  * or freed by the caller. */
    3772             : const char *
    3773        2098 : PyUnicode_GetDefaultEncoding(void)
    3774             : {
    3775        2098 :     return "utf-8";
    3776             : }
    3777             : 
    3778             : /* Create or adjust a UnicodeDecodeError.
                     :  *
                     :  * If *exceptionObject is NULL, a new exception is created from
                     :  * encoding, input and length, startpos, endpos and reason.  Otherwise
                     :  * the existing object is reused and only its start, end and reason are
                     :  * updated (encoding, input and length are not applied again).
                     :  *
                     :  * Nothing is returned: failure is signalled by *exceptionObject being
                     :  * NULL afterwards, either because creation failed or because an update
                     :  * failed and the object was cleared.  The callers in this file test
                     :  * for exactly that.
                     :  */
    3779             : static void
    3780        7384 : make_decode_exception(PyObject **exceptionObject,
    3781             :                       const char *encoding,
    3782             :                       const char *input, Py_ssize_t length,
    3783             :                       Py_ssize_t startpos, Py_ssize_t endpos,
    3784             :                       const char *reason)
    3785             : {
    3786        7384 :     if (*exceptionObject == NULL) {
    3787        5629 :         *exceptionObject = PyUnicodeDecodeError_Create(
    3788             :             encoding, input, length, startpos, endpos, reason);
    3789             :     }
    3790             :     else {
    3791        1755 :         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
    3792           0 :             goto onError;
    3793        1755 :         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
    3794           0 :             goto onError;
    3795        1755 :         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
    3796           0 :             goto onError;
    3797             :     }
    3798        7384 :     return;
    3799             :     /* An update failed: drop the exception so that callers see NULL. */
    3800           0 : onError:
    3801           0 :     Py_CLEAR(*exceptionObject);
    3802             : }
    3803             : /* widechar_resize() - compiled only when MS_WINDOWS is defined.
                     :  *
                     :  * Make sure the wchar_t buffer *buf can hold `newsize` elements and
                     :  * record `newsize` in *size.
                     :  *
                     :  * The buffer is reallocated only when newsize > *size.  The resize
                     :  * works on a local copy of the pointer, so *buf is overwritten only
                     :  * after PyMem_Resize() succeeded.  On allocation failure
                     :  * PyErr_NoMemory() is called, -1 is returned, and neither *buf nor
                     :  * *size is assigned.
                     :  *
                     :  * Returns 0 on success.
                     :  */
    3804             : #ifdef MS_WINDOWS
    3805             : static int
    3806             : widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
    3807             : {
    3808             :     if (newsize > *size) {
    3809             :         wchar_t *newbuf = *buf;
    3810             :         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
    3811             :             PyErr_NoMemory();
    3812             :             return -1;
    3813             :         }
    3814             :         *buf = newbuf;
    3815             :     }
    3816             :     *size = newsize;  /* also records a smaller size; the allocation itself is never shrunk here */
    3817             :     return 0;
    3818             : }
    3819             : 
    3820             : /* Error handling callback helper (wchar_t output buffer variant):
    3821             :    build the arguments, call the callback and check its result;
    3822             :    if no exception occurred, copy the replacement to the output
    3823             :    and adjust the various state variables.
    3824             :    Returns 0 on success, -1 on error.
                     : 
                     :    State passed by pointer and updated here:
                     :      errorHandler     - looked up from `errors` on first use, then cached
                     :      exceptionObject  - created or updated by make_decode_exception()
                     :      input, inend     - re-read from the exception's object, since the
                     :                         callback may have replaced it
                     :      endinpos, inptr  - set to the resume position returned by the callback
                     :      buf, bufsize     - output buffer, grown through widechar_resize()
                     :      outpos           - advanced past the copied replacement
                     :    startinpos is only read.
    3825             : */
    3826             : 
    3827             : static int
    3828             : unicode_decode_call_errorhandler_wchar(
    3829             :     const char *errors, PyObject **errorHandler,
    3830             :     const char *encoding, const char *reason,
    3831             :     const char **input, const char **inend, Py_ssize_t *startinpos,
    3832             :     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    3833             :     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
    3834             : {
    3835             :     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
    3836             :     /* &argparse[3] (used below) is the message text after the 3-character "Un;" prefix. */
    3837             :     PyObject *restuple = NULL;
    3838             :     PyObject *repunicode = NULL;
    3839             :     Py_ssize_t outsize;
    3840             :     Py_ssize_t insize;
    3841             :     Py_ssize_t requiredsize;
    3842             :     Py_ssize_t newpos;
    3843             :     PyObject *inputobj = NULL;
    3844             :     Py_ssize_t repwlen;
    3845             :     /* Look the error handler up once and cache it in *errorHandler. */
    3846             :     if (*errorHandler == NULL) {
    3847             :         *errorHandler = PyCodec_LookupError(errors);
    3848             :         if (*errorHandler == NULL)
    3849             :             goto onError;
    3850             :     }
    3851             :     /* Create the exception on the first error, otherwise update the existing one. */
    3852             :     make_decode_exception(exceptionObject,
    3853             :         encoding,
    3854             :         *input, *inend - *input,
    3855             :         *startinpos, *endinpos,
    3856             :         reason);
    3857             :     if (*exceptionObject == NULL)
    3858             :         goto onError;
    3859             :     /* Call the handler; its result has to be a tuple that parses as (str, int). */
    3860             :     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    3861             :     if (restuple == NULL)
    3862             :         goto onError;
    3863             :     if (!PyTuple_Check(restuple)) {
    3864             :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    3865             :         goto onError;
    3866             :     }
    3867             :     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
    3868             :         goto onError;
    3869             : 
    3870             :     /* Copy back the bytes variables, which might have been modified by the
    3871             :        callback */
    3872             :     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    3873             :     if (!inputobj)
    3874             :         goto onError;
    3875             :     *input = PyBytes_AS_STRING(inputobj);
    3876             :     insize = PyBytes_GET_SIZE(inputobj);
    3877             :     *inend = *input + insize;
    3878             :     /* we can DECREF safely, as the exception has another reference,
    3879             :        so the object won't go away. */
    3880             :     Py_DECREF(inputobj);
    3881             :     /* A negative position counts from the end of the (possibly new) input. */
    3882             :     if (newpos<0)
    3883             :         newpos = insize+newpos;
    3884             :     if (newpos<0 || newpos>insize) {
    3885             :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
    3886             :         goto onError;
    3887             :     }
    3888             :     /* Size query only (NULL buffer).  NOTE(review): the returned count
                     :        presumably includes the terminating null, hence the decrement
                     :        below - confirm against the PyUnicode_AsWideChar() documentation. */
    3889             :     repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    3890             :     if (repwlen < 0)
    3891             :         goto onError;
    3892             :     repwlen--;
    3893             :     /* need more space? (at least enough for what we
    3894             :        have+the replacement+the rest of the string (starting
    3895             :        at the new input position), so we won't have to check space
    3896             :        when there are no errors in the rest of the string) */
    3897             :     requiredsize = *outpos;
    3898             :     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
    3899             :         goto overflow;
    3900             :     requiredsize += repwlen;
    3901             :     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
    3902             :         goto overflow;
    3903             :     requiredsize += insize - newpos;
    3904             :     outsize = *bufsize;
    3905             :     if (requiredsize > outsize) {  /* grow: at least double the buffer when doubling cannot overflow */
    3906             :         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
    3907             :             requiredsize = 2*outsize;
    3908             :         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
    3909             :             goto onError;
    3910             :         }
    3911             :     }
    3912             :     PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);  /* the return value of this copy is not checked */
    3913             :     *outpos += repwlen;
    3914             :     *endinpos = newpos;
    3915             :     *inptr = *input + newpos;
    3916             : 
    3917             :     /* we made it! */
    3918             :     Py_DECREF(restuple);
    3919             :     return 0;
    3920             : 
    3921             :   overflow:
    3922             :     PyErr_SetString(PyExc_OverflowError,
    3923             :                     "decoded result is too long for a Python string");
    3924             :     /* overflow deliberately falls through into onError */
    3925             :   onError:
    3926             :     Py_XDECREF(restuple);
    3927             :     return -1;
    3928             : }
    3929             : #endif   /* MS_WINDOWS */
    3930             : /* Error handling callback helper (_PyUnicodeWriter variant).
                     :  *
                     :  * Looks up the handler named by `errors` on first use (cached in
                     :  * *errorHandler), creates or updates *exceptionObject, calls the
                     :  * handler and requires a tuple that parses as (str, int).  The
                     :  * replacement string is written to `writer`; *input and *inend are
                     :  * refreshed from the exception's object because the callback may have
                     :  * replaced it, and *endinpos and *inptr are set to the position where
                     :  * decoding resumes.  startinpos is only read.
                     :  *
                     :  * Returns 0 on success, -1 on error.
                     :  */
    3931             : static int
    3932        7384 : unicode_decode_call_errorhandler_writer(
    3933             :     const char *errors, PyObject **errorHandler,
    3934             :     const char *encoding, const char *reason,
    3935             :     const char **input, const char **inend, Py_ssize_t *startinpos,
    3936             :     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    3937             :     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
    3938             : {
    3939             :     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
    3940             :     /* &argparse[3] (used below) is the message text after the 3-character "Un;" prefix. */
    3941        7384 :     PyObject *restuple = NULL;
    3942        7384 :     PyObject *repunicode = NULL;
    3943             :     Py_ssize_t insize;
    3944             :     Py_ssize_t newpos;
    3945             :     Py_ssize_t replen;
    3946             :     Py_ssize_t remain;
    3947        7384 :     PyObject *inputobj = NULL;
    3948        7384 :     int need_to_grow = 0;
    3949             :     const char *new_inptr;
    3950             :     /* Look the error handler up once and cache it in *errorHandler. */
    3951        7384 :     if (*errorHandler == NULL) {
    3952        5629 :         *errorHandler = PyCodec_LookupError(errors);
    3953        5629 :         if (*errorHandler == NULL)
    3954           0 :             goto onError;
    3955             :     }
    3956             :     /* Create the exception on the first error, otherwise update the existing one. */
    3957        7384 :     make_decode_exception(exceptionObject,
    3958             :         encoding,
    3959        7384 :         *input, *inend - *input,
    3960             :         *startinpos, *endinpos,
    3961             :         reason);
    3962        7384 :     if (*exceptionObject == NULL)
    3963           0 :         goto onError;
    3964             :     /* Call the handler; its result has to be a tuple that parses as (str, int). */
    3965        7384 :     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    3966        7384 :     if (restuple == NULL)
    3967        4063 :         goto onError;
    3968        3321 :     if (!PyTuple_Check(restuple)) {
    3969          12 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    3970          12 :         goto onError;
    3971             :     }
    3972        3309 :     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
    3973          22 :         goto onError;
    3974             : 
    3975             :     /* Copy back the bytes variables, which might have been modified by the
    3976             :        callback */
    3977        3287 :     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    3978        3287 :     if (!inputobj)
    3979           7 :         goto onError;
    3980        3280 :     remain = *inend - *input - *endinpos;  /* input bytes after the error, measured on the old buffer before it is replaced */
    3981        3280 :     *input = PyBytes_AS_STRING(inputobj);
    3982        3280 :     insize = PyBytes_GET_SIZE(inputobj);
    3983        3280 :     *inend = *input + insize;
    3984             :     /* we can DECREF safely, as the exception has another reference,
    3985             :        so the object won't go away. */
    3986        3280 :     Py_DECREF(inputobj);
    3987             :     /* A negative position counts from the end of the (possibly new) input. */
    3988        3280 :     if (newpos<0)
    3989           3 :         newpos = insize+newpos;
    3990        3280 :     if (newpos<0 || newpos>insize) {
    3991           2 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
    3992           2 :         goto onError;
    3993             :     }
    3994             :     /* A replacement longer than one character raises the writer's minimum length. */
    3995        3278 :     replen = PyUnicode_GET_LENGTH(repunicode);
    3996        3278 :     if (replen > 1) {
    3997          71 :         writer->min_length += replen - 1;
    3998          71 :         need_to_grow = 1;
    3999             :     }
    4000        3278 :     new_inptr = *input + newpos;
    4001        3278 :     if (*inend - new_inptr > remain) {  /* more input is left to decode than before the callback ran */
    4002             :         /* We don't know the decoding algorithm here so we make the worst
    4003             :            assumption that one byte decodes to one unicode character.
    4004             :            If unfortunately one byte could decode to more unicode characters,
    4005             :            the decoder may write out-of-bound then.  Is it possible for the
    4006             :            algorithms using this function? */
    4007         267 :         writer->min_length += *inend - new_inptr - remain;
    4008         267 :         need_to_grow = 1;
    4009             :     }
    4010        3278 :     if (need_to_grow) {  /* reserve min_length - pos more characters, wide enough for the replacement's largest character */
    4011         337 :         writer->overallocate = 1;
    4012         477 :         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
    4013         280 :                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
    4014           0 :             goto onError;
    4015             :     }
    4016        3278 :     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
    4017           0 :         goto onError;
    4018             :     /* Only now publish the resume position to the caller. */
    4019        3278 :     *endinpos = newpos;
    4020        3278 :     *inptr = new_inptr;
    4021             : 
    4022             :     /* we made it! */
    4023        3278 :     Py_DECREF(restuple);
    4024        3278 :     return 0;
    4025             : 
    4026        4106 :   onError:
    4027        4106 :     Py_XDECREF(restuple);
    4028        4106 :     return -1;
    4029             : }
    4030             : 
    4031             : /* --- UTF-7 Codec -------------------------------------------------------- */
    4032             : 
    4033             : /* See RFC2152 for details.  We encode conservatively and decode liberally. */
    4034             : 
    4035             : /* Three simple macros defining base-64. */
    4036             : 
    4037             : /* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
                     :  * c is evaluated several times, so pass an expression without side
                     :  * effects. */
    4038             : 
    4039             : #define IS_BASE64(c) \
    4040             :     (((c) >= 'A' && (c) <= 'Z') ||     \
    4041             :      ((c) >= 'a' && (c) <= 'z') ||     \
    4042             :      ((c) >= '0' && (c) <= '9') ||     \
    4043             :      (c) == '+' || (c) == '/')
    4044             : 
    4045             : /* Given that c is a base-64 character, what is its base-64 value?
                     :  * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61 and '+' to 62.  Every
                     :  * other value, including '/', yields 63, so the result is only
                     :  * meaningful when IS_BASE64(c) holds.  c is evaluated several times. */
    4046             : 
    4047             : #define FROM_BASE64(c)                                                  \
    4048             :     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
    4049             :      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
    4050             :      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
    4051             :      (c) == '+' ? 62 : 63)
    4052             : 
    4053             : /* What is the base-64 character of the bottom 6 bits of n?
                     :  * n is masked with 0x3f before indexing the 64-character alphabet, so
                     :  * any higher bits of n are ignored. */
    4054             : 
    4055             : #define TO_BASE64(n)  \
    4056             :     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
    4057             : 
    4058             : /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
    4059             :  * decoded as itself.  We are permissive on decoding; the only ASCII
    4060             :  * byte not decoding to itself is the + which begins a base64
    4061             :  * string.  Values above 127 are never direct.  c is evaluated twice,
                     :  * so pass an expression without side effects. */
    4062             : 
    4063             : #define DECODE_DIRECT(c)                                \
    4064             :     ((c) <= 127 && (c) != '+')
    4065             : 
    4066             : /* The UTF-7 encoder treats ASCII characters differently according to
    4067             :  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
    4068             :  * the above).  See RFC2152.  This array identifies these different
    4069             :  * sets:
    4070             :  * 0 : "Set D"
    4071             :  *     alphanumeric and '(),-./:?
    4072             :  * 1 : "Set O"
    4073             :  *     !"#$%&*;<=>@[]^_`{|}
    4074             :  * 2 : "whitespace"
    4075             :  *     ht nl cr sp
    4076             :  * 3 : special (must be base64 encoded)
    4077             :  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
                     :  *
                     :  * The table is indexed by the ASCII code (0-127).  Each comment row in
                     :  * the initializer names the 16 characters whose categories follow on
                     :  * the next line.
                     :  * NOTE(review): the code visible here only reads this table (through
                     :  * ENCODE_DIRECT), so it could probably be declared const - confirm
                     :  * that nothing else in the file writes to it.
    4078             :  */
    4079             : 
    4080             : static
    4081             : char utf7_category[128] = {
    4082             : /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    4083             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    4084             : /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    4085             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    4086             : /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    4087             :     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    4088             : /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    4089             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    4090             : /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    4091             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    4092             : /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    4093             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    4094             : /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    4095             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    4096             : /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    4097             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
    4098             : };
    4099             : 
    4100             : /* ENCODE_DIRECT: this character should be encoded as itself.  The
    4101             :  * answer depends on whether we are encoding set O as itself, and also
    4102             :  * on whether we are encoding whitespace as itself.  RFC2152 makes it
    4103             :  * clear that the answers to these questions vary between
    4104             :  * applications, so this code needs to be flexible.
                     :  *
                     :  * c:        character to test; only values 1..127 can be direct, and
                     :  *           the range test also guards the utf7_category[] index.
                     :  * directO:  true if Set O characters (category 1) are encoded as
                     :  *           themselves.
                     :  * directWS: true if whitespace (category 2) is encoded as itself.
                     :  * Set D characters (category 0) are always direct.
                     :  *
                     :  * c is evaluated several times, and directO and directWS are not
                     :  * parenthesized in the expansion, so pass simple expressions without
                     :  * side effects.  */
    4105             : 
    4106             : #define ENCODE_DIRECT(c, directO, directWS)             \
    4107             :     ((c) < 128 && (c) > 0 &&                            \
    4108             :      ((utf7_category[(c)] == 0) ||                      \
    4109             :       (directWS && (utf7_category[(c)] == 2)) ||        \
    4110             :       (directO && (utf7_category[(c)] == 1))))
    4111             : /* Decode `size` bytes of UTF-7 data starting at `s`, using the error
                     :  * handling named by `errors`.  This is PyUnicode_DecodeUTF7Stateful()
                     :  * with consumed == NULL, and its result is returned unchanged. */
    4112             : PyObject *
    4113           0 : PyUnicode_DecodeUTF7(const char *s,
    4114             :                      Py_ssize_t size,
    4115             :                      const char *errors)
    4116             : {
    4117           0 :     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
    4118             : }
    4119             : 
    4120             : /* The decoder.  The only state we preserve is our read position,
    4121             :  * i.e. how many characters we have consumed.  So if we end in the
    4122             :  * middle of a shift sequence we have to back off the read position
    4123             :  * and the output to the beginning of the sequence, otherwise we lose
    4124             :  * all the shift state (seen bits, number of bits seen, high
    4125             :  * surrogate). */
    4126             : 
    4127             : PyObject *
    4128        5662 : PyUnicode_DecodeUTF7Stateful(const char *s,
    4129             :                              Py_ssize_t size,
    4130             :                              const char *errors,
    4131             :                              Py_ssize_t *consumed)
    4132             : {
    4133        5662 :     const char *starts = s;
    4134             :     Py_ssize_t startinpos;
    4135             :     Py_ssize_t endinpos;
    4136             :     const char *e;
    4137             :     _PyUnicodeWriter writer;
    4138        5662 :     const char *errmsg = "";
    4139        5662 :     int inShift = 0;
    4140             :     Py_ssize_t shiftOutStart;
    4141        5662 :     unsigned int base64bits = 0;
    4142        5662 :     unsigned long base64buffer = 0;
    4143        5662 :     Py_UCS4 surrogate = 0;
    4144        5662 :     PyObject *errorHandler = NULL;
    4145        5662 :     PyObject *exc = NULL;
    4146             : 
    4147        5662 :     if (size == 0) {
    4148          18 :         if (consumed)
    4149           8 :             *consumed = 0;
    4150          18 :         _Py_RETURN_UNICODE_EMPTY();
    4151             :     }
    4152             : 
    4153             :     /* Start off assuming it's all ASCII. Widen later as necessary. */
    4154        5644 :     _PyUnicodeWriter_Init(&writer);
    4155        5644 :     writer.min_length = size;
    4156             : 
    4157        5644 :     shiftOutStart = 0;
    4158        5644 :     e = s + size;
    4159             : 
    4160      356931 :     while (s < e) {
    4161             :         Py_UCS4 ch;
    4162      351314 :       restart:
    4163      351314 :         ch = (unsigned char) *s;
    4164             : 
    4165      351314 :         if (inShift) { /* in a base-64 section */
    4166       53274 :             if (IS_BASE64(ch)) { /* consume a base-64 character */
    4167       50649 :                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
    4168       50649 :                 base64bits += 6;
    4169       50649 :                 s++;
    4170       50649 :                 if (base64bits >= 16) {
    4171             :                     /* we have enough bits for a UTF-16 value */
    4172       18553 :                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
    4173       18553 :                     base64bits -= 16;
    4174       18553 :                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
    4175       18553 :                     assert(outCh <= 0xffff);
    4176       18553 :                     if (surrogate) {
    4177             :                         /* expecting a second surrogate */
    4178          17 :                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
    4179          16 :                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
    4180          16 :                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
    4181           0 :                                 goto onError;
    4182          16 :                             surrogate = 0;
    4183          16 :                             continue;
    4184             :                         }
    4185             :                         else {
    4186           1 :                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
    4187           0 :                                 goto onError;
    4188           1 :                             surrogate = 0;
    4189             :                         }
    4190             :                     }
    4191       18537 :                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
    4192             :                         /* first surrogate */
    4193          52 :                         surrogate = outCh;
    4194             :                     }
    4195             :                     else {
    4196       18485 :                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
    4197           0 :                             goto onError;
    4198             :                     }
    4199             :                 }
    4200             :             }
    4201             :             else { /* now leaving a base-64 section */
    4202        2625 :                 inShift = 0;
    4203        2625 :                 if (base64bits > 0) { /* left-over bits */
    4204        2615 :                     if (base64bits >= 6) {
    4205             :                         /* We've seen at least one base-64 character */
    4206          28 :                         s++;
    4207          28 :                         errmsg = "partial character in shift sequence";
    4208          28 :                         goto utf7Error;
    4209             :                     }
    4210             :                     else {
    4211             :                         /* Some bits remain; they should be zero */
    4212        2587 :                         if (base64buffer != 0) {
    4213           8 :                             s++;
    4214           8 :                             errmsg = "non-zero padding bits in shift sequence";
    4215           8 :                             goto utf7Error;
    4216             :                         }
    4217             :                     }
    4218             :                 }
    4219        2589 :                 if (surrogate && DECODE_DIRECT(ch)) {
    4220           9 :                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
    4221           0 :                         goto onError;
    4222             :                 }
    4223        2589 :                 surrogate = 0;
    4224        2589 :                 if (ch == '-') {
    4225             :                     /* '-' is absorbed; other terminating
    4226             :                        characters are preserved */
    4227        2578 :                     s++;
    4228             :                 }
    4229             :             }
    4230             :         }
    4231      298040 :         else if ( ch == '+' ) {
    4232        2899 :             startinpos = s-starts;
    4233        2899 :             s++; /* consume '+' */
    4234        2899 :             if (s < e && *s == '-') { /* '+-' encodes '+' */
    4235           7 :                 s++;
    4236           7 :                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
    4237           0 :                     goto onError;
    4238             :             }
    4239        2892 :             else if (s < e && !IS_BASE64(*s)) {
    4240           6 :                 s++;
    4241           6 :                 errmsg = "ill-formed sequence";
    4242           6 :                 goto utf7Error;
    4243             :             }
    4244             :             else { /* begin base64-encoded section */
    4245        2886 :                 inShift = 1;
    4246        2886 :                 surrogate = 0;
    4247        2886 :                 shiftOutStart = writer.pos;
    4248        2886 :                 base64bits = 0;
    4249        2886 :                 base64buffer = 0;
    4250             :             }
    4251             :         }
    4252      295141 :         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
    4253      294871 :             s++;
    4254      294871 :             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
    4255           0 :                 goto onError;
    4256             :         }
    4257             :         else {
    4258         270 :             startinpos = s-starts;
    4259         270 :             s++;
    4260         270 :             errmsg = "unexpected special character";
    4261         270 :             goto utf7Error;
    4262             :         }
    4263      350986 :         continue;
    4264         312 : utf7Error:
    4265         312 :         endinpos = s-starts;
    4266         312 :         if (unicode_decode_call_errorhandler_writer(
    4267             :                 errors, &errorHandler,
    4268             :                 "utf7", errmsg,
    4269             :                 &starts, &e, &startinpos, &endinpos, &exc, &s,
    4270             :                 &writer))
    4271          27 :             goto onError;
    4272             :     }
    4273             : 
    4274             :     /* end of string */
    4275             : 
    4276        5617 :     if (inShift && !consumed) { /* in shift sequence, no more to follow */
    4277             :         /* if we're in an inconsistent state, that's an error */
    4278          15 :         inShift = 0;
    4279          15 :         if (surrogate ||
    4280           7 :                 (base64bits >= 6) ||
    4281           6 :                 (base64bits > 0 && base64buffer != 0)) {
    4282          12 :             endinpos = size;
    4283          12 :             if (unicode_decode_call_errorhandler_writer(
    4284             :                     errors, &errorHandler,
    4285             :                     "utf7", "unterminated shift sequence",
    4286             :                     &starts, &e, &startinpos, &endinpos, &exc, &s,
    4287             :                     &writer))
    4288           5 :                 goto onError;
    4289           7 :             if (s < e)
    4290           0 :                 goto restart;
    4291             :         }
    4292             :     }
    4293             : 
    4294             :     /* return state */
    4295        5612 :     if (consumed) {
    4296        4429 :         if (inShift) {
    4297         246 :             *consumed = startinpos;
    4298         246 :             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
    4299          72 :                 PyObject *result = PyUnicode_FromKindAndData(
    4300          72 :                         writer.kind, writer.data, shiftOutStart);
    4301          72 :                 Py_XDECREF(errorHandler);
    4302          72 :                 Py_XDECREF(exc);
    4303          72 :                 _PyUnicodeWriter_Dealloc(&writer);
    4304          72 :                 return result;
    4305             :             }
    4306         174 :             writer.pos = shiftOutStart; /* back off output */
    4307             :         }
    4308             :         else {
    4309        4183 :             *consumed = s-starts;
    4310             :         }
    4311             :     }
    4312             : 
    4313        5540 :     Py_XDECREF(errorHandler);
    4314        5540 :     Py_XDECREF(exc);
    4315        5540 :     return _PyUnicodeWriter_Finish(&writer);
    4316             : 
    4317          32 :   onError:
    4318          32 :     Py_XDECREF(errorHandler);
    4319          32 :     Py_XDECREF(exc);
    4320          32 :     _PyUnicodeWriter_Dealloc(&writer);
    4321          32 :     return NULL;
    4322             : }
    4323             : 
    4324             : 
    4325             : PyObject *
    4326        1907 : _PyUnicode_EncodeUTF7(PyObject *str,
    4327             :                       int base64SetO,
    4328             :                       int base64WhiteSpace,
    4329             :                       const char *errors)
    4330             : {
    4331             :     int kind;
    4332             :     const void *data;
    4333             :     Py_ssize_t len;
    4334             :     PyObject *v;
    4335        1907 :     int inShift = 0;
    4336             :     Py_ssize_t i;
    4337        1907 :     unsigned int base64bits = 0;
    4338        1907 :     unsigned long base64buffer = 0;
    4339             :     char * out;
    4340             :     const char * start;
    4341             : 
    4342        1907 :     kind = PyUnicode_KIND(str);
    4343        1907 :     data = PyUnicode_DATA(str);
    4344        1907 :     len = PyUnicode_GET_LENGTH(str);
    4345             : 
    4346        1907 :     if (len == 0)
    4347          20 :         return PyBytes_FromStringAndSize(NULL, 0);
    4348             : 
    4349             :     /* It might be possible to tighten this worst case */
    4350        1887 :     if (len > PY_SSIZE_T_MAX / 8)
    4351           0 :         return PyErr_NoMemory();
    4352        1887 :     v = PyBytes_FromStringAndSize(NULL, len * 8);
    4353        1887 :     if (v == NULL)
    4354           0 :         return NULL;
    4355             : 
    4356        1887 :     start = out = PyBytes_AS_STRING(v);
    4357      351497 :     for (i = 0; i < len; ++i) {
    4358      349610 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    4359             : 
    4360      349610 :         if (inShift) {
    4361       29654 :             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    4362             :                 /* shifting out */
    4363       13622 :                 if (base64bits) { /* output remaining bits */
    4364       13619 :                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
    4365       13619 :                     base64buffer = 0;
    4366       13619 :                     base64bits = 0;
    4367             :                 }
    4368       13622 :                 inShift = 0;
    4369             :                 /* Characters not in the BASE64 set implicitly unshift the sequence
    4370             :                    so no '-' is required, except if the character is itself a '-' */
    4371       13622 :                 if (IS_BASE64(ch) || ch == '-') {
    4372       13612 :                     *out++ = '-';
    4373             :                 }
    4374       13622 :                 *out++ = (char) ch;
    4375             :             }
    4376             :             else {
    4377       16032 :                 goto encode_char;
    4378             :             }
    4379             :         }
    4380             :         else { /* not in a shift sequence */
    4381      319956 :             if (ch == '+') {
    4382          13 :                 *out++ = '+';
    4383          13 :                         *out++ = '-';
    4384             :             }
    4385      319943 :             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    4386      305351 :                 *out++ = (char) ch;
    4387             :             }
    4388             :             else {
    4389       14592 :                 *out++ = '+';
    4390       14592 :                 inShift = 1;
    4391       14592 :                 goto encode_char;
    4392             :             }
    4393             :         }
    4394      318986 :         continue;
    4395       30624 : encode_char:
    4396       30624 :         if (ch >= 0x10000) {
    4397           9 :             assert(ch <= MAX_UNICODE);
    4398             : 
    4399             :             /* code first surrogate */
    4400           9 :             base64bits += 16;
    4401           9 :             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
    4402          30 :             while (base64bits >= 6) {
    4403          21 :                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    4404          21 :                 base64bits -= 6;
    4405             :             }
    4406             :             /* prepare second surrogate */
    4407           9 :             ch = Py_UNICODE_LOW_SURROGATE(ch);
    4408             :         }
    4409       30624 :         base64bits += 16;
    4410       30624 :         base64buffer = (base64buffer << 16) | ch;
    4411      102575 :         while (base64bits >= 6) {
    4412       71951 :             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    4413       71951 :             base64bits -= 6;
    4414             :         }
    4415             :     }
    4416        1887 :     if (base64bits)
    4417         961 :         *out++= TO_BASE64(base64buffer << (6-base64bits) );
    4418        1887 :     if (inShift)
    4419         970 :         *out++ = '-';
    4420        1887 :     if (_PyBytes_Resize(&v, out - start) < 0)
    4421           0 :         return NULL;
    4422        1887 :     return v;
    4423             : }
    4424             : 
    4425             : #undef IS_BASE64
    4426             : #undef FROM_BASE64
    4427             : #undef TO_BASE64
    4428             : #undef DECODE_DIRECT
    4429             : #undef ENCODE_DIRECT
    4430             : 
    4431             : /* --- UTF-8 Codec -------------------------------------------------------- */
    4432             : 
    4433             : PyObject *
    4434    34867200 : PyUnicode_DecodeUTF8(const char *s,
    4435             :                      Py_ssize_t size,
    4436             :                      const char *errors)
    4437             : {
    4438    34867200 :     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    4439             : }
    4440             : 
    4441             : #include "stringlib/asciilib.h"
    4442             : #include "stringlib/codecs.h"
    4443             : #include "stringlib/undef.h"
    4444             : 
    4445             : #include "stringlib/ucs1lib.h"
    4446             : #include "stringlib/codecs.h"
    4447             : #include "stringlib/undef.h"
    4448             : 
    4449             : #include "stringlib/ucs2lib.h"
    4450             : #include "stringlib/codecs.h"
    4451             : #include "stringlib/undef.h"
    4452             : 
    4453             : #include "stringlib/ucs4lib.h"
    4454             : #include "stringlib/codecs.h"
    4455             : #include "stringlib/undef.h"
    4456             : 
    4457             : /* Mask to quickly check whether a C 'size_t' contains a
    4458             :    non-ASCII, UTF8-encoded char. */
    4459             : #if (SIZEOF_SIZE_T == 8)
    4460             : # define ASCII_CHAR_MASK 0x8080808080808080ULL
    4461             : #elif (SIZEOF_SIZE_T == 4)
    4462             : # define ASCII_CHAR_MASK 0x80808080U
    4463             : #else
    4464             : # error C 'size_t' size should be either 4 or 8!
    4465             : #endif
    4466             : 
    4467             : static Py_ssize_t
    4468    86136700 : ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
    4469             : {
    4470    86136700 :     const char *p = start;
    4471             : 
    4472             : #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    4473    86136700 :     assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    4474    86136700 :     if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
    4475             :         /* Fast path; see STRINGLIB(utf8_decode) in stringlib/codecs.h
    4476             :            for an explanation. */
    4477             :         /* Help allocation */
    4478    50435100 :         const char *_p = p;
    4479    50435100 :         Py_UCS1 * q = dest;
    4480   130927000 :         while (_p + SIZEOF_SIZE_T <= end) {
    4481    80653100 :             size_t value = *(const size_t *) _p;
    4482    80653100 :             if (value & ASCII_CHAR_MASK)
    4483      161373 :                 break;
    4484    80491700 :             *((size_t *)q) = value;
    4485    80491700 :             _p += SIZEOF_SIZE_T;
    4486    80491700 :             q += SIZEOF_SIZE_T;
    4487             :         }
    4488    50435100 :         p = _p;
    4489   259068000 :         while (p < end) {
    4490   208938000 :             if ((unsigned char)*p & 0x80)
    4491      304842 :                 break;
    4492   208633000 :             *q++ = *p++;
    4493             :         }
    4494    50435100 :         return p - start;
    4495             :     }
    4496             : #endif
    4497   272199000 :     while (p < end) {
    4498             :         /* Fast path; see STRINGLIB(utf8_decode) in stringlib/codecs.h
    4499             :            for an explanation. */
    4500   238330000 :         if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
    4501             :             /* Help allocation */
    4502    27027700 :             const char *_p = p;
    4503    47908600 :             while (_p + SIZEOF_SIZE_T <= end) {
    4504    20889900 :                 size_t value = *(const size_t *) _p;
    4505    20889900 :                 if (value & ASCII_CHAR_MASK)
    4506        8929 :                     break;
    4507    20880900 :                 _p += SIZEOF_SIZE_T;
    4508             :             }
    4509    27027700 :             p = _p;
    4510    27027700 :             if (_p == end)
    4511     1638080 :                 break;
    4512             :         }
    4513   236692000 :         if ((unsigned char)*p & 0x80)
    4514      194034 :             break;
    4515   236498000 :         ++p;
    4516             :     }
    4517    35701600 :     memcpy(dest, start, p - start);
    4518    35701600 :     return p - start;
    4519             : }
    4520             : 
    4521             : static PyObject *
    4522    99028300 : unicode_decode_utf8(const char *s, Py_ssize_t size,
    4523             :                     _Py_error_handler error_handler, const char *errors,
    4524             :                     Py_ssize_t *consumed)
    4525             : {
    4526    99028300 :     if (size == 0) {
    4527     4030610 :         if (consumed)
    4528          38 :             *consumed = 0;
    4529     4030610 :         _Py_RETURN_UNICODE_EMPTY();
    4530             :     }
    4531             : 
    4532             :     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    4533    94997700 :     if (size == 1 && (unsigned char)s[0] < 128) {
    4534     9141670 :         if (consumed) {
    4535        6973 :             *consumed = 1;
    4536             :         }
    4537     9141670 :         return get_latin1_char((unsigned char)s[0]);
    4538             :     }
    4539             : 
    4540    85856000 :     const char *starts = s;
    4541    85856000 :     const char *end = s + size;
    4542             : 
    4543             :     // fast path: try ASCII string.
    4544    85856000 :     PyObject *u = PyUnicode_New(size, 127);
    4545    85856000 :     if (u == NULL) {
    4546         279 :         return NULL;
    4547             :     }
    4548    85855700 :     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    4549    85855700 :     if (s == end) {
    4550    85357300 :         return u;
    4551             :     }
    4552             : 
    4553             :     // Fall back to _PyUnicodeWriter once the ASCII fast path has failed.
    4554             :     _PyUnicodeWriter writer;
    4555      498446 :     _PyUnicodeWriter_InitWithBuffer(&writer, u);
    4556      498446 :     writer.pos = s - starts;
    4557             : 
    4558             :     Py_ssize_t startinpos, endinpos;
    4559      498446 :     const char *errmsg = "";
    4560      498446 :     PyObject *error_handler_obj = NULL;
    4561      498446 :     PyObject *exc = NULL;
    4562             : 
    4563      967959 :     while (s < end) {
    4564             :         Py_UCS4 ch;
    4565      781947 :         int kind = writer.kind;
    4566             : 
    4567      781947 :         if (kind == PyUnicode_1BYTE_KIND) {
    4568      587538 :             if (PyUnicode_IS_ASCII(writer.buffer))
    4569      499205 :                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
    4570             :             else
    4571       88333 :                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
    4572      194409 :         } else if (kind == PyUnicode_2BYTE_KIND) {
    4573      168099 :             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
    4574             :         } else {
    4575       26310 :             assert(kind == PyUnicode_4BYTE_KIND);
    4576       26310 :             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
    4577             :         }
    4578             : 
    4579      781947 :         switch (ch) {
    4580      315492 :         case 0:
    4581      315492 :             if (s == end || consumed)
    4582      309011 :                 goto End;
    4583        6481 :             errmsg = "unexpected end of data";
    4584        6481 :             startinpos = s - starts;
    4585        6481 :             endinpos = end - starts;
    4586        6481 :             break;
    4587        4927 :         case 1:
    4588        4927 :             errmsg = "invalid start byte";
    4589        4927 :             startinpos = s - starts;
    4590        4927 :             endinpos = startinpos + 1;
    4591        4927 :             break;
    4592       11188 :         case 2:
    4593       11188 :             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
    4594           4 :                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
    4595             :             {
    4596             :                 /* Truncated surrogate code in range D800-DFFF */
    4597           4 :                 goto End;
    4598             :             }
    4599             :             /* fall through */
    4600             :         case 3:
    4601             :         case 4:
    4602       12050 :             errmsg = "invalid continuation byte";
    4603       12050 :             startinpos = s - starts;
    4604       12050 :             endinpos = startinpos + ch - 1;
    4605       12050 :             break;
    4606      449474 :         default:
    4607      449474 :             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
    4608           0 :                 goto onError;
    4609      449474 :             continue;
    4610             :         }
    4611             : 
    4612       23458 :         if (error_handler == _Py_ERROR_UNKNOWN)
    4613       13589 :             error_handler = _Py_GetErrorHandler(errors);
    4614             : 
    4615       23458 :         switch (error_handler) {
    4616         894 :         case _Py_ERROR_IGNORE:
    4617         894 :             s += (endinpos - startinpos);
    4618         894 :             break;
    4619             : 
    4620        1286 :         case _Py_ERROR_REPLACE:
    4621        1286 :             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
    4622           0 :                 goto onError;
    4623        1286 :             s += (endinpos - startinpos);
    4624        1286 :             break;
    4625             : 
    4626       16416 :         case _Py_ERROR_SURROGATEESCAPE:
    4627             :         {
    4628             :             Py_ssize_t i;
    4629             : 
    4630       16416 :             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
    4631           0 :                 goto onError;
    4632       32832 :             for (i=startinpos; i<endinpos; i++) {
    4633       16416 :                 ch = (Py_UCS4)(unsigned char)(starts[i]);
    4634       16416 :                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
    4635             :                                 ch + 0xdc00);
    4636       16416 :                 writer.pos++;
    4637             :             }
    4638       16416 :             s += (endinpos - startinpos);
    4639       16416 :             break;
    4640             :         }
    4641             : 
    4642        4862 :         default:
    4643        4862 :             if (unicode_decode_call_errorhandler_writer(
    4644             :                     errors, &error_handler_obj,
    4645             :                     "utf-8", errmsg,
    4646             :                     &starts, &end, &startinpos, &endinpos, &exc, &s,
    4647             :                     &writer))
    4648        3419 :                 goto onError;
    4649             :         }
    4650             :     }
    4651             : 
    4652      186012 : End:
    4653      495027 :     if (consumed)
    4654      200026 :         *consumed = s - starts;
    4655             : 
    4656      495027 :     Py_XDECREF(error_handler_obj);
    4657      495027 :     Py_XDECREF(exc);
    4658      495027 :     return _PyUnicodeWriter_Finish(&writer);
    4659             : 
    4660        3419 : onError:
    4661        3419 :     Py_XDECREF(error_handler_obj);
    4662        3419 :     Py_XDECREF(exc);
    4663        3419 :     _PyUnicodeWriter_Dealloc(&writer);
    4664        3419 :     return NULL;
    4665             : }
    4666             : 
    4667             : 
    4668             : PyObject *
    4669    97617600 : PyUnicode_DecodeUTF8Stateful(const char *s,
    4670             :                              Py_ssize_t size,
    4671             :                              const char *errors,
    4672             :                              Py_ssize_t *consumed)
    4673             : {
    4674    97617600 :     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
    4675             : }
    4676             : 
    4677             : 
    4678             : /* UTF-8 decoder: 'errors' selects the strict, surrogateescape or
    4679             :    surrogatepass error handler; return -3 for any other error handler.
    4680             : 
    4681             :    On success, write a pointer to a newly allocated wide character string into
    4682             :    *wstr (use PyMem_RawFree() to free the memory) and write the output length
    4683             :    (in number of wchar_t units) into *wlen (if wlen is set).
    4684             : 
    4685             :    On memory allocation failure, return -1.
    4686             : 
    4687             :    On decoding error not handled by the error handler, return -2. If wlen is
    4688             :    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
    4689             :    is not NULL, write the decoding error message into *reason. */
    4690             : int
    4691       26367 : _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
    4692             :                  const char **reason, _Py_error_handler errors)
    4693             : {
    4694       26367 :     const char *orig_s = s;
    4695             :     const char *e;
    4696             :     wchar_t *unicode;
    4697             :     Py_ssize_t outpos;
    4698             : 
    4699       26367 :     int surrogateescape = 0;
    4700       26367 :     int surrogatepass = 0;
    4701       26367 :     switch (errors)
    4702             :     {
    4703           0 :     case _Py_ERROR_STRICT:
    4704           0 :         break;
    4705       26367 :     case _Py_ERROR_SURROGATEESCAPE:
    4706       26367 :         surrogateescape = 1;
    4707       26367 :         break;
    4708           0 :     case _Py_ERROR_SURROGATEPASS:
    4709           0 :         surrogatepass = 1;
    4710           0 :         break;
    4711           0 :     default:
    4712           0 :         return -3;
    4713             :     }
    4714             : 
    4715             :     /* Note: size will always be longer than the resulting Unicode
    4716             :        character count */
    4717       26367 :     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
    4718           0 :         return -1;
    4719             :     }
    4720             : 
    4721       26367 :     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    4722       26367 :     if (!unicode) {
    4723           0 :         return -1;
    4724             :     }
    4725             : 
    4726             :     /* Unpack UTF-8 encoded data */
    4727       26367 :     e = s + size;
    4728       26367 :     outpos = 0;
    4729       26417 :     while (s < e) {
    4730             :         Py_UCS4 ch;
    4731             : #if SIZEOF_WCHAR_T == 4
    4732       26413 :         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
    4733             : #else
    4734             :         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
    4735             : #endif
    4736       26413 :         if (ch > 0xFF) {
    4737             : #if SIZEOF_WCHAR_T == 4
    4738           0 :             Py_UNREACHABLE();
    4739             : #else
    4740             :             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
    4741             :             /* write a surrogate pair */
    4742             :             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
    4743             :             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
    4744             : #endif
    4745             :         }
    4746             :         else {
    4747       26413 :             if (!ch && s == e) {
    4748       26363 :                 break;
    4749             :             }
    4750             : 
    4751          50 :             if (surrogateescape) {
    4752          50 :                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
    4753             :             }
    4754             :             else {
    4755             :                 /* Is it a valid three-byte code? */
    4756           0 :                 if (surrogatepass
    4757           0 :                     && (e - s) >= 3
    4758           0 :                     && (s[0] & 0xf0) == 0xe0
    4759           0 :                     && (s[1] & 0xc0) == 0x80
    4760           0 :                     && (s[2] & 0xc0) == 0x80)
    4761             :                 {
    4762           0 :                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
    4763           0 :                     s += 3;
    4764           0 :                     unicode[outpos++] = ch;
    4765             :                 }
    4766             :                 else {
    4767           0 :                     PyMem_RawFree(unicode );
    4768           0 :                     if (reason != NULL) {
    4769           0 :                         switch (ch) {
    4770           0 :                         case 0:
    4771           0 :                             *reason = "unexpected end of data";
    4772           0 :                             break;
    4773           0 :                         case 1:
    4774           0 :                             *reason = "invalid start byte";
    4775           0 :                             break;
    4776             :                         /* 2, 3, 4 */
    4777           0 :                         default:
    4778           0 :                             *reason = "invalid continuation byte";
    4779           0 :                             break;
    4780             :                         }
    4781           0 :                     }
    4782           0 :                     if (wlen != NULL) {
    4783           0 :                         *wlen = s - orig_s;
    4784             :                     }
    4785           0 :                     return -2;
    4786             :                 }
    4787             :             }
    4788             :         }
    4789             :     }
    4790       26367 :     unicode[outpos] = L'\0';
    4791       26367 :     if (wlen) {
    4792       26367 :         *wlen = outpos;
    4793             :     }
    4794       26367 :     *wstr = unicode;
    4795       26367 :     return 0;
    4796             : }
    4797             : 
    4798             : 
    4799             : wchar_t*
    4800        3183 : _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
    4801             :                                size_t *wlen)
    4802             : {
    4803             :     wchar_t *wstr;
    4804        3183 :     int res = _Py_DecodeUTF8Ex(arg, arglen,
    4805             :                                &wstr, wlen,
    4806             :                                NULL, _Py_ERROR_SURROGATEESCAPE);
    4807        3183 :     if (res != 0) {
    4808             :         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
    4809           0 :         assert(res != -3);
    4810           0 :         if (wlen) {
    4811           0 :             *wlen = (size_t)res;
    4812             :         }
    4813           0 :         return NULL;
    4814             :     }
    4815        3183 :     return wstr;
    4816             : }
    4817             : 
    4818             : 
    4819             : /* UTF-8 encoder using the strict, surrogateescape or surrogatepass error handler (return -3 for any other handler).
    4820             : 
    4821             :    On success, return 0 and write the newly allocated character string (use
    4822             :    PyMem_Free() to free the memory) into *str.
    4823             : 
    4824             :    On encoding failure, return -2 and write the position of the invalid
    4825             :    surrogate character into *error_pos (if error_pos is set) and the encoding
    4826             :    error message into *reason (if reason is set).
    4827             : 
    4828             :    On memory allocation failure, return -1. */
    4829             : int   /* 0: success; -1: memory error or size overflow; -2: encoding error; -3: unsupported error handler */
    4830       15271 : _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
    4831             :                  const char **reason, int raw_malloc, _Py_error_handler errors)
    4832             : {   /* Encode the NUL-terminated wide string `text` to UTF-8 into a new buffer stored in *str. */
    4833       15271 :     const Py_ssize_t max_char_size = 4;   /* the longest branch below writes 4 bytes per code point */
    4834       15271 :     Py_ssize_t len = wcslen(text);
    4835             : 
    4836       15271 :     assert(len >= 0);
    4837             : 
    4838       15271 :     int surrogateescape = 0;
    4839       15271 :     int surrogatepass = 0;
    4840       15271 :     switch (errors)   /* only strict, surrogateescape and surrogatepass are accepted */
    4841             :     {
    4842       12520 :     case _Py_ERROR_STRICT:
    4843       12520 :         break;
    4844        2751 :     case _Py_ERROR_SURROGATEESCAPE:
    4845        2751 :         surrogateescape = 1;
    4846        2751 :         break;
    4847           0 :     case _Py_ERROR_SURROGATEPASS:
    4848           0 :         surrogatepass = 1;
    4849           0 :         break;
    4850           0 :     default:
    4851           0 :         return -3;   /* any other error handler is rejected before allocating */
    4852             :     }
    4853             : 
    4854       15271 :     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {   /* keep (len + 1) * max_char_size within Py_ssize_t */
    4855           0 :         return -1;
    4856             :     }
    4857             :     char *bytes;
    4858       15271 :     if (raw_malloc) {   /* raw_malloc selects the PyMem_Raw* allocator family for every alloc/free below */
    4859       15271 :         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
    4860             :     }
    4861             :     else {
    4862           0 :         bytes = PyMem_Malloc((len + 1) * max_char_size);
    4863             :     }
    4864       15271 :     if (bytes == NULL) {
    4865           0 :         return -1;
    4866             :     }
    4867             : 
    4868       15271 :     char *p = bytes;   /* write cursor into the output buffer */
    4869             :     Py_ssize_t i;
    4870      248900 :     for (i = 0; i < len; ) {
    4871      233629 :         Py_ssize_t ch_pos = i;   /* index reported through *error_pos on failure */
    4872      233629 :         Py_UCS4 ch = text[i];
    4873      233629 :         i++;
    4874             : #if Py_UNICODE_SIZE == 2   /* in this configuration a high/low surrogate pair is merged into one code point */
    4875             :         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
    4876             :             && i < len
    4877             :             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
    4878             :         {
    4879             :             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
    4880             :             i++;
    4881             :         }
    4882             : #endif
    4883             : 
    4884      233629 :         if (ch < 0x80) {
    4885             :             /* Encode ASCII as a single byte */
    4886      233629 :             *p++ = (char) ch;
    4887             : 
    4888             :         }
    4889           0 :         else if (ch < 0x0800) {
    4890             :             /* Encode U+0080..U+07FF (Latin-1 and everything else below U+0800) as 2 bytes */
    4891           0 :             *p++ = (char)(0xc0 | (ch >> 6));
    4892           0 :             *p++ = (char)(0x80 | (ch & 0x3f));
    4893             :         }
    4894           0 :         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
    4895             :             /* surrogateescape error handler */
    4896           0 :             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {   /* fail unless escaping is on and ch is in U+DC80..U+DCFF */
    4897           0 :                 if (error_pos != NULL) {
    4898           0 :                     *error_pos = (size_t)ch_pos;
    4899             :                 }
    4900           0 :                 if (reason != NULL) {
    4901           0 :                     *reason = "encoding error";
    4902             :                 }
    4903           0 :                 if (raw_malloc) {
    4904           0 :                     PyMem_RawFree(bytes);
    4905             :                 }
    4906             :                 else {
    4907           0 :                     PyMem_Free(bytes);
    4908             :                 }
    4909           0 :                 return -2;
    4910             :             }
    4911           0 :             *p++ = (char)(ch & 0xff);   /* escaped surrogate: emit its low byte as one raw byte */
    4912             :         }
    4913           0 :         else if (ch < 0x10000) {   /* 3-byte sequence; surrogates reach here only with surrogatepass */
    4914           0 :             *p++ = (char)(0xe0 | (ch >> 12));
    4915           0 :             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
    4916           0 :             *p++ = (char)(0x80 | (ch & 0x3f));
    4917             :         }
    4918             :         else {  /* ch >= 0x10000 */
    4919           0 :             assert(ch <= MAX_UNICODE);
    4920             :             /* Encode UCS4 Unicode ordinals as 4 bytes */
    4921           0 :             *p++ = (char)(0xf0 | (ch >> 18));
    4922           0 :             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
    4923           0 :             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
    4924           0 :             *p++ = (char)(0x80 | (ch & 0x3f));
    4925             :         }
    4926             :     }
    4927       15271 :     *p++ = '\0';   /* NUL-terminate the encoded output */
    4928             : 
    4929       15271 :     size_t final_size = (p - bytes);   /* bytes written, including the terminating NUL */
    4930             :     char *bytes2;   /* separate pointer so `bytes` can still be freed if the realloc fails */
    4931       15271 :     if (raw_malloc) {
    4932       15271 :         bytes2 = PyMem_RawRealloc(bytes, final_size);
    4933             :     }
    4934             :     else {
    4935           0 :         bytes2 = PyMem_Realloc(bytes, final_size);
    4936             :     }
    4937       15271 :     if (bytes2 == NULL) {
    4938           0 :         if (error_pos != NULL) {
    4939           0 :             *error_pos = (size_t)-1;
    4940             :         }
    4941           0 :         if (raw_malloc) {
    4942           0 :             PyMem_RawFree(bytes);
    4943             :         }
    4944             :         else {
    4945           0 :             PyMem_Free(bytes);
    4946             :         }
    4947           0 :         return -1;
    4948             :     }
    4949       15271 :     *str = bytes2;   /* hand the resized buffer to the caller through *str */
    4950       15271 :     return 0;
    4951             : }
    4952             : 
    4953             : 
    4954             : /* Primary internal function which creates utf8 encoded bytes objects.
    4955             : 
    4956             :    Allocation strategy:  if the string is short, convert into a stack buffer
    4957             :    and allocate exactly as much space as needed at the end.  Else allocate the
    4958             :    maximum possible needed (4 result bytes per Unicode character), and return
    4959             :    the excess memory at the end.
    4960             : */
    4961             : static PyObject *   /* returns the result of the bytes constructor/writer, or NULL on failure */
    4962     4973320 : unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
    4963             :                     const char *errors)
    4964             : {   /* Encode `unicode` to UTF-8, dispatching on the string's storage kind. */
    4965     4973320 :     if (!PyUnicode_Check(unicode)) {
    4966           0 :         PyErr_BadArgument();
    4967           0 :         return NULL;
    4968             :     }
    4969             : 
    4970     4973320 :     if (PyUnicode_UTF8(unicode))   /* a UTF-8 representation is already stored: copy it directly */
    4971     3479190 :         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
    4972     3479190 :                                          PyUnicode_UTF8_LENGTH(unicode));
    4973             : 
    4974     1494130 :     int kind = PyUnicode_KIND(unicode);
    4975     1494130 :     const void *data = PyUnicode_DATA(unicode);
    4976     1494130 :     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    4977             : 
    4978             :     _PyBytesWriter writer;   /* NOTE(review): presumably initialised by the *_utf8_encoder helpers - confirm */
    4979             :     char *end;   /* end of the encoded data as returned by the encoder; NULL signals failure */
    4980             : 
    4981     1494130 :     switch (kind) {
    4982           0 :     default:
    4983           0 :         Py_UNREACHABLE();
    4984      183755 :     case PyUnicode_1BYTE_KIND:
    4985             :         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
    4986      183755 :         assert(!PyUnicode_IS_ASCII(unicode));
    4987      183755 :         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
    4988      183755 :         break;
    4989      246909 :     case PyUnicode_2BYTE_KIND:
    4990      246909 :         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
    4991      246909 :         break;
    4992     1063460 :     case PyUnicode_4BYTE_KIND:
    4993     1063460 :         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
    4994     1063460 :         break;
    4995             :     }
    4996             : 
    4997     1494130 :     if (end == NULL) {   /* encoder failed: release the writer and propagate NULL */
    4998         312 :         _PyBytesWriter_Dealloc(&writer);
    4999         312 :         return NULL;
    5000             :     }
    5001     1493820 :     return _PyBytesWriter_Finish(&writer, end);
    5002             : }
    5003             : 
    5004             : static int   /* 0 on success, -1 on encoding or allocation failure */
    5005        4540 : unicode_fill_utf8(PyObject *unicode)
    5006             : {   /* Encode `unicode` strictly to UTF-8 and store a NUL-terminated copy in the object's UTF-8 fields. */
    5007             :     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
    5008        4540 :     assert(!PyUnicode_IS_ASCII(unicode));
    5009             : 
    5010        4540 :     int kind = PyUnicode_KIND(unicode);
    5011        4540 :     const void *data = PyUnicode_DATA(unicode);
    5012        4540 :     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    5013             : 
    5014             :     _PyBytesWriter writer;   /* NOTE(review): presumably initialised by the *_utf8_encoder helpers - confirm */
    5015             :     char *end;   /* end of the encoded data; NULL signals failure */
    5016             : 
    5017        4540 :     switch (kind) {   /* always strict, with no error-handler name */
    5018           0 :     default:
    5019           0 :         Py_UNREACHABLE();
    5020        3467 :     case PyUnicode_1BYTE_KIND:
    5021        3467 :         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
    5022             :                                    _Py_ERROR_STRICT, NULL);
    5023        3467 :         break;
    5024         960 :     case PyUnicode_2BYTE_KIND:
    5025         960 :         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
    5026             :                                    _Py_ERROR_STRICT, NULL);
    5027         960 :         break;
    5028         113 :     case PyUnicode_4BYTE_KIND:
    5029         113 :         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
    5030             :                                    _Py_ERROR_STRICT, NULL);
    5031         113 :         break;
    5032             :     }
    5033        4540 :     if (end == NULL) {
    5034          70 :         _PyBytesWriter_Dealloc(&writer);
    5035          70 :         return -1;
    5036             :     }
    5037             : 
    5038        4470 :     const char *start = writer.use_small_buffer ? writer.small_buffer :
    5039        2814 :                     PyBytes_AS_STRING(writer.buffer);   /* output starts in the writer's small buffer or in its bytes object */
    5040        4470 :     Py_ssize_t len = end - start;   /* number of encoded bytes */
    5041             : 
    5042        4470 :     char *cache = PyObject_Malloc(len + 1);   /* +1 for the terminating NUL */
    5043        4470 :     if (cache == NULL) {
    5044           0 :         _PyBytesWriter_Dealloc(&writer);
    5045           0 :         PyErr_NoMemory();
    5046           0 :         return -1;
    5047             :     }
    5048        4470 :     _PyUnicode_UTF8(unicode) = cache;
    5049        4470 :     _PyUnicode_UTF8_LENGTH(unicode) = len;
    5050        4470 :     memcpy(cache, start, len);
    5051        4470 :     cache[len] = '\0';
    5052        4470 :     _PyBytesWriter_Dealloc(&writer);   /* the data has been copied out, so the writer can be released */
    5053        4470 :     return 0;
    5054             : }
    5055             : 
    5056             : PyObject *
    5057     2955660 : _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
    5058             : {   /* UTF-8 encode `unicode`, forwarding the error-handler name `errors` unchanged. */
    5059     2955660 :     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);   /* NOTE(review): handler enum presumably resolved from `errors` downstream - confirm */
    5060             : }
    5061             : 
    5062             : 
    5063             : PyObject *
    5064       42243 : PyUnicode_AsUTF8String(PyObject *unicode)
    5065             : {   /* UTF-8 encode `unicode` with no error-handler name (errors == NULL). */
    5066       42243 :     return _PyUnicode_AsUTF8String(unicode, NULL);
    5067             : }
    5068             : 
    5069             : /* --- UTF-32 Codec ------------------------------------------------------- */
    5070             : 
    5071             : PyObject *
    5072          34 : PyUnicode_DecodeUTF32(const char *s,
    5073             :                       Py_ssize_t size,
    5074             :                       const char *errors,
    5075             :                       int *byteorder)
    5076             : {   /* Decode UTF-32 via the stateful decoder with consumed == NULL. */
    5077          34 :     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
    5078             : }
    5079             : 
    5080             : PyObject *   /* the decoded string from the writer, or NULL on error */
    5081       57996 : PyUnicode_DecodeUTF32Stateful(const char *s,
    5082             :                               Py_ssize_t size,
    5083             :                               const char *errors,
    5084             :                               int *byteorder,
    5085             :                               Py_ssize_t *consumed)
    5086             : {   /* Decode `size` bytes of UTF-32 from `s`; *byteorder (if given) is read and may be updated, *consumed (if given) receives the bytes used. */
    5087       57996 :     const char *starts = s;
    5088             :     Py_ssize_t startinpos;
    5089             :     Py_ssize_t endinpos;
    5090             :     _PyUnicodeWriter writer;
    5091             :     const unsigned char *q, *e;   /* q: read cursor, e: end of input */
    5092       57996 :     int le, bo = 0;       /* assume native ordering by default */
    5093             :     const char *encoding;
    5094       57996 :     const char *errmsg = "";
    5095       57996 :     PyObject *errorHandler = NULL;
    5096       57996 :     PyObject *exc = NULL;
    5097             : 
    5098       57996 :     q = (const unsigned char *)s;
    5099       57996 :     e = q + size;
    5100             : 
    5101       57996 :     if (byteorder)
    5102       57963 :         bo = *byteorder;
    5103             : 
    5104             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    5105             :        byte order setting accordingly. In native mode, the leading BOM
    5106             :        mark is skipped, in all other modes, it is copied to the output
    5107             :        stream as-is (giving a ZWNBSP character). */
    5108       57996 :     if (bo == 0 && size >= 4) {
    5109         977 :         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];   /* first 4 bytes read as little-endian */
    5110         977 :         if (bom == 0x0000FEFF) {
    5111         935 :             bo = -1;   /* little-endian BOM */
    5112         935 :             q += 4;
    5113             :         }
    5114          42 :         else if (bom == 0xFFFE0000) {
    5115          40 :             bo = 1;   /* byte-swapped BOM: big-endian input */
    5116          40 :             q += 4;
    5117             :         }
    5118         977 :         if (byteorder)
    5119         947 :             *byteorder = bo;
    5120             :     }
    5121             : 
    5122       57996 :     if (q == e) {   /* nothing left after the optional BOM */
    5123        1142 :         if (consumed)
    5124          51 :             *consumed = size;
    5125        1142 :         _Py_RETURN_UNICODE_EMPTY();
    5126             :     }
    5127             : 
    5128             : #ifdef WORDS_BIGENDIAN
    5129             :     le = bo < 0;
    5130             : #else
    5131       56854 :     le = bo <= 0;   /* bo == 0 (native) counts as little-endian in this branch */
    5132             : #endif
    5133       56854 :     encoding = le ? "utf-32-le" : "utf-32-be";
    5134             : 
    5135       56854 :     _PyUnicodeWriter_Init(&writer);
    5136       56854 :     writer.min_length = (e - q + 3) / 4;   /* remaining bytes in 4-byte units, rounded up */
    5137       56854 :     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
    5138           0 :         goto onError;
    5139             : 
    5140        4955 :     while (1) {
    5141       61809 :         Py_UCS4 ch = 0;
    5142       61809 :         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);   /* re-read every pass; presumably the writer's buffer can widen between passes - confirm */
    5143             : 
    5144       61809 :         if (e - q >= 4) {
    5145       57042 :             int kind = writer.kind;
    5146       57042 :             void *data = writer.data;
    5147       57042 :             const unsigned char *last = e - 4;   /* last position where a full 4-byte unit can start */
    5148       57042 :             Py_ssize_t pos = writer.pos;
    5149       57042 :             if (le) {
    5150             :                 do {   /* fast path: store directly while ch fits the current buffer and is not a surrogate */
    5151      600426 :                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
    5152      600426 :                     if (ch > maxch)
    5153        3265 :                         break;
    5154      634421 :                     if (kind != PyUnicode_1BYTE_KIND &&
    5155       37260 :                         Py_UNICODE_IS_SURROGATE(ch))
    5156          10 :                         break;
    5157      597151 :                     PyUnicode_WRITE(kind, data, pos++, ch);
    5158      597151 :                     q += 4;
    5159      597151 :                 } while (q <= last);
    5160             :             }
    5161             :             else {
    5162             :                 do {   /* same fast path with big-endian byte assembly */
    5163      301597 :                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
    5164      301597 :                     if (ch > maxch)
    5165        1682 :                         break;
    5166      319891 :                     if (kind != PyUnicode_1BYTE_KIND &&
    5167       19976 :                         Py_UNICODE_IS_SURROGATE(ch))
    5168           5 :                         break;
    5169      299910 :                     PyUnicode_WRITE(kind, data, pos++, ch);
    5170      299910 :                     q += 4;
    5171      299910 :                 } while (q <= last);
    5172             :             }
    5173       57042 :             writer.pos = pos;   /* publish the local write position back to the writer */
    5174             :         }
    5175             : 
    5176       61809 :         if (Py_UNICODE_IS_SURROGATE(ch)) {
    5177         122 :             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
    5178         122 :             startinpos = ((const char *)q) - starts;
    5179         122 :             endinpos = startinpos + 4;
    5180             :         }
    5181       61687 :         else if (ch <= maxch) {   /* the fast loop ran out of full units (or never ran) */
    5182       56847 :             if (q == e || consumed)   /* all input used, or stateful caller: stop and leave trailing bytes unconsumed */
    5183             :                 break;
    5184             :             /* remaining bytes at the end? (size should be divisible by 4) */
    5185           8 :             errmsg = "truncated data";
    5186           8 :             startinpos = ((const char *)q) - starts;
    5187           8 :             endinpos = ((const char *)e) - starts;
    5188             :         }
    5189             :         else {
    5190        4840 :             if (ch < 0x110000) {   /* too large for the fast loop's maxch but below 0x110000: write through the writer */
    5191        4832 :                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
    5192           0 :                     goto onError;
    5193        4832 :                 q += 4;
    5194        4832 :                 continue;
    5195             :             }
    5196           8 :             errmsg = "code point not in range(0x110000)";
    5197           8 :             startinpos = ((const char *)q) - starts;
    5198           8 :             endinpos = startinpos + 4;
    5199             :         }
    5200             : 
    5201             :         /* The remaining input chars are ignored if the callback
    5202             :            chooses to skip the input */
    5203         138 :         if (unicode_decode_call_errorhandler_writer(
    5204             :                 errors, &errorHandler,
    5205             :                 encoding, errmsg,
    5206             :                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
    5207             :                 &writer))
    5208          15 :             goto onError;
    5209             :     }
    5210             : 
    5211       56839 :     if (consumed)
    5212       56657 :         *consumed = (const char *)q-starts;   /* bytes consumed, measured from the start of the input */
    5213             : 
    5214       56839 :     Py_XDECREF(errorHandler);
    5215       56839 :     Py_XDECREF(exc);
    5216       56839 :     return _PyUnicodeWriter_Finish(&writer);
    5217             : 
    5218          15 :   onError:   /* release the writer and any error-handling objects, then fail */
    5219          15 :     _PyUnicodeWriter_Dealloc(&writer);
    5220          15 :     Py_XDECREF(errorHandler);
    5221          15 :     Py_XDECREF(exc);
    5222          15 :     return NULL;
    5223             : }
    5224             : 
    5225             : PyObject *   /* the encoded bytes object, or NULL on error */
    5226        2457 : _PyUnicode_EncodeUTF32(PyObject *str,
    5227             :                        const char *errors,
    5228             :                        int byteorder)
    5229             : {   /* Encode `str` as UTF-32; byteorder -1 selects "utf-32-le", 1 "utf-32-be", 0 writes a BOM first. */
    5230             :     int kind;
    5231             :     const void *data;
    5232             :     Py_ssize_t len;
    5233             :     PyObject *v;   /* output bytes object */
    5234             :     uint32_t *out;   /* write cursor into v, in 32-bit units */
    5235             : #if PY_LITTLE_ENDIAN
    5236        2457 :     int native_ordering = byteorder <= 0;
    5237             : #else
    5238             :     int native_ordering = byteorder >= 0;
    5239             : #endif
    5240             :     const char *encoding;
    5241             :     Py_ssize_t nsize, pos;
    5242        2457 :     PyObject *errorHandler = NULL;
    5243        2457 :     PyObject *exc = NULL;
    5244        2457 :     PyObject *rep = NULL;
    5245             : 
    5246        2457 :     if (!PyUnicode_Check(str)) {
    5247           0 :         PyErr_BadArgument();
    5248           0 :         return NULL;
    5249             :     }
    5250        2457 :     kind = PyUnicode_KIND(str);
    5251        2457 :     data = PyUnicode_DATA(str);
    5252        2457 :     len = PyUnicode_GET_LENGTH(str);
    5253             : 
    5254        2457 :     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))   /* keep (len + BOM unit) * 4 within Py_ssize_t */
    5255           0 :         return PyErr_NoMemory();
    5256        2457 :     nsize = len + (byteorder == 0);   /* one extra unit for the BOM when byteorder is 0 */
    5257        2457 :     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    5258        2457 :     if (v == NULL)
    5259           0 :         return NULL;
    5260             : 
    5261             :     /* output buffer is 4-byte aligned */
    5262        2457 :     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    5263        2457 :     out = (uint32_t *)PyBytes_AS_STRING(v);
    5264        2457 :     if (byteorder == 0)
    5265         806 :         *out++ = 0xFEFF;   /* BOM stored as a native 32-bit value */
    5266        2457 :     if (len == 0)
    5267           5 :         goto done;   /* empty input: only the optional BOM is returned */
    5268             : 
    5269        2452 :     if (byteorder == -1)
    5270         868 :         encoding = "utf-32-le";
    5271        1584 :     else if (byteorder == 1)
    5272         779 :         encoding = "utf-32-be";
    5273             :     else
    5274         805 :         encoding = "utf-32";
    5275             : 
    5276        2452 :     if (kind == PyUnicode_1BYTE_KIND) {   /* 1-byte data is encoded in a single call with no error-handler path */
    5277        1816 :         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
    5278        1816 :         goto done;
    5279             :     }
    5280             : 
    5281         636 :     pos = 0;
    5282         731 :     while (pos < len) {
    5283             :         Py_ssize_t newpos, repsize, moreunits;
    5284             : 
    5285         724 :         if (kind == PyUnicode_2BYTE_KIND) {   /* the helper's return value advances pos */
    5286         690 :             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
    5287             :                                         &out, native_ordering);
    5288             :         }
    5289             :         else {
    5290          34 :             assert(kind == PyUnicode_4BYTE_KIND);
    5291          34 :             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
    5292             :                                         &out, native_ordering);
    5293             :         }
    5294         724 :         if (pos == len)
    5295         615 :             break;
    5296             : 
    5297         109 :         rep = unicode_encode_call_errorhandler(   /* encoder stopped early: report the character at pos */
    5298             :                 errors, &errorHandler,
    5299             :                 encoding, "surrogates not allowed",
    5300             :                 str, &exc, pos, pos + 1, &newpos);
    5301         109 :         if (!rep)
    5302          14 :             goto error;
    5303             : 
    5304         106 :         if (PyBytes_Check(rep)) {
    5305          25 :             repsize = PyBytes_GET_SIZE(rep);
    5306          25 :             if (repsize & 3) {   /* a bytes replacement must be a whole number of 4-byte units */
    5307           8 :                 raise_encode_exception(&exc, encoding,
    5308             :                                        str, pos, pos + 1,
    5309             :                                        "surrogates not allowed");
    5310           8 :                 goto error;
    5311             :             }
    5312          17 :             moreunits = repsize / 4;
    5313             :         }
    5314             :         else {
    5315          81 :             assert(PyUnicode_Check(rep));
    5316          81 :             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
    5317          81 :             if (!PyUnicode_IS_ASCII(rep)) {   /* a str replacement is accepted only if it is ASCII */
    5318           3 :                 raise_encode_exception(&exc, encoding,
    5319             :                                        str, pos, pos + 1,
    5320             :                                        "surrogates not allowed");
    5321           3 :                 goto error;
    5322             :             }
    5323             :         }
    5324          95 :         moreunits += pos - newpos;   /* discount the units already reserved for the input the handler skipped */
    5325          95 :         pos = newpos;
    5326             : 
    5327             :         /* four bytes are reserved for each surrogate */
    5328          95 :         if (moreunits > 0) {
    5329          59 :             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);   /* remember the offset: v is re-read after the resize */
    5330          59 :             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
    5331             :                 /* integer overflow */
    5332           0 :                 PyErr_NoMemory();
    5333           0 :                 goto error;
    5334             :             }
    5335          59 :             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
    5336           0 :                 goto error;
    5337          59 :             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
    5338             :         }
    5339             : 
    5340          95 :         if (PyBytes_Check(rep)) {   /* bytes replacement is copied verbatim */
    5341          17 :             memcpy(out, PyBytes_AS_STRING(rep), repsize);
    5342          17 :             out += repsize / 4;
    5343             :         } else /* rep is unicode */ {
    5344          78 :             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
    5345          78 :             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
    5346             :                                  &out, native_ordering);
    5347             :         }
    5348             : 
    5349          95 :         Py_CLEAR(rep);
    5350             :     }
    5351             : 
    5352             :     /* Cut back to size actually needed. This is necessary for, for example,
    5353             :        encoding of a string containing isolated surrogates and the 'ignore'
    5354             :        handler is used. */
    5355         622 :     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);   /* bytes actually written */
    5356         622 :     if (nsize != PyBytes_GET_SIZE(v))
    5357           6 :       _PyBytes_Resize(&v, nsize);   /* NOTE(review): result not checked; presumably v is NULL on failure and is returned as such - confirm */
    5358         622 :     Py_XDECREF(errorHandler);
    5359         622 :     Py_XDECREF(exc);
    5360        2443 :   done:   /* both `goto done` sites run before any error-handler call, so errorHandler and exc are still NULL there */
    5361        2443 :     return v;
    5362          14 :   error:
    5363          14 :     Py_XDECREF(rep);
    5364          14 :     Py_XDECREF(errorHandler);
    5365          14 :     Py_XDECREF(exc);
    5366          14 :     Py_XDECREF(v);
    5367          14 :     return NULL;
    5368             : }
    5369             : 
    5370             : PyObject *
    5371           0 : PyUnicode_AsUTF32String(PyObject *unicode)
    5372             : {   /* UTF-32 encode `unicode` with errors == NULL and byteorder 0 (a BOM is written first). */
    5373           0 :     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
    5374             : }
    5375             : 
    5376             : /* --- UTF-16 Codec ------------------------------------------------------- */
    5377             : 
    5378             : PyObject *
    5379        1065 : PyUnicode_DecodeUTF16(const char *s,
    5380             :                       Py_ssize_t size,
    5381             :                       const char *errors,
    5382             :                       int *byteorder)
    5383             : {   /* Decode UTF-16 via the stateful decoder with consumed == NULL. */
    5384        1065 :     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
    5385             : }
    5386             : 
    5387             : PyObject *
    5388       33856 : PyUnicode_DecodeUTF16Stateful(const char *s,
    5389             :                               Py_ssize_t size,
    5390             :                               const char *errors,
    5391             :                               int *byteorder,
    5392             :                               Py_ssize_t *consumed)
    5393             : {
    5394       33856 :     const char *starts = s;
    5395             :     Py_ssize_t startinpos;
    5396             :     Py_ssize_t endinpos;
    5397             :     _PyUnicodeWriter writer;
    5398             :     const unsigned char *q, *e;
    5399       33856 :     int bo = 0;       /* assume native ordering by default */
    5400             :     int native_ordering;
    5401       33856 :     const char *errmsg = "";
    5402       33856 :     PyObject *errorHandler = NULL;
    5403       33856 :     PyObject *exc = NULL;
    5404             :     const char *encoding;
    5405             : 
    5406       33856 :     q = (const unsigned char *)s;
    5407       33856 :     e = q + size;
    5408             : 
    5409       33856 :     if (byteorder)
    5410       32793 :         bo = *byteorder;
    5411             : 
    5412             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    5413             :        byte order setting accordingly. In native mode, the leading BOM
    5414             :        mark is skipped, in all other modes, it is copied to the output
    5415             :        stream as-is (giving a ZWNBSP character). */
    5416       33856 :     if (bo == 0 && size >= 2) {
    5417        2000 :         const Py_UCS4 bom = (q[1] << 8) | q[0];
    5418        2000 :         if (bom == 0xFEFF) {
    5419        1977 :             q += 2;
    5420        1977 :             bo = -1;
    5421             :         }
    5422          23 :         else if (bom == 0xFFFE) {
    5423          21 :             q += 2;
    5424          21 :             bo = 1;
    5425             :         }
    5426        2000 :         if (byteorder)
    5427         940 :             *byteorder = bo;
    5428             :     }
    5429             : 
    5430       33856 :     if (q == e) {
    5431        1190 :         if (consumed)
    5432          72 :             *consumed = size;
    5433        1190 :         _Py_RETURN_UNICODE_EMPTY();
    5434             :     }
    5435             : 
    5436             : #if PY_LITTLE_ENDIAN
    5437       32666 :     native_ordering = bo <= 0;
    5438       32666 :     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
    5439             : #else
    5440             :     native_ordering = bo >= 0;
    5441             :     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
    5442             : #endif
    5443             : 
    5444             :     /* Note: size will always be longer than the resulting Unicode
    5445             :        character count normally.  Error handler will take care of
    5446             :        resizing when needed. */
    5447       32666 :     _PyUnicodeWriter_Init(&writer);
    5448       32666 :     writer.min_length = (e - q + 1) / 2;
    5449       32666 :     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
    5450           0 :         goto onError;
    5451             : 
    5452        7106 :     while (1) {
    5453       39772 :         Py_UCS4 ch = 0;
    5454       39772 :         if (e - q >= 2) {
    5455       35617 :             int kind = writer.kind;
    5456       35617 :             if (kind == PyUnicode_1BYTE_KIND) {
    5457       31487 :                 if (PyUnicode_IS_ASCII(writer.buffer))
    5458       31469 :                     ch = asciilib_utf16_decode(&q, e,
    5459       31469 :                             (Py_UCS1*)writer.data, &writer.pos,
    5460             :                             native_ordering);
    5461             :                 else
    5462          18 :                     ch = ucs1lib_utf16_decode(&q, e,
    5463          18 :                             (Py_UCS1*)writer.data, &writer.pos,
    5464             :                             native_ordering);
    5465        4130 :             } else if (kind == PyUnicode_2BYTE_KIND) {
    5466        4092 :                 ch = ucs2lib_utf16_decode(&q, e,
    5467        4092 :                         (Py_UCS2*)writer.data, &writer.pos,
    5468             :                         native_ordering);
    5469             :             } else {
    5470          38 :                 assert(kind == PyUnicode_4BYTE_KIND);
    5471          38 :                 ch = ucs4lib_utf16_decode(&q, e,
    5472          38 :                         (Py_UCS4*)writer.data, &writer.pos,
    5473             :                         native_ordering);
    5474             :             }
    5475             :         }
    5476             : 
    5477       39772 :         switch (ch)
    5478             :         {
    5479       32628 :         case 0:
    5480             :             /* remaining byte at the end? (size should be even) */
    5481       32628 :             if (q == e || consumed)
    5482       32609 :                 goto End;
    5483          19 :             errmsg = "truncated data";
    5484          19 :             startinpos = ((const char *)q) - starts;
    5485          19 :             endinpos = ((const char *)e) - starts;
    5486          19 :             break;
    5487             :             /* The remaining input chars are ignored if the callback
    5488             :                chooses to skip the input */
    5489          44 :         case 1:
    5490          44 :             q -= 2;
    5491          44 :             if (consumed)
    5492          30 :                 goto End;
    5493          14 :             errmsg = "unexpected end of data";
    5494          14 :             startinpos = ((const char *)q) - starts;
    5495          14 :             endinpos = ((const char *)e) - starts;
    5496          14 :             break;
    5497         105 :         case 2:
    5498         105 :             errmsg = "illegal encoding";
    5499         105 :             startinpos = ((const char *)q) - 2 - starts;
    5500         105 :             endinpos = startinpos + 2;
    5501         105 :             break;
    5502          12 :         case 3:
    5503          12 :             errmsg = "illegal UTF-16 surrogate";
    5504          12 :             startinpos = ((const char *)q) - 4 - starts;
    5505          12 :             endinpos = startinpos + 2;
    5506          12 :             break;
    5507        6983 :         default:
    5508        6983 :             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
    5509           0 :                 goto onError;
    5510        6983 :             continue;
    5511             :         }
    5512             : 
    5513         150 :         if (unicode_decode_call_errorhandler_writer(
    5514             :                 errors,
    5515             :                 &errorHandler,
    5516             :                 encoding, errmsg,
    5517             :                 &starts,
    5518             :                 (const char **)&e,
    5519             :                 &startinpos,
    5520             :                 &endinpos,
    5521             :                 &exc,
    5522             :                 (const char **)&q,
    5523             :                 &writer))
    5524          27 :             goto onError;
    5525             :     }
    5526             : 
    5527       32639 : End:
    5528       32639 :     if (consumed)
    5529       29336 :         *consumed = (const char *)q-starts;
    5530             : 
    5531       32639 :     Py_XDECREF(errorHandler);
    5532       32639 :     Py_XDECREF(exc);
    5533       32639 :     return _PyUnicodeWriter_Finish(&writer);
    5534             : 
    5535          27 :   onError:
    5536          27 :     _PyUnicodeWriter_Dealloc(&writer);
    5537          27 :     Py_XDECREF(errorHandler);
    5538          27 :     Py_XDECREF(exc);
    5539          27 :     return NULL;
    5540             : }
    5541             : /* Encode str as UTF-16 and return a new bytes object, or NULL on failure. */
    5542             : PyObject *
    5543        5836 : _PyUnicode_EncodeUTF16(PyObject *str,  /* object to encode; must pass PyUnicode_Check() */
    5544             :                        const char *errors,  /* forwarded to unicode_encode_call_errorhandler() */
    5545             :                        int byteorder)  /* <0: little-endian, >0: big-endian, 0: native order plus a leading BOM */
    5546             : {
    5547             :     int kind;
    5548             :     const void *data;
    5549             :     Py_ssize_t len;
    5550             :     PyObject *v;
    5551             :     unsigned short *out;  /* write cursor into v, in 16-bit units */
    5552             :     Py_ssize_t pairs;  /* count of code points >= 0x10000 (one extra 16-bit unit each) */
    5553             : #if PY_BIG_ENDIAN
    5554             :     int native_ordering = byteorder >= 0;
    5555             : #else
    5556        5836 :     int native_ordering = byteorder <= 0;
    5557             : #endif
    5558             :     const char *encoding;  /* codec name passed to the error handler; assigned only after the 1-byte-kind path */
    5559             :     Py_ssize_t nsize, pos;
    5560        5836 :     PyObject *errorHandler = NULL;
    5561        5836 :     PyObject *exc = NULL;
    5562        5836 :     PyObject *rep = NULL;
    5563             : 
    5564        5836 :     if (!PyUnicode_Check(str)) {
    5565           0 :         PyErr_BadArgument();
    5566           0 :         return NULL;
    5567             :     }
    5568        5836 :     kind = PyUnicode_KIND(str);
    5569        5836 :     data = PyUnicode_DATA(str);
    5570        5836 :     len = PyUnicode_GET_LENGTH(str);
    5571             : 
    5572        5836 :     pairs = 0;  /* stays 0 unless the string is 4-byte kind */
    5573        5836 :     if (kind == PyUnicode_4BYTE_KIND) {
    5574          56 :         const Py_UCS4 *in = (const Py_UCS4 *)data;
    5575          56 :         const Py_UCS4 *end = in + len;
    5576        1454 :         while (in < end) {
    5577        1398 :             if (*in++ >= 0x10000) {
    5578          94 :                 pairs++;
    5579             :             }
    5580             :         }
    5581             :     }
    5582        5836 :     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {  /* (len + pairs + BOM) * 2 must fit in Py_ssize_t */
    5583           0 :         return PyErr_NoMemory();
    5584             :     }
    5585        5836 :     nsize = len + pairs + (byteorder == 0);  /* size in 16-bit units, BOM included when byteorder == 0 */
    5586        5836 :     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    5587        5836 :     if (v == NULL) {
    5588           0 :         return NULL;
    5589             :     }
    5590             : 
    5591             :     /* output buffer is 2-bytes aligned */
    5592        5836 :     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    5593        5836 :     out = (unsigned short *)PyBytes_AS_STRING(v);
    5594        5836 :     if (byteorder == 0) {
    5595        1887 :         *out++ = 0xFEFF;  /* BOM, stored as a native unsigned short */
    5596             :     }
    5597        5836 :     if (len == 0) {
    5598          23 :         goto done;
    5599             :     }
    5600             : 
    5601        5813 :     if (kind == PyUnicode_1BYTE_KIND) {  /* encoded in a single call; the error-handler loop below is bypassed */
    5602        2834 :         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
    5603        2834 :         goto done;
    5604             :     }
    5605             : 
    5606        2979 :     if (byteorder < 0) {
    5607         996 :         encoding = "utf-16-le";
    5608             :     }
    5609        1983 :     else if (byteorder > 0) {
    5610         988 :         encoding = "utf-16-be";
    5611             :     }
    5612             :     else {
    5613         995 :         encoding = "utf-16";
    5614             :     }
    5615             : 
    5616        2979 :     pos = 0;
    5617        3074 :     while (pos < len) {  /* each pass: encode as far as the helper gets, then handle one error */
    5618             :         Py_ssize_t newpos, repsize, moreunits;
    5619             : 
    5620        3067 :         if (kind == PyUnicode_2BYTE_KIND) {
    5621        3005 :             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
    5622             :                                         &out, native_ordering);  /* the helper return value advances pos */
    5623             :         }
    5624             :         else {
    5625          62 :             assert(kind == PyUnicode_4BYTE_KIND);
    5626          62 :             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
    5627             :                                         &out, native_ordering);
    5628             :         }
    5629        3067 :         if (pos == len)  /* whole input encoded */
    5630        2960 :             break;
    5631             : 
    5632         107 :         rep = unicode_encode_call_errorhandler(  /* report the single position [pos, pos + 1) */
    5633             :                 errors, &errorHandler,
    5634             :                 encoding, "surrogates not allowed",
    5635             :                 str, &exc, pos, pos + 1, &newpos);
    5636         107 :         if (!rep)
    5637          12 :             goto error;
    5638             : 
    5639         102 :         if (PyBytes_Check(rep)) {
    5640          21 :             repsize = PyBytes_GET_SIZE(rep);
    5641          21 :             if (repsize & 1) {  /* a bytes replacement must be a whole number of 16-bit units */
    5642           4 :                 raise_encode_exception(&exc, encoding,
    5643             :                                        str, pos, pos + 1,
    5644             :                                        "surrogates not allowed");
    5645           4 :                 goto error;
    5646             :             }
    5647          17 :             moreunits = repsize / 2;
    5648             :         }
    5649             :         else {
    5650          81 :             assert(PyUnicode_Check(rep));
    5651          81 :             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
    5652          81 :             if (!PyUnicode_IS_ASCII(rep)) {  /* a str replacement must be pure ASCII */
    5653           3 :                 raise_encode_exception(&exc, encoding,
    5654             :                                        str, pos, pos + 1,
    5655             :                                        "surrogates not allowed");
    5656           3 :                 goto error;
    5657             :             }
    5658             :         }
    5659          95 :         moreunits += pos - newpos;  /* discount the newpos - pos input positions the handler moved past */
    5660          95 :         pos = newpos;
    5661             : 
    5662             :         /* two bytes are reserved for each surrogate */
    5663          95 :         if (moreunits > 0) {
    5664          59 :             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);  /* saved so out can be rebuilt after the resize */
    5665          59 :             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
    5666             :                 /* integer overflow */
    5667           0 :                 PyErr_NoMemory();
    5668           0 :                 goto error;
    5669             :             }
    5670          59 :             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
    5671           0 :                 goto error;
    5672          59 :             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
    5673             :         }
    5674             : 
    5675          95 :         if (PyBytes_Check(rep)) {
    5676          17 :             memcpy(out, PyBytes_AS_STRING(rep), repsize);  /* copied verbatim, no byte-order conversion */
    5677          17 :             out += repsize / 2;
    5678             :         } else /* rep is unicode */ {
    5679          78 :             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
    5680          78 :             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
    5681             :                                  &out, native_ordering);
    5682             :         }
    5683             : 
    5684          95 :         Py_CLEAR(rep);
    5685             :     }
    5686             : 
    5687             :     /* Cut back to the size actually needed. This is necessary when, for example,
    5688             :     a string containing isolated surrogates is encoded and the 'ignore' handler
    5689             :     is used. */
    5690        2967 :     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);  /* bytes actually written */
    5691        2967 :     if (nsize != PyBytes_GET_SIZE(v))
    5692           6 :       _PyBytes_Resize(&v, nsize);  /* NOTE(review): result unchecked; presumably v becomes NULL on failure - confirm */
    5693        2967 :     Py_XDECREF(errorHandler);
    5694        2967 :     Py_XDECREF(exc);
    5695        5824 :   done:  /* also reached directly from the empty and 1-byte-kind paths, where errorHandler and exc are still NULL */
    5696        5824 :     return v;
    5697          12 :   error:
    5698          12 :     Py_XDECREF(rep);
    5699          12 :     Py_XDECREF(errorHandler);
    5700          12 :     Py_XDECREF(exc);
    5701          12 :     Py_XDECREF(v);
    5702          12 :     return NULL;
    5703             : #undef STORECHAR  /* NOTE(review): no STORECHAR macro is defined in this function; looks like a leftover - confirm */
    5704             : }
    5705             : /* Encode unicode via _PyUnicode_EncodeUTF16() with errors == NULL and byteorder == 0. */
    5706             : PyObject *
    5707           0 : PyUnicode_AsUTF16String(PyObject *unicode)
    5708             : {
    5709           0 :     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);  /* result (possibly NULL) is returned unchanged */
    5710             : }
    5711             : 
    5712             : /* --- Unicode Escape Codec ----------------------------------------------- */
    5713             : 
    5714             : static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;  /* filled in on first \N{...} escape via PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) */
    5715             : /* Decode the backslash escapes in s[0:size] into a new str object; returns NULL on failure. */
    5716             : PyObject *
    5717      246315 : _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,  /* input bytes */
    5718             :                                Py_ssize_t size,  /* number of bytes at s */
    5719             :                                const char *errors,  /* forwarded to unicode_decode_call_errorhandler_writer() */
    5720             :                                Py_ssize_t *consumed,  /* may be NULL; written only for empty input or when input ends inside an escape */
    5721             :                                const char **first_invalid_escape)  /* out, dereferenced unconditionally: NULL, or where the first invalid escape was seen */
    5722             : {
    5723      246315 :     const char *starts = s;
    5724             :     _PyUnicodeWriter writer;
    5725             :     const char *end;
    5726      246315 :     PyObject *errorHandler = NULL;
    5727      246315 :     PyObject *exc = NULL;
    5728             : 
    5729             :     // so we can remember if we've seen an invalid escape char or not
    5730      246315 :     *first_invalid_escape = NULL;
    5731             : 
    5732      246315 :     if (size == 0) {
    5733          17 :         if (consumed) {
    5734           8 :             *consumed = 0;
    5735             :         }
    5736          17 :         _Py_RETURN_UNICODE_EMPTY();
    5737             :     }
    5738             :     /* An escaped string is never shorter than the resulting
    5739             :        Unicode string, so we start with size here and then reduce the
    5740             :        length after conversion to the true value.
    5741             :        (but if the error callback returns a long replacement string
    5742             :        we'll have to allocate more space) */
    5743      246298 :     _PyUnicodeWriter_Init(&writer);
    5744      246298 :     writer.min_length = size;
    5745      246298 :     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
    5746           0 :         goto onError;
    5747             :     }
    5748             : 
    5749      246298 :     end = s + size;
    5750     4176480 :     while (s < end) {
    5751     3931180 :         unsigned char c = (unsigned char) *s++;
    5752             :         Py_UCS4 ch;
    5753             :         int count;  /* hex digits still to read in the hexescape loop */
    5754             :         const char *message;  /* reason text handed to the error handler */
    5755             : 
    5756             : #define WRITE_ASCII_CHAR(ch)                                                  \
    5757             :             do {                                                              \
    5758             :                 assert(ch <= 127);                                            \
    5759             :                 assert(writer.pos < writer.size);                             \
    5760             :                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
    5761             :             } while(0)
    5762             : 
    5763             : #define WRITE_CHAR(ch)                                                        \
    5764             :             do {                                                              \
    5765             :                 if (ch <= writer.maxchar) {                                   \
    5766             :                     assert(writer.pos < writer.size);                         \
    5767             :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
    5768             :                 }                                                             \
    5769             :                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
    5770             :                     goto onError;                                             \
    5771             :                 }                                                             \
    5772             :             } while(0)
    5773             : 
    5774             :         /* Non-escape characters are interpreted as Unicode ordinals */
    5775     3931180 :         if (c != '\\') {
    5776     3644980 :             WRITE_CHAR(c);
    5777     3930140 :             continue;
    5778             :         }
    5779             : 
    5780      286733 :         Py_ssize_t startinpos = s - starts - 1;  /* offset of the backslash itself */
    5781             :         /* \ - Escapes */
    5782      286733 :         if (s >= end) {
    5783         226 :             message = "\\ at end of string";
    5784         226 :             goto incomplete;
    5785             :         }
    5786      286507 :         c = (unsigned char) *s++;
    5787             : 
    5788      286507 :         assert(writer.pos < writer.size);
    5789      286507 :         switch (c) {
    5790             : 
    5791             :             /* single-character escapes; backslash-newline emits nothing */
    5792        3080 :         case '\n': continue;
    5793       24078 :         case '\\': WRITE_ASCII_CHAR('\\'); continue;
    5794        3327 :         case '\'': WRITE_ASCII_CHAR('\''); continue;
    5795        1348 :         case '\"': WRITE_ASCII_CHAR('\"'); continue;
    5796         202 :         case 'b': WRITE_ASCII_CHAR('\b'); continue;
    5797             :         /* FF */
    5798         244 :         case 'f': WRITE_ASCII_CHAR('\014'); continue;
    5799        6624 :         case 't': WRITE_ASCII_CHAR('\t'); continue;
    5800       74419 :         case 'n': WRITE_ASCII_CHAR('\n'); continue;
    5801        7731 :         case 'r': WRITE_ASCII_CHAR('\r'); continue;
    5802             :         /* VT */
    5803         116 :         case 'v': WRITE_ASCII_CHAR('\013'); continue;
    5804             :         /* BEL, not classic C */
    5805          96 :         case 'a': WRITE_ASCII_CHAR('\007'); continue;
    5806             : 
    5807             :             /* \OOO (octal) escapes */
    5808        1415 :         case '0': case '1': case '2': case '3':
    5809             :         case '4': case '5': case '6': case '7':
    5810        1415 :             ch = c - '0';
    5811        1415 :             if (s < end && '0' <= *s && *s <= '7') {  /* up to two more octal digits are consumed */
    5812         764 :                 ch = (ch<<3) + *s++ - '0';
    5813         764 :                 if (s < end && '0' <= *s && *s <= '7') {
    5814         740 :                     ch = (ch<<3) + *s++ - '0';
    5815             :                 }
    5816             :             }
    5817        1415 :             if (ch > 0377) {  /* value above 0xFF: still written below, but remembered as invalid */
    5818         514 :                 if (*first_invalid_escape == NULL) {
    5819         514 :                     *first_invalid_escape = s-3; /* Back up 3 chars, since we've
    5820             :                                                     already incremented s. */
    5821             :                 }
    5822             :             }
    5823        1415 :             WRITE_CHAR(ch);
    5824        1415 :             continue;
    5825             : 
    5826             :             /* hex escapes */
    5827             :             /* \xXX */
    5828       16678 :         case 'x':
    5829       16678 :             count = 2;
    5830       16678 :             message = "truncated \\xXX escape";
    5831       16678 :             goto hexescape;
    5832             : 
    5833             :             /* \uXXXX */
    5834       87468 :         case 'u':
    5835       87468 :             count = 4;
    5836       87468 :             message = "truncated \\uXXXX escape";
    5837       87468 :             goto hexescape;
    5838             : 
    5839             :             /* \UXXXXXXXX */
    5840       58897 :         case 'U':
    5841       58897 :             count = 8;
    5842       58897 :             message = "truncated \\UXXXXXXXX escape";
    5843      163043 :         hexescape:  /* shared by the x, u and U cases; U falls through into it */
    5844     1016480 :             for (ch = 0; count; ++s, --count) {  /* accumulate exactly count hex digits into ch */
    5845      853765 :                 if (s >= end) {
    5846         293 :                     goto incomplete;
    5847             :                 }
    5848      853472 :                 c = (unsigned char)*s;
    5849      853472 :                 ch <<= 4;
    5850      853472 :                 if (c >= '0' && c <= '9') {
    5851      658816 :                     ch += c - '0';
    5852             :                 }
    5853      194656 :                 else if (c >= 'a' && c <= 'f') {
    5854      187803 :                     ch += c - ('a' - 10);
    5855             :                 }
    5856        6853 :                 else if (c >= 'A' && c <= 'F') {
    5857        6818 :                     ch += c - ('A' - 10);
    5858             :                 }
    5859             :                 else {
    5860          35 :                     goto error;  /* non-hex digit: reported with the truncated-escape message set above */
    5861             :                 }
    5862             :             }
    5863             : 
    5864             :             /* when we get here, ch is a 32-bit unicode character */
    5865      162715 :             if (ch > MAX_UNICODE) {
    5866          10 :                 message = "illegal Unicode character";
    5867          10 :                 goto error;
    5868             :             }
    5869             : 
    5870      162705 :             WRITE_CHAR(ch);
    5871      162705 :             continue;
    5872             : 
    5873             :             /* \N{name} */
    5874         633 :         case 'N':
    5875         633 :             if (ucnhash_capi == NULL) {
    5876             :                 /* load the unicode data module */
    5877          19 :                 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
    5878             :                                                 PyUnicodeData_CAPSULE_NAME, 1);
    5879          19 :                 if (ucnhash_capi == NULL) {
    5880           1 :                     PyErr_SetString(
    5881             :                         PyExc_UnicodeError,
    5882             :                         "\\N escapes not supported (can't load unicodedata module)"
    5883             :                         );
    5884           1 :                     goto onError;  /* raised directly; the error handler is not consulted */
    5885             :                 }
    5886             :             }
    5887             : 
    5888         632 :             message = "malformed \\N character escape";
    5889         632 :             if (s >= end) {
    5890           2 :                 goto incomplete;
    5891             :             }
    5892         630 :             if (*s == '{') {
    5893         625 :                 const char *start = ++s;  /* first byte of the name */
    5894             :                 size_t namelen;
    5895             :                 /* look for the closing brace */
    5896      115149 :                 while (s < end && *s != '}')
    5897      114524 :                     s++;
    5898         625 :                 if (s >= end) {
    5899           5 :                     goto incomplete;
    5900             :                 }
    5901         620 :                 namelen = s - start;
    5902         620 :                 if (namelen) {
    5903             :                     /* found a name.  look it up in the unicode database */
    5904         620 :                     s++;
    5905         620 :                     ch = 0xffffffff; /* in case 'getcode' messes up */
    5906        1240 :                     if (namelen <= INT_MAX &&
    5907         620 :                         ucnhash_capi->getcode(start, (int)namelen,
    5908             :                                               &ch, 0)) {
    5909         151 :                         assert(ch <= MAX_UNICODE);
    5910         151 :                         WRITE_CHAR(ch);
    5911         151 :                         continue;
    5912             :                     }
    5913         469 :                     message = "unknown Unicode character name";
    5914             :                 }
    5915             :             }
    5916         474 :             goto error;  /* no opening brace, empty name, over-long name, or failed lookup */
    5917             : 
    5918         151 :         default:  /* unknown escape: backslash and character are both kept in the output */
    5919         151 :             if (*first_invalid_escape == NULL) {
    5920         151 :                 *first_invalid_escape = s-1; /* Back up one char, since we've
    5921             :                                                 already incremented s. */
    5922             :             }
    5923         151 :             WRITE_ASCII_CHAR('\\');
    5924         151 :             WRITE_CHAR(c);
    5925         151 :             continue;
    5926             :         }
    5927             : 
    5928         526 :       incomplete:  /* input ended inside an escape */
    5929         526 :         if (consumed) {
    5930         470 :             *consumed = startinpos;  /* report only the bytes before the unfinished escape, then stop decoding */
    5931         470 :             break;
    5932             :         }
    5933          56 :       error:;  /* with consumed == NULL, incomplete falls through to here */
    5934         575 :         Py_ssize_t endinpos = s-starts;
    5935         575 :         writer.min_length = end - s + writer.pos;  /* remaining input plus output written so far */
    5936         575 :         if (unicode_decode_call_errorhandler_writer(
    5937             :                 errors, &errorHandler,
    5938             :                 "unicodeescape", message,
    5939             :                 &starts, &end, &startinpos, &endinpos, &exc, &s,
    5940             :                 &writer)) {
    5941         527 :             goto onError;
    5942             :         }
    5943          48 :         assert(end - s <= writer.size - writer.pos);
    5944             : 
    5945             : #undef WRITE_ASCII_CHAR
    5946             : #undef WRITE_CHAR
    5947             :     }
    5948             : 
    5949      245770 :     Py_XDECREF(errorHandler);
    5950      245770 :     Py_XDECREF(exc);
    5951      245770 :     return _PyUnicodeWriter_Finish(&writer);
    5952             : 
    5953         528 :   onError:
    5954         528 :     _PyUnicodeWriter_Dealloc(&writer);
    5955         528 :     Py_XDECREF(errorHandler);
    5956         528 :     Py_XDECREF(exc);
    5957         528 :     return NULL;
    5958             : }
    5959             : /* Decode via _PyUnicode_DecodeUnicodeEscapeInternal() and emit a DeprecationWarning for the first invalid escape, if any. */
    5960             : PyObject *
    5961        6315 : _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
    5962             :                               Py_ssize_t size,
    5963             :                               const char *errors,
    5964             :                               Py_ssize_t *consumed)  /* passed through unchanged; may be NULL */
    5965             : {
    5966             :     const char *first_invalid_escape;
    5967        6315 :     PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
    5968             :                                                       consumed,
    5969             :                                                       &first_invalid_escape);
    5970        6315 :     if (result == NULL)
    5971          32 :         return NULL;
    5972        6283 :     if (first_invalid_escape != NULL) {
    5973         300 :         unsigned char c = *first_invalid_escape;
    5974         300 :         if ('4' <= c && c <= '7') {  /* the octal-escape case: the message prints up to 3 characters starting here */
    5975         256 :             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
    5976             :                                  "invalid octal escape sequence '\\%.3s'",
    5977             :                                  first_invalid_escape) < 0)
    5978             :             {
    5979           0 :                 Py_DECREF(result);  /* warning call reported failure: discard the decoded string */
    5980           0 :                 return NULL;
    5981             :             }
    5982             :         }
    5983             :         else {
    5984          44 :             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
    5985             :                                  "invalid escape sequence '\\%c'",
    5986             :                                  c) < 0)
    5987             :             {
    5988           0 :                 Py_DECREF(result);
    5989           0 :                 return NULL;
    5990             :             }
    5991             :         }
    5992             :     }
    5993        6283 :     return result;
    5994             : }
    5995             : /* Decode s[0:size] via _PyUnicode_DecodeUnicodeEscapeStateful() with consumed == NULL. */
    5996             : PyObject *
    5997           0 : PyUnicode_DecodeUnicodeEscape(const char *s,
    5998             :                               Py_ssize_t size,
    5999             :                               const char *errors)
    6000             : {
    6001           0 :     return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);  /* result (possibly NULL) is returned unchanged */
    6002             : }
    6003             : 
    6004             : /* Return a Unicode-Escape string version of the Unicode object. */
    6005             : 
    6006             : PyObject *
    6007        3793 : PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
    6008             : {
    6009             :     Py_ssize_t i, len;
    6010             :     PyObject *repr;
    6011             :     char *p;
    6012             :     int kind;
    6013             :     const void *data;
    6014             :     Py_ssize_t expandsize;
    6015             : 
    6016             :     /* Initial allocation is based on the longest-possible character
    6017             :        escape.
    6018             : 
    6019             :        For UCS1 strings it's '\xxx', 4 bytes per source character.
    6020             :        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
    6021             :        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
    6022             :     */
    6023             : 
    6024        3793 :     if (!PyUnicode_Check(unicode)) {
    6025           0 :         PyErr_BadArgument();
    6026           0 :         return NULL;
    6027             :     }
    6028             : 
    6029        3793 :     len = PyUnicode_GET_LENGTH(unicode);
    6030        3793 :     if (len == 0) {
    6031           7 :         return PyBytes_FromStringAndSize(NULL, 0);
    6032             :     }
    6033             : 
    6034        3786 :     kind = PyUnicode_KIND(unicode);
    6035        3786 :     data = PyUnicode_DATA(unicode);
    6036             :     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
    6037             :        bytes, and 1 byte characters 4. */
    6038        3786 :     expandsize = kind * 2 + 2;
    6039        3786 :     if (len > PY_SSIZE_T_MAX / expandsize) {
    6040           0 :         return PyErr_NoMemory();
    6041             :     }
    6042        3786 :     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    6043        3786 :     if (repr == NULL) {
    6044           0 :         return NULL;
    6045             :     }
    6046             : 
    6047        3786 :     p = PyBytes_AS_STRING(repr);
    6048      301901 :     for (i = 0; i < len; i++) {
    6049      298115 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    6050             : 
    6051             :         /* U+0000-U+00ff range */
    6052      298115 :         if (ch < 0x100) {
    6053      291722 :             if (ch >= ' ' && ch < 127) {
    6054      277279 :                 if (ch != '\\') {
    6055             :                     /* Copy printable US ASCII as-is */
    6056      276597 :                     *p++ = (char) ch;
    6057             :                 }
    6058             :                 /* Escape backslashes */
    6059             :                 else {
    6060         682 :                     *p++ = '\\';
    6061         682 :                     *p++ = '\\';
    6062             :                 }
    6063             :             }
    6064             : 
    6065             :             /* Map special whitespace to '\t', \n', '\r' */
    6066       14443 :             else if (ch == '\t') {
    6067          14 :                 *p++ = '\\';
    6068          14 :                 *p++ = 't';
    6069             :             }
    6070       14429 :             else if (ch == '\n') {
    6071       10785 :                 *p++ = '\\';
    6072       10785 :                 *p++ = 'n';
    6073             :             }
    6074        3644 :             else if (ch == '\r') {
    6075        3319 :                 *p++ = '\\';
    6076        3319 :                 *p++ = 'r';
    6077             :             }
    6078             : 
    6079             :             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
    6080             :             else {
    6081         325 :                 *p++ = '\\';
    6082         325 :                 *p++ = 'x';
    6083         325 :                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
    6084         325 :                 *p++ = Py_hexdigits[ch & 0x000F];
    6085             :             }
    6086             :         }
    6087             :         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
    6088        6393 :         else if (ch < 0x10000) {
    6089        6383 :             *p++ = '\\';
    6090        6383 :             *p++ = 'u';
    6091        6383 :             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
    6092        6383 :             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
    6093        6383 :             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
    6094        6383 :             *p++ = Py_hexdigits[ch & 0x000F];
    6095             :         }
    6096             :         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
    6097             :         else {
    6098             : 
    6099             :             /* Make sure that the first two digits are zero */
    6100          10 :             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
    6101          10 :             *p++ = '\\';
    6102          10 :             *p++ = 'U';
    6103          10 :             *p++ = '0';
    6104          10 :             *p++ = '0';
    6105          10 :             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
    6106          10 :             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
    6107          10 :             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
    6108          10 :             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
    6109          10 :             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
    6110          10 :             *p++ = Py_hexdigits[ch & 0x0000000F];
    6111             :         }
    6112             :     }
    6113             : 
    6114        3786 :     assert(p - PyBytes_AS_STRING(repr) > 0);
    6115        3786 :     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
    6116           0 :         return NULL;
    6117             :     }
    6118        3786 :     return repr;
    6119             : }
    6120             : 
    6121             : /* --- Raw Unicode Escape Codec ------------------------------------------- */
    6122             : 
    6123             : PyObject *  /* Decode raw-unicode-escape bytes s[0..size) into a str; returns NULL on error. */
    6124       82692 : _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
    6125             :                                           Py_ssize_t size,
    6126             :                                           const char *errors,
    6127             :                                           Py_ssize_t *consumed)  /* may be NULL; written only for empty input (0) or when the input ends inside an escape */
    6128             : {
    6129       82692 :     const char *starts = s;
    6130             :     _PyUnicodeWriter writer;
    6131             :     const char *end;
    6132       82692 :     PyObject *errorHandler = NULL;
    6133       82692 :     PyObject *exc = NULL;
    6134             : 
    6135       82692 :     if (size == 0) {
    6136          34 :         if (consumed) {
    6137           8 :             *consumed = 0;
    6138             :         }
    6139          34 :         _Py_RETURN_UNICODE_EMPTY();
    6140             :     }
    6141             : 
    6142             :     /* The escaped input is never shorter than the decoded string, so
    6143             :        size is used as the initial capacity and the length is reduced to
    6144             :        the true value after conversion. (But the decoding error handler
    6145             :        might have to resize the string.) */
    6146       82658 :     _PyUnicodeWriter_Init(&writer);
    6147       82658 :     writer.min_length = size;
    6148       82658 :     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
    6149           0 :         goto onError;
    6150             :     }
    6151             : 
    6152       82658 :     end = s + size;
    6153     1121760 :     while (s < end) {
    6154     1039410 :         unsigned char c = (unsigned char) *s++;
    6155             :         Py_UCS4 ch;
    6156             :         int count;
    6157             :         const char *message;
    6158             : 
    6159             : #define WRITE_CHAR(ch)                                                        \
    6160             :             do {                                                              \
    6161             :                 if (ch <= writer.maxchar) {                                   \
    6162             :                     assert(writer.pos < writer.size);                         \
    6163             :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
    6164             :                 }                                                             \
    6165             :                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
    6166             :                     goto onError;                                             \
    6167             :                 }                                                             \
    6168             :             } while(0)
    6169             : 
    6170             :         /* Non-escape characters are interpreted as Unicode ordinals; so is a lone trailing backslash when consumed is NULL */
    6171     1039410 :         if (c != '\\' || (s >= end && !consumed)) {
    6172     1032020 :             WRITE_CHAR(c);
    6173     1039070 :             continue;
    6174             :         }
    6175             : 
    6176        7409 :         Py_ssize_t startinpos = s - starts - 1;  /* offset of the backslash that starts this escape */
    6177             :         /* \ - Escapes */
    6178        7409 :         if (s >= end) {
    6179          49 :             assert(consumed);
    6180             :             // Set message to silence a compiler warning.
    6181             :             // It is never actually used: consumed is non-NULL on this path.
    6182          49 :             message = "\\ at end of string";
    6183          49 :             goto incomplete;
    6184             :         }
    6185             : 
    6186        7360 :         c = (unsigned char) *s++;
    6187        7360 :         if (c == 'u') {
    6188        7000 :             count = 4;
    6189        7000 :             message = "truncated \\uXXXX escape";
    6190             :         }
    6191         360 :         else if (c == 'U') {
    6192         101 :             count = 8;
    6193         101 :             message = "truncated \\UXXXXXXXX escape";
    6194             :         }
    6195             :         else {
    6196         259 :             assert(writer.pos < writer.size);
    6197         259 :             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');  /* any other escape: keep the backslash and the following byte as-is */
    6198         259 :             WRITE_CHAR(c);
    6199         259 :             continue;
    6200             :         }
    6201             : 
    6202             :         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
    6203       34989 :         for (ch = 0; count; ++s, --count) {
    6204       28172 :             if (s >= end) {
    6205         256 :                 goto incomplete;
    6206             :             }
    6207       27916 :             c = (unsigned char)*s;
    6208       27916 :             ch <<= 4;
    6209       27916 :             if (c >= '0' && c <= '9') {
    6210       27026 :                 ch += c - '0';
    6211             :             }
    6212         890 :             else if (c >= 'a' && c <= 'f') {
    6213         862 :                 ch += c - ('a' - 10);
    6214             :             }
    6215          28 :             else if (c >= 'A' && c <= 'F') {
    6216           0 :                 ch += c - ('A' - 10);
    6217             :             }
    6218             :             else {
    6219          28 :                 goto error;
    6220             :             }
    6221             :         }
    6222        6817 :         if (ch > MAX_UNICODE) {
    6223           7 :             message = "\\Uxxxxxxxx out of range";
    6224           7 :             goto error;
    6225             :         }
    6226        6810 :         WRITE_CHAR(ch);
    6227        6810 :         continue;
    6228             : 
    6229         305 :       incomplete:
    6230         305 :         if (consumed) {  /* report where the unfinished escape starts and stop decoding */
    6231         281 :             *consumed = startinpos;
    6232         281 :             break;
    6233             :         }
    6234          24 :       error:;
    6235          59 :         Py_ssize_t endinpos = s-starts;
    6236          59 :         writer.min_length = end - s + writer.pos;  /* remaining input bytes plus what has been written so far */
    6237          59 :         if (unicode_decode_call_errorhandler_writer(
    6238             :                 errors, &errorHandler,
    6239             :                 "rawunicodeescape", message,
    6240             :                 &starts, &end, &startinpos, &endinpos, &exc, &s,
    6241             :                 &writer)) {
    6242          21 :             goto onError;
    6243             :         }
    6244          38 :         assert(end - s <= writer.size - writer.pos);
    6245             : 
    6246             : #undef WRITE_CHAR
    6247             :     }
    6248       82637 :     Py_XDECREF(errorHandler);
    6249       82637 :     Py_XDECREF(exc);
    6250       82637 :     return _PyUnicodeWriter_Finish(&writer);
    6251             : 
    6252          21 :   onError:
    6253          21 :     _PyUnicodeWriter_Dealloc(&writer);
    6254          21 :     Py_XDECREF(errorHandler);
    6255          21 :     Py_XDECREF(exc);
    6256          21 :     return NULL;
    6257             : }
    6258             : 
    6259             : PyObject *  /* Forwards to _PyUnicode_DecodeRawUnicodeEscapeStateful() with consumed == NULL. */
    6260       41933 : PyUnicode_DecodeRawUnicodeEscape(const char *s,
    6261             :                                  Py_ssize_t size,
    6262             :                                  const char *errors)
    6263             : {
    6264       41933 :     return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
    6265             : }
    6266             : 
    6267             : 
    6268             : PyObject *  /* Encode a str as raw-unicode-escape bytes; returns NULL on failure. */
    6269       11252 : PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
    6270             : {
    6271             :     PyObject *repr;
    6272             :     char *p;
    6273             :     Py_ssize_t expandsize, pos;
    6274             :     int kind;
    6275             :     const void *data;
    6276             :     Py_ssize_t len;
    6277             : 
    6278       11252 :     if (!PyUnicode_Check(unicode)) {
    6279           0 :         PyErr_BadArgument();
    6280           0 :         return NULL;
    6281             :     }
    6282       11252 :     kind = PyUnicode_KIND(unicode);
    6283       11252 :     data = PyUnicode_DATA(unicode);
    6284       11252 :     len = PyUnicode_GET_LENGTH(unicode);
    6285       11252 :     if (kind == PyUnicode_1BYTE_KIND) {  /* shortcut: the loop below copies characters < 0x100 as-is, so 1-byte data is returned unchanged */
    6286       10294 :         return PyBytes_FromStringAndSize(data, len);
    6287             :     }
    6288             : 
    6289             :     /* 4 byte characters can take up 10 bytes ('\U00HHHHHH') and 2 byte
    6290             :        characters 6 bytes ('\uHHHH'); 1 byte strings were returned above. */
    6291         958 :     expandsize = kind * 2 + 2;
    6292             : 
    6293         958 :     if (len > PY_SSIZE_T_MAX / expandsize) {  /* guard expandsize * len against overflow */
    6294           0 :         return PyErr_NoMemory();
    6295             :     }
    6296         958 :     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    6297         958 :     if (repr == NULL) {
    6298           0 :         return NULL;
    6299             :     }
    6300         958 :     if (len == 0) {  /* NOTE(review): never hit in this coverage run; possibly unreachable after the 1-byte shortcut - confirm */
    6301           0 :         return repr;
    6302             :     }
    6303             : 
    6304         958 :     p = PyBytes_AS_STRING(repr);
    6305       77140 :     for (pos = 0; pos < len; pos++) {
    6306       76182 :         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
    6307             : 
    6308             :         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
    6309       76182 :         if (ch < 0x100) {
    6310       69776 :             *p++ = (char) ch;
    6311             :         }
    6312             :         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
    6313        6406 :         else if (ch < 0x10000) {
    6314        6389 :             *p++ = '\\';
    6315        6389 :             *p++ = 'u';
    6316        6389 :             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
    6317        6389 :             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
    6318        6389 :             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
    6319        6389 :             *p++ = Py_hexdigits[ch & 15];
    6320             :         }
    6321             :         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
    6322             :         else {
    6323          17 :             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
    6324          17 :             *p++ = '\\';
    6325          17 :             *p++ = 'U';
    6326          17 :             *p++ = '0';  /* the assert above guarantees the two leading hex digits are zero */
    6327          17 :             *p++ = '0';
    6328          17 :             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
    6329          17 :             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
    6330          17 :             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
    6331          17 :             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
    6332          17 :             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
    6333          17 :             *p++ = Py_hexdigits[ch & 15];
    6334             :         }
    6335             :     }
    6336             : 
    6337         958 :     assert(p > PyBytes_AS_STRING(repr));
    6338         958 :     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {  /* shrink the worst-case buffer to the bytes actually written */
    6339           0 :         return NULL;
    6340             :     }
    6341         958 :     return repr;
    6342             : }
    6343             : 
    6344             : /* --- Latin-1 Codec ------------------------------------------------------ */
    6345             : 
    6346             : PyObject *  /* Decode size bytes at s as Latin-1; the errors argument is not used by this function. */
    6347     4907560 : PyUnicode_DecodeLatin1(const char *s,
    6348             :                        Py_ssize_t size,
    6349             :                        const char *errors)
    6350             : {
    6351             :     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
    6352     4907560 :     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
    6353             : }
    6354             : 
    6355             : /* Create a UnicodeEncodeError in *exceptionObject if it is NULL; otherwise update only its start, end and reason. On failure *exceptionObject is left NULL. */
    6356             : static void
    6357        8148 : make_encode_exception(PyObject **exceptionObject,
    6358             :                       const char *encoding,
    6359             :                       PyObject *unicode,
    6360             :                       Py_ssize_t startpos, Py_ssize_t endpos,
    6361             :                       const char *reason)
    6362             : {
    6363        8148 :     if (*exceptionObject == NULL) {
    6364        1798 :         *exceptionObject = PyObject_CallFunction(
    6365             :             PyExc_UnicodeEncodeError, "sOnns",
    6366             :             encoding, unicode, startpos, endpos, reason);
    6367             :     }
    6368             :     else {
    6369        6350 :         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
    6370           0 :             goto onError;
    6371        6350 :         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
    6372           0 :             goto onError;
    6373        6350 :         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
    6374           0 :             goto onError;
    6375        6350 :         return;
    6376           0 :       onError:
    6377           0 :         Py_CLEAR(*exceptionObject);  /* drop the half-updated exception so callers see NULL */
    6378             :     }
    6379             : }
    6380             : 
    6381             : /* Raises a UnicodeEncodeError: builds or refreshes it via make_encode_exception() and, if that succeeded, passes it to PyCodec_StrictErrors(). */
    6382             : static void
    6383        1213 : raise_encode_exception(PyObject **exceptionObject,
    6384             :                        const char *encoding,
    6385             :                        PyObject *unicode,
    6386             :                        Py_ssize_t startpos, Py_ssize_t endpos,
    6387             :                        const char *reason)
    6388             : {
    6389        1213 :     make_encode_exception(exceptionObject,
    6390             :                           encoding, unicode, startpos, endpos, reason);
    6391        1213 :     if (*exceptionObject != NULL)
    6392        1213 :         PyCodec_StrictErrors(*exceptionObject);
    6393        1213 : }
    6394             : 
    6395             : /* Error handling callback helper:
    6396             :    look up the handler for `errors` (cached in *errorHandler), call it with
    6397             :    the exception and validate its (str/bytes, int) result; store the checked
    6398             :    position in *newpos and return a new reference to the replacement, or NULL. */
    6399             : static PyObject *
    6400        6935 : unicode_encode_call_errorhandler(const char *errors,
    6401             :                                  PyObject **errorHandler,
    6402             :                                  const char *encoding, const char *reason,
    6403             :                                  PyObject *unicode, PyObject **exceptionObject,
    6404             :                                  Py_ssize_t startpos, Py_ssize_t endpos,
    6405             :                                  Py_ssize_t *newpos)
    6406             : {
    6407             :     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
    6408             :     Py_ssize_t len;
    6409             :     PyObject *restuple;
    6410             :     PyObject *resunicode;
    6411             : 
    6412        6935 :     if (*errorHandler == NULL) {  /* first error: resolve the handler once and keep it for later calls */
    6413         615 :         *errorHandler = PyCodec_LookupError(errors);
    6414         615 :         if (*errorHandler == NULL)
    6415           0 :             return NULL;
    6416             :     }
    6417             : 
    6418        6935 :     len = PyUnicode_GET_LENGTH(unicode);
    6419             : 
    6420        6935 :     make_encode_exception(exceptionObject,
    6421             :                           encoding, unicode, startpos, endpos, reason);
    6422        6935 :     if (*exceptionObject == NULL)
    6423           0 :         return NULL;
    6424             : 
    6425        6935 :     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    6426        6935 :     if (restuple == NULL)
    6427         396 :         return NULL;
    6428        6539 :     if (!PyTuple_Check(restuple)) {
    6429           9 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);  /* skip the "On;" format prefix, leaving only the message text */
    6430           9 :         Py_DECREF(restuple);
    6431           9 :         return NULL;
    6432             :     }
    6433        6530 :     if (!PyArg_ParseTuple(restuple, argparse,
    6434             :                           &resunicode, newpos)) {
    6435          29 :         Py_DECREF(restuple);
    6436          29 :         return NULL;
    6437             :     }
    6438        6501 :     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
    6439           0 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    6440           0 :         Py_DECREF(restuple);
    6441           0 :         return NULL;
    6442             :     }
    6443        6501 :     if (*newpos<0)  /* a negative position is taken relative to the end of the string */
    6444           3 :         *newpos = len + *newpos;
    6445        6501 :     if (*newpos<0 || *newpos>len) {
    6446           2 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    6447           2 :         Py_DECREF(restuple);
    6448           2 :         return NULL;
    6449             :     }
    6450        6499 :     Py_INCREF(resunicode);  /* take our own reference before the tuple that holds it is released */
    6451        6499 :     Py_DECREF(restuple);
    6452        6499 :     return resunicode;
    6453             : }
    6454             : 
    6455             : static PyObject *  /* Encode unicode to bytes, treating every character >= limit as unencodable and handling it according to errors; returns NULL on failure. */
    6456        6098 : unicode_encode_ucs1(PyObject *unicode,
    6457             :                     const char *errors,
    6458             :                     const Py_UCS4 limit)
    6459             : {
    6460             :     /* input state */
    6461        6098 :     Py_ssize_t pos=0, size;
    6462             :     int kind;
    6463             :     const void *data;
    6464             :     /* pointer into the output */
    6465             :     char *str;
    6466        6098 :     const char *encoding = (limit == 256) ? "latin-1" : "ascii";  /* any limit other than 256 is reported as "ascii" */
    6467        6098 :     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    6468        6098 :     PyObject *error_handler_obj = NULL;
    6469        6098 :     PyObject *exc = NULL;
    6470        6098 :     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    6471        6098 :     PyObject *rep = NULL;
    6472             :     /* output object */
    6473             :     _PyBytesWriter writer;
    6474             : 
    6475        6098 :     size = PyUnicode_GET_LENGTH(unicode);
    6476        6098 :     kind = PyUnicode_KIND(unicode);
    6477        6098 :     data = PyUnicode_DATA(unicode);
    6478             :     /* allocate enough for a simple encoding without
    6479             :        replacements, if we need more, we'll resize */
    6480        6098 :     if (size == 0)
    6481           6 :         return PyBytes_FromStringAndSize(NULL, 0);
    6482             : 
    6483        6092 :     _PyBytesWriter_Init(&writer);
    6484        6092 :     str = _PyBytesWriter_Alloc(&writer, size);
    6485        6092 :     if (str == NULL)
    6486           0 :         return NULL;
    6487             : 
    6488      331180 :     while (pos < size) {
    6489      326305 :         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
    6490             : 
    6491             :         /* can we encode this? */
    6492      326305 :         if (ch < limit) {
    6493             :             /* no overflow check, because we know that the space is enough */
    6494      286369 :             *str++ = (char)ch;
    6495      286369 :             ++pos;
    6496             :         }
    6497             :         else {
    6498             :             Py_ssize_t newpos, i;
    6499             :             /* startpos for collecting unencodable chars */
    6500       39936 :             Py_ssize_t collstart = pos;
    6501       39936 :             Py_ssize_t collend = collstart + 1;
    6502             :             /* find all unencodable characters */
    6503             : 
    6504      227809 :             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
    6505      187873 :                 ++collend;
    6506             : 
    6507             :             /* Only overallocate the buffer if it's not the last write */
    6508       39936 :             writer.overallocate = (collend < size);
    6509             : 
    6510             :             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
    6511       39936 :             if (error_handler == _Py_ERROR_UNKNOWN)
    6512        6040 :                 error_handler = _Py_GetErrorHandler(errors);
    6513             : 
    6514       39936 :             switch (error_handler) {
    6515        1176 :             case _Py_ERROR_STRICT:
    6516        1176 :                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
    6517        1217 :                 goto onError;
    6518             : 
    6519        1193 :             case _Py_ERROR_REPLACE:
    6520        1193 :                 memset(str, '?', collend - collstart);  /* one '?' per unencodable character */
    6521        1193 :                 str += (collend - collstart);
    6522             :                 /* fall through */
    6523        2209 :             case _Py_ERROR_IGNORE:
    6524        2209 :                 pos = collend;
    6525        2209 :                 break;
    6526             : 
    6527        8387 :             case _Py_ERROR_BACKSLASHREPLACE:
    6528             :                 /* subtract preallocated bytes */
    6529        8387 :                 writer.min_size -= (collend - collstart);
    6530        8387 :                 str = backslashreplace(&writer, str,
    6531             :                                        unicode, collstart, collend);
    6532        8387 :                 if (str == NULL)
    6533           0 :                     goto onError;
    6534        8387 :                 pos = collend;
    6535        8387 :                 break;
    6536             : 
    6537        1240 :             case _Py_ERROR_XMLCHARREFREPLACE:
    6538             :                 /* subtract preallocated bytes */
    6539        1240 :                 writer.min_size -= (collend - collstart);
    6540        1240 :                 str = xmlcharrefreplace(&writer, str,
    6541             :                                         unicode, collstart, collend);
    6542        1240 :                 if (str == NULL)
    6543           0 :                     goto onError;
    6544        1240 :                 pos = collend;
    6545        1240 :                 break;
    6546             : 
    6547       20745 :             case _Py_ERROR_SURROGATEESCAPE:
    6548      180165 :                 for (i = collstart; i < collend; ++i) {
    6549      159422 :                     ch = PyUnicode_READ(kind, data, i);
    6550      159422 :                     if (ch < 0xdc80 || 0xdcff < ch) {
    6551             :                         /* Not a UTF-8b surrogate */
    6552             :                         break;
    6553             :                     }
    6554      159420 :                     *str++ = (char)(ch - 0xdc00);  /* U+DC80..U+DCFF becomes byte 0x80..0xFF */
    6555      159420 :                     ++pos;
    6556             :                 }
    6557       20745 :                 if (i >= collend)
    6558       20743 :                     break;
    6559           2 :                 collstart = pos;  /* the rest of the run goes to the generic handler below */
    6560           2 :                 assert(collstart != collend);
    6561             :                 /* fall through */
    6562             : 
    6563             :             default:
    6564        6181 :                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
    6565             :                                                        encoding, reason, unicode, &exc,
    6566             :                                                        collstart, collend, &newpos);
    6567        6181 :                 if (rep == NULL)
    6568          37 :                     goto onError;
    6569             : 
    6570        6144 :                 if (newpos < collstart) {  /* the handler moved the position backwards: reserve room for the characters that will be visited again */
    6571         102 :                     writer.overallocate = 1;
    6572         102 :                     str = _PyBytesWriter_Prepare(&writer, str,
    6573             :                                                  collstart - newpos);
    6574         102 :                     if (str == NULL)
    6575           0 :                         goto onError;
    6576             :                 }
    6577             :                 else {
    6578             :                     /* subtract preallocated bytes */
    6579        6042 :                     writer.min_size -= newpos - collstart;
    6580             :                     /* Only overallocate the buffer if it's not the last write */
    6581        6042 :                     writer.overallocate = (newpos < size);
    6582             :                 }
    6583             : 
    6584        6144 :                 if (PyBytes_Check(rep)) {
    6585             :                     /* Directly copy bytes result to output. */
    6586           4 :                     str = _PyBytesWriter_WriteBytes(&writer, str,
    6587           2 :                                                     PyBytes_AS_STRING(rep),
    6588             :                                                     PyBytes_GET_SIZE(rep));
    6589             :                 }
    6590             :                 else {
    6591        6142 :                     assert(PyUnicode_Check(rep));
    6592             : 
    6593       12284 :                     if (limit == 256 ?
    6594          70 :                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
    6595        6072 :                         !PyUnicode_IS_ASCII(rep))
    6596             :                     {
    6597             :                         /* Not all characters are smaller than limit */
    6598           4 :                         raise_encode_exception(&exc, encoding, unicode,
    6599             :                                                collstart, collend, reason);
    6600           4 :                         goto onError;
    6601             :                     }
    6602        6138 :                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
    6603       12276 :                     str = _PyBytesWriter_WriteBytes(&writer, str,
    6604        6138 :                                                     PyUnicode_DATA(rep),
    6605             :                                                     PyUnicode_GET_LENGTH(rep));
    6606             :                 }
    6607        6140 :                 if (str == NULL)
    6608           0 :                     goto onError;
    6609             : 
    6610        6140 :                 pos = newpos;  /* resume encoding where the handler asked */
    6611        6140 :                 Py_CLEAR(rep);
    6612             :             }
    6613             : 
    6614             :             /* If overallocation was disabled, ensure that it was the last
    6615             :                write. Otherwise, we missed an optimization */
    6616       38719 :             assert(writer.overallocate || pos == size);
    6617             :         }
    6618             :     }
    6619             : 
    6620        4875 :     Py_XDECREF(error_handler_obj);
    6621        4875 :     Py_XDECREF(exc);
    6622        4875 :     return _PyBytesWriter_Finish(&writer, str);
    6623             : 
    6624        1217 :   onError:
    6625        1217 :     Py_XDECREF(rep);
    6626        1217 :     _PyBytesWriter_Dealloc(&writer);
    6627        1217 :     Py_XDECREF(error_handler_obj);
    6628        1217 :     Py_XDECREF(exc);
    6629        1217 :     return NULL;
    6630             : }
    6631             : 
    6632             : PyObject *
    6633       25199 : _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
    6634             : {
    6635       25199 :     if (!PyUnicode_Check(unicode)) {
    6636           0 :         PyErr_BadArgument();
    6637           0 :         return NULL;
    6638             :     }
    6639             :     /* Fast path: if it is a one-byte string, construct
    6640             :        bytes object directly. */
    6641       25199 :     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
    6642       25117 :         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
    6643             :                                          PyUnicode_GET_LENGTH(unicode));
    6644             :     /* Non-Latin-1 characters present. Defer to unicode_encode_ucs1() to
    6645             :        apply the error handler or raise the exception. */
    6646          82 :     return unicode_encode_ucs1(unicode, errors, 256);
    6647             : }
    6648             : 
    6649             : PyObject*
    6650          42 : PyUnicode_AsLatin1String(PyObject *unicode)
    6651             : {
    6652          42 :     return _PyUnicode_AsLatin1String(unicode, NULL);
    6653             : }
    6654             : 
    6655             : /* --- 7-bit ASCII Codec -------------------------------------------------- */
    6656             : 
    6657             : PyObject *
    6658      799253 : PyUnicode_DecodeASCII(const char *s,
    6659             :                       Py_ssize_t size,
    6660             :                       const char *errors)
    6661             : {
    6662      799253 :     const char *starts = s;
    6663      799253 :     const char *e = s + size;
    6664      799253 :     PyObject *error_handler_obj = NULL;
    6665      799253 :     PyObject *exc = NULL;
    6666      799253 :     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    6667             : 
    6668      799253 :     if (size == 0)
    6669         621 :         _Py_RETURN_UNICODE_EMPTY();
    6670             : 
    6671             :     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    6672      798632 :     if (size == 1 && (unsigned char)s[0] < 128) {
    6673      517607 :         return get_latin1_char((unsigned char)s[0]);
    6674             :     }
    6675             : 
    6676             :     // Shortcut for simple case
    6677      281025 :     PyObject *u = PyUnicode_New(size, 127);
    6678      281025 :     if (u == NULL) {
    6679           0 :         return NULL;
    6680             :     }
    6681      281025 :     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    6682      281025 :     if (outpos == size) {
    6683      280595 :         return u;
    6684             :     }
    6685             : 
    6686             :     _PyUnicodeWriter writer;
    6687         430 :     _PyUnicodeWriter_InitWithBuffer(&writer, u);
    6688         430 :     writer.pos = outpos;
    6689             : 
    6690         430 :     s += outpos;
    6691         430 :     int kind = writer.kind;
    6692         430 :     void *data = writer.data;
    6693             :     Py_ssize_t startinpos, endinpos;
    6694             : 
    6695      232877 :     while (s < e) {
    6696      232503 :         unsigned char c = (unsigned char)*s;
    6697      232503 :         if (c < 128) {
    6698       73502 :             PyUnicode_WRITE(kind, data, writer.pos, c);
    6699       73502 :             writer.pos++;
    6700       73502 :             ++s;
    6701       73502 :             continue;
    6702             :         }
    6703             : 
    6704             :         /* byte outside range 0x00..0x7f: call the error handler */
    6705             : 
    6706      159001 :         if (error_handler == _Py_ERROR_UNKNOWN)
    6707         430 :             error_handler = _Py_GetErrorHandler(errors);
    6708             : 
    6709      159001 :         switch (error_handler)
    6710             :         {
    6711      158870 :         case _Py_ERROR_REPLACE:
    6712             :         case _Py_ERROR_SURROGATEESCAPE:
    6713             :             /* Fast-path: the error handler only writes one character,
    6714             :                but we may switch to UCS2 at the first write */
    6715      158870 :             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
    6716           0 :                 goto onError;
    6717      158870 :             kind = writer.kind;
    6718      158870 :             data = writer.data;
    6719             : 
    6720      158870 :             if (error_handler == _Py_ERROR_REPLACE)
    6721         894 :                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
    6722             :             else
    6723      157976 :                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
    6724      158870 :             writer.pos++;
    6725      158870 :             ++s;
    6726      158870 :             break;
    6727             : 
    6728          11 :         case _Py_ERROR_IGNORE:
    6729          11 :             ++s;
    6730          11 :             break;
    6731             : 
    6732         120 :         default:
    6733         120 :             startinpos = s-starts;
    6734         120 :             endinpos = startinpos + 1;
    6735         120 :             if (unicode_decode_call_errorhandler_writer(
    6736             :                     errors, &error_handler_obj,
    6737             :                     "ascii", "ordinal not in range(128)",
    6738             :                     &starts, &e, &startinpos, &endinpos, &exc, &s,
    6739             :                     &writer))
    6740          56 :                 goto onError;
    6741          64 :             kind = writer.kind;
    6742          64 :             data = writer.data;
    6743             :         }
    6744             :     }
    6745         374 :     Py_XDECREF(error_handler_obj);
    6746         374 :     Py_XDECREF(exc);
    6747         374 :     return _PyUnicodeWriter_Finish(&writer);
    6748             : 
    6749          56 :   onError:
    6750          56 :     _PyUnicodeWriter_Dealloc(&writer);
    6751          56 :     Py_XDECREF(error_handler_obj);
    6752          56 :     Py_XDECREF(exc);
    6753          56 :     return NULL;
    6754             : }
    6755             : 
    6756             : PyObject *
    6757     2166990 : _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
    6758             : {
    6759     2166990 :     if (!PyUnicode_Check(unicode)) {
    6760           0 :         PyErr_BadArgument();
    6761           0 :         return NULL;
    6762             :     }
    6763             :     /* Fast path: if it is an ASCII-only string, construct bytes object
    6764             :        directly. Otherwise defer to unicode_encode_ucs1() for error handling. */
    6765     2166990 :     if (PyUnicode_IS_ASCII(unicode))
    6766     2161030 :         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
    6767             :                                          PyUnicode_GET_LENGTH(unicode));
    6768        5958 :     return unicode_encode_ucs1(unicode, errors, 128);
    6769             : }
    6770             : 
    6771             : PyObject *
    6772      745819 : PyUnicode_AsASCIIString(PyObject *unicode)
    6773             : {
    6774      745819 :     return _PyUnicode_AsASCIIString(unicode, NULL);
    6775             : }
    6776             : 
    6777             : #ifdef MS_WINDOWS
    6778             : 
    6779             : /* --- MBCS codecs for Windows -------------------------------------------- */
    6780             : 
    6781             : #if SIZEOF_INT < SIZEOF_SIZE_T
    6782             : #define NEED_RETRY
    6783             : #endif
    6784             : 
    6785             : /* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
    6786             :    transcoding from UTF-16), but INT_MAX / 4 performs better in
    6787             :    both cases and also avoids partial characters overrunning the
    6788             :    length limit in MultiByteToWideChar on Windows */
    6789             : #define DECODING_CHUNK_SIZE (INT_MAX/4)
    6790             : 
    6791             : #ifndef WC_ERR_INVALID_CHARS
    6792             : #  define WC_ERR_INVALID_CHARS 0x0080
    6793             : #endif
    6794             : 
    6795             : static const char*
    6796             : code_page_name(UINT code_page, PyObject **obj)
    6797             : {
    6798             :     *obj = NULL;
    6799             :     if (code_page == CP_ACP)
    6800             :         return "mbcs";
    6801             :     if (code_page == CP_UTF7)
    6802             :         return "CP_UTF7";
    6803             :     if (code_page == CP_UTF8)
    6804             :         return "CP_UTF8";
    6805             : 
    6806             :     *obj = PyBytes_FromFormat("cp%u", code_page);
    6807             :     if (*obj == NULL)
    6808             :         return NULL;
    6809             :     return PyBytes_AS_STRING(*obj);
    6810             : }
    6811             : 
    6812             : static DWORD
    6813             : decode_code_page_flags(UINT code_page)
    6814             : {
    6815             :     if (code_page == CP_UTF7) {
    6816             :         /* The CP_UTF7 decoder only supports flags=0 */
    6817             :         return 0;
    6818             :     }
    6819             :     else
    6820             :         return MB_ERR_INVALID_CHARS;
    6821             : }
    6822             : 
    6823             : /*
    6824             :  * Decode a byte string from a Windows code page into unicode object in strict
    6825             :  * mode.
    6826             :  *
    6827             :  * Returns the consumed size on success, returns -2 on a decode error, or
    6828             :  * raises an OSError and returns -1 on other error.
    6829             :  */
    6830             : static int
    6831             : decode_code_page_strict(UINT code_page,
    6832             :                         wchar_t **buf,
    6833             :                         Py_ssize_t *bufsize,
    6834             :                         const char *in,
    6835             :                         int insize)
    6836             : {
    6837             :     DWORD flags = MB_ERR_INVALID_CHARS;
    6838             :     wchar_t *out;
    6839             :     DWORD outsize;
    6840             : 
    6841             :     /* First get the size of the result */
    6842             :     assert(insize > 0);
    6843             :     while ((outsize = MultiByteToWideChar(code_page, flags,
    6844             :                                           in, insize, NULL, 0)) <= 0)
    6845             :     {
    6846             :         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
    6847             :             goto error;
    6848             :         }
    6849             :         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
    6850             :         flags = 0;
    6851             :     }
    6852             : 
    6853             :     /* Extend a wchar_t* buffer */
    6854             :     Py_ssize_t n = *bufsize;   /* Get the current length */
    6855             :     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
    6856             :         return -1;
    6857             :     }
    6858             :     out = *buf + n;
    6859             : 
    6860             :     /* Do the conversion */
    6861             :     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    6862             :     if (outsize <= 0)
    6863             :         goto error;
    6864             :     return insize;
    6865             : 
    6866             : error:
    6867             :     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    6868             :         return -2;
    6869             :     PyErr_SetFromWindowsErr(0);
    6870             :     return -1;
    6871             : }
    6872             : 
    6873             : /*
    6874             :  * Decode a byte string from a code page into unicode object with an error
    6875             :  * handler.
    6876             :  *
    6877             :  * Returns the consumed size on success, or raises an OSError or
    6878             :  * UnicodeDecodeError exception and returns -1 on error.
    6879             :  */
    6880             : static int
    6881             : decode_code_page_errors(UINT code_page,
    6882             :                         wchar_t **buf,
    6883             :                         Py_ssize_t *bufsize,
    6884             :                         const char *in, const int size,
    6885             :                         const char *errors, int final)
    6886             : {
    6887             :     const char *startin = in;
    6888             :     const char *endin = in + size;
    6889             :     DWORD flags = MB_ERR_INVALID_CHARS;
    6890             :     /* Ideally, we should get reason from FormatMessage. This is the Windows
    6891             :        2000 English version of the message. */
    6892             :     const char *reason = "No mapping for the Unicode character exists "
    6893             :                          "in the target code page.";
    6894             :     /* each step cannot decode more than 1 character, but a character can be
    6895             :        represented as a surrogate pair */
    6896             :     wchar_t buffer[2], *out;
    6897             :     int insize;
    6898             :     Py_ssize_t outsize;
    6899             :     PyObject *errorHandler = NULL;
    6900             :     PyObject *exc = NULL;
    6901             :     PyObject *encoding_obj = NULL;
    6902             :     const char *encoding;
    6903             :     DWORD err;
    6904             :     int ret = -1;
    6905             : 
    6906             :     assert(size > 0);
    6907             : 
    6908             :     encoding = code_page_name(code_page, &encoding_obj);
    6909             :     if (encoding == NULL)
    6910             :         return -1;
    6911             : 
    6912             :     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
    6913             :         /* The last error was ERROR_NO_UNICODE_TRANSLATION, so we raise a
    6914             :            UnicodeDecodeError. */
    6915             :         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
    6916             :         if (exc != NULL) {
    6917             :             PyCodec_StrictErrors(exc);
    6918             :             Py_CLEAR(exc);
    6919             :         }
    6920             :         goto error;
    6921             :     }
    6922             : 
    6923             :     /* Extend a wchar_t* buffer */
    6924             :     Py_ssize_t n = *bufsize;   /* Get the current length */
    6925             :     if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
    6926             :         PyErr_NoMemory();
    6927             :         goto error;
    6928             :     }
    6929             :     if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
    6930             :         goto error;
    6931             :     }
    6932             :     out = *buf + n;
    6933             : 
    6934             :     /* Decode the byte string character per character */
    6935             :     while (in < endin)
    6936             :     {
    6937             :         /* Decode a character */
    6938             :         insize = 1;
    6939             :         do
    6940             :         {
    6941             :             outsize = MultiByteToWideChar(code_page, flags,
    6942             :                                           in, insize,
    6943             :                                           buffer, Py_ARRAY_LENGTH(buffer));
    6944             :             if (outsize > 0)
    6945             :                 break;
    6946             :             err = GetLastError();
    6947             :             if (err == ERROR_INVALID_FLAGS && flags) {
    6948             :                 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
    6949             :                 flags = 0;
    6950             :                 continue;
    6951             :             }
    6952             :             if (err != ERROR_NO_UNICODE_TRANSLATION
    6953             :                 && err != ERROR_INSUFFICIENT_BUFFER)
    6954             :             {
    6955             :                 PyErr_SetFromWindowsErr(0);
    6956             :                 goto error;
    6957             :             }
    6958             :             insize++;
    6959             :         }
    6960             :         /* 4=maximum length of a UTF-8 sequence */
    6961             :         while (insize <= 4 && (in + insize) <= endin);
    6962             : 
    6963             :         if (outsize <= 0) {
    6964             :             Py_ssize_t startinpos, endinpos, outpos;
    6965             : 
    6966             :             /* last character in partial decode? */
    6967             :             if (in + insize >= endin && !final)
    6968             :                 break;
    6969             : 
    6970             :             startinpos = in - startin;
    6971             :             endinpos = startinpos + 1;
    6972             :             outpos = out - *buf;
    6973             :             if (unicode_decode_call_errorhandler_wchar(
    6974             :                     errors, &errorHandler,
    6975             :                     encoding, reason,
    6976             :                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
    6977             :                     buf, bufsize, &outpos))
    6978             :             {
    6979             :                 goto error;
    6980             :             }
    6981             :             out = *buf + outpos;
    6982             :         }
    6983             :         else {
    6984             :             in += insize;
    6985             :             memcpy(out, buffer, outsize * sizeof(wchar_t));
    6986             :             out += outsize;
    6987             :         }
    6988             :     }
    6989             : 
    6990             :     /* Shrink the buffer */
    6991             :     assert(out - *buf <= *bufsize);
    6992             :     *bufsize = out - *buf;
    6993             :     /* (in - startin) <= size and size is an int */
    6994             :     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
    6995             : 
    6996             : error:
    6997             :     Py_XDECREF(encoding_obj);
    6998             :     Py_XDECREF(errorHandler);
    6999             :     Py_XDECREF(exc);
    7000             :     return ret;
    7001             : }
    7002             : 
    7003             : static PyObject *
    7004             : decode_code_page_stateful(int code_page,
    7005             :                           const char *s, Py_ssize_t size,
    7006             :                           const char *errors, Py_ssize_t *consumed)
    7007             : {
    7008             :     wchar_t *buf = NULL;
    7009             :     Py_ssize_t bufsize = 0;
    7010             :     int chunk_size, final, converted, done;
    7011             : 
    7012             :     if (code_page < 0) {
    7013             :         PyErr_SetString(PyExc_ValueError, "invalid code page number");
    7014             :         return NULL;
    7015             :     }
    7016             :     if (size < 0) {
    7017             :         PyErr_BadInternalCall();
    7018             :         return NULL;
    7019             :     }
    7020             : 
    7021             :     if (consumed)
    7022             :         *consumed = 0;
    7023             : 
    7024             :     do
    7025             :     {
    7026             : #ifdef NEED_RETRY
    7027             :         if (size > DECODING_CHUNK_SIZE) {
    7028             :             chunk_size = DECODING_CHUNK_SIZE;
    7029             :             final = 0;
    7030             :             done = 0;
    7031             :         }
    7032             :         else
    7033             : #endif
    7034             :         {
    7035             :             chunk_size = (int)size;
    7036             :             final = (consumed == NULL);
    7037             :             done = 1;
    7038             :         }
    7039             : 
    7040             :         if (chunk_size == 0 && done) {
    7041             :             if (buf != NULL)
    7042             :                 break;
    7043             :             _Py_RETURN_UNICODE_EMPTY();
    7044             :         }
    7045             : 
    7046             :         converted = decode_code_page_strict(code_page, &buf, &bufsize,
    7047             :                                             s, chunk_size);
    7048             :         if (converted == -2)
    7049             :             converted = decode_code_page_errors(code_page, &buf, &bufsize,
    7050             :                                                 s, chunk_size,
    7051             :                                                 errors, final);
    7052             :         assert(converted != 0 || done);
    7053             : 
    7054             :         if (converted < 0) {
    7055             :             PyMem_Free(buf);
    7056             :             return NULL;
    7057             :         }
    7058             : 
    7059             :         if (consumed)
    7060             :             *consumed += converted;
    7061             : 
    7062             :         s += converted;
    7063             :         size -= converted;
    7064             :     } while (!done);
    7065             : 
    7066             :     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    7067             :     PyMem_Free(buf);
    7068             :     return v;
    7069             : }
    7070             : 
    7071             : PyObject *
    7072             : PyUnicode_DecodeCodePageStateful(int code_page,
    7073             :                                  const char *s,
    7074             :                                  Py_ssize_t size,
    7075             :                                  const char *errors,
    7076             :                                  Py_ssize_t *consumed)
    7077             : {
    7078             :     return decode_code_page_stateful(code_page, s, size, errors, consumed);
    7079             : }
    7080             : 
    7081             : PyObject *
    7082             : PyUnicode_DecodeMBCSStateful(const char *s,
    7083             :                              Py_ssize_t size,
    7084             :                              const char *errors,
    7085             :                              Py_ssize_t *consumed)
    7086             : {
    7087             :     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
    7088             : }
    7089             : 
    7090             : PyObject *
    7091             : PyUnicode_DecodeMBCS(const char *s,
    7092             :                      Py_ssize_t size,
    7093             :                      const char *errors)
    7094             : {
    7095             :     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
    7096             : }
    7097             : 
    7098             : static DWORD
    7099             : encode_code_page_flags(UINT code_page, const char *errors)
    7100             : {
    7101             :     if (code_page == CP_UTF8) {
    7102             :         return WC_ERR_INVALID_CHARS;
    7103             :     }
    7104             :     else if (code_page == CP_UTF7) {
    7105             :         /* CP_UTF7 only supports flags=0 */
    7106             :         return 0;
    7107             :     }
    7108             :     else {
    7109             :         if (errors != NULL && strcmp(errors, "replace") == 0)
    7110             :             return 0;
    7111             :         else
    7112             :             return WC_NO_BEST_FIT_CHARS;
    7113             :     }
    7114             : }
    7115             : 
    7116             : /*
    7117             :  * Encode a Unicode string to a Windows code page into a byte string in strict
    7118             :  * mode.
    7119             :  *
    7120             :  * Returns 0 on success, returns -2 on an encode error, or raises an OSError
    7121             :  * and returns -1 on other error.
    7122             :  */
    7123             : static int
    7124             : encode_code_page_strict(UINT code_page, PyObject **outbytes,
    7125             :                         PyObject *unicode, Py_ssize_t offset, int len,
    7126             :                         const char* errors)
    7127             : {
    7128             :     BOOL usedDefaultChar = FALSE;
    7129             :     BOOL *pusedDefaultChar = &usedDefaultChar;
    7130             :     int outsize;
    7131             :     wchar_t *p;
    7132             :     Py_ssize_t size;
    7133             :     const DWORD flags = encode_code_page_flags(code_page, NULL);
    7134             :     char *out;
    7135             :     /* Create a substring so that we can get the UTF-16 representation
    7136             :        of just the slice under consideration. */
    7137             :     PyObject *substring;
    7138             :     int ret = -1;
    7139             : 
    7140             :     assert(len > 0);
    7141             : 
    7142             :     if (code_page != CP_UTF8 && code_page != CP_UTF7)
    7143             :         pusedDefaultChar = &usedDefaultChar;
    7144             :     else
    7145             :         pusedDefaultChar = NULL;
    7146             : 
    7147             :     substring = PyUnicode_Substring(unicode, offset, offset+len);
    7148             :     if (substring == NULL)
    7149             :         return -1;
    7150             :     p = PyUnicode_AsWideCharString(substring, &size);
    7151             :     Py_CLEAR(substring);
    7152             :     if (p == NULL) {
    7153             :         return -1;
    7154             :     }
    7155             :     assert(size <= INT_MAX);
    7156             : 
    7157             :     /* First get the size of the result */
    7158             :     outsize = WideCharToMultiByte(code_page, flags,
    7159             :                                   p, (int)size,
    7160             :                                   NULL, 0,
    7161             :                                   NULL, pusedDefaultChar);
    7162             :     if (outsize <= 0)
    7163             :         goto error;
    7164             :     /* If we used a default char, then we failed! */
    7165             :     if (pusedDefaultChar && *pusedDefaultChar) {
    7166             :         ret = -2;
    7167             :         goto done;
    7168             :     }
    7169             : 
    7170             :     if (*outbytes == NULL) {
    7171             :         /* Create string object */
    7172             :         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
    7173             :         if (*outbytes == NULL) {
    7174             :             goto done;
    7175             :         }
    7176             :         out = PyBytes_AS_STRING(*outbytes);
    7177             :     }
    7178             :     else {
    7179             :         /* Extend string object */
    7180             :         const Py_ssize_t n = PyBytes_Size(*outbytes);
    7181             :         if (outsize > PY_SSIZE_T_MAX - n) {
    7182             :             PyErr_NoMemory();
    7183             :             goto done;
    7184             :         }
    7185             :         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
    7186             :             goto done;
    7187             :         }
    7188             :         out = PyBytes_AS_STRING(*outbytes) + n;
    7189             :     }
    7190             : 
    7191             :     /* Do the conversion */
    7192             :     outsize = WideCharToMultiByte(code_page, flags,
    7193             :                                   p, (int)size,
    7194             :                                   out, outsize,
    7195             :                                   NULL, pusedDefaultChar);
    7196             :     if (outsize <= 0)
    7197             :         goto error;
    7198             :     if (pusedDefaultChar && *pusedDefaultChar) {
    7199             :         ret = -2;
    7200             :         goto done;
    7201             :     }
    7202             :     ret = 0;
    7203             : 
    7204             : done:
    7205             :     PyMem_Free(p);
    7206             :     return ret;
    7207             : 
    7208             : error:
    7209             :     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
    7210             :         ret = -2;
    7211             :         goto done;
    7212             :     }
    7213             :     PyErr_SetFromWindowsErr(0);
    7214             :     goto done;
    7215             : }
    7216             : 
    7217             : /*
    7218             :  * Encode a Unicode string to a Windows code page into a byte string using an
    7219             :  * error handler.
                     :  *
                     :  * The insize characters of unicode starting at unicode_offset are encoded
                     :  * one at a time.  Each character that cannot be encoded is passed to the
                     :  * error handler named by errors, and the handler's replacement (bytes, or
                     :  * an ASCII-only str) is written instead.  The output is appended to
                     :  * *outbytes, which is created here when it is NULL.
    7220             :  *
    7221             :  * Returns 0 on success.  On failure, sets an exception (for example a
    7222             :  * UnicodeEncodeError, or an OSError for a Windows API failure) and returns -1.
    7223             :  */
    7224             : static int
    7225             : encode_code_page_errors(UINT code_page, PyObject **outbytes,
    7226             :                         PyObject *unicode, Py_ssize_t unicode_offset,
    7227             :                         Py_ssize_t insize, const char* errors)
    7228             : {
    7229             :     const DWORD flags = encode_code_page_flags(code_page, errors);
    7230             :     Py_ssize_t pos = unicode_offset;
    7231             :     Py_ssize_t endin = unicode_offset + insize;
    7232             :     /* Ideally, we should get reason from FormatMessage. This is the Windows
    7233             :        2000 English version of the message. */
    7234             :     const char *reason = "invalid character";
    7235             :     /* 4=maximum length of a UTF-8 sequence */
    7236             :     char buffer[4];
    7237             :     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    7238             :     Py_ssize_t outsize;
    7239             :     char *out;
    7240             :     PyObject *errorHandler = NULL;
    7241             :     PyObject *exc = NULL;
    7242             :     PyObject *encoding_obj = NULL;
    7243             :     const char *encoding;
    7244             :     Py_ssize_t newpos, newoutsize;
    7245             :     PyObject *rep;
    7246             :     int ret = -1;
    7247             : 
    7248             :     assert(insize > 0);
    7249             : 
    7250             :     encoding = code_page_name(code_page, &encoding_obj);
    7251             :     if (encoding == NULL)
    7252             :         return -1;
    7253             : 
    7254             :     if (errors == NULL || strcmp(errors, "strict") == 0) {
    7255             :         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
    7256             :            then we raise a UnicodeEncodeError. */
    7257             :         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
    7258             :         if (exc != NULL) {
    7259             :             PyCodec_StrictErrors(exc);
    7260             :             Py_DECREF(exc);
    7261             :         }
    7262             :         Py_XDECREF(encoding_obj);
    7263             :         return -1;
    7264             :     }
    7265             : /* The "default character was used" flag is only requested for code pages
                     :  * other than UTF-8 and UTF-7 (the WideCharToMultiByte documentation
                     :  * requires lpUsedDefaultChar to be NULL for those two).  A NULL pointer
                     :  * disables the corresponding check in the loop below. */
    7266             :     if (code_page != CP_UTF8 && code_page != CP_UTF7)
    7267             :         pusedDefaultChar = &usedDefaultChar;
    7268             :     else
    7269             :         pusedDefaultChar = NULL;
    7270             : /* Reserve sizeof(buffer) output bytes per input character; the
                     :  * multiplication is checked for overflow before it is performed. */
    7271             :     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
    7272             :         PyErr_NoMemory();
    7273             :         goto error;
    7274             :     }
    7275             :     outsize = insize * Py_ARRAY_LENGTH(buffer);
    7276             : 
    7277             :     if (*outbytes == NULL) {
    7278             :         /* Create string object */
    7279             :         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
    7280             :         if (*outbytes == NULL)
    7281             :             goto error;
    7282             :         out = PyBytes_AS_STRING(*outbytes);
    7283             :     }
    7284             :     else {
    7285             :         /* Extend string object */
    7286             :         Py_ssize_t n = PyBytes_Size(*outbytes);
    7287             :         if (n > PY_SSIZE_T_MAX - outsize) {
    7288             :             PyErr_NoMemory();
    7289             :             goto error;
    7290             :         }
    7291             :         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
    7292             :             goto error;
    7293             :         out = PyBytes_AS_STRING(*outbytes) + n;
    7294             :     }
    7295             : 
    7296             :     /* Encode the string character per character */
    7297             :     while (pos < endin)
    7298             :     {
    7299             :         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
    7300             :         wchar_t chars[2];
    7301             :         int charsize;
    7302             :         if (ch < 0x10000) {
    7303             :             chars[0] = (wchar_t)ch;
    7304             :             charsize = 1;
    7305             :         }
    7306             :         else {
    7307             :             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
    7308             :             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
    7309             :             charsize = 2;
    7310             :         }
    7311             : /* Try to encode this single character (1 or 2 wchar_t units) into the
                     :  * small stack buffer. */
    7312             :         outsize = WideCharToMultiByte(code_page, flags,
    7313             :                                       chars, charsize,
    7314             :                                       buffer, Py_ARRAY_LENGTH(buffer),
    7315             :                                       NULL, pusedDefaultChar);
    7316             :         if (outsize > 0) {
    7317             :             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
    7318             :             {
    7319             :                 pos++;
    7320             :                 memcpy(out, buffer, outsize);
    7321             :                 out += outsize;
    7322             :                 continue;
    7323             :             }
    7324             :         }
    7325             :         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
    7326             :             PyErr_SetFromWindowsErr(0);
    7327             :             goto error;
    7328             :         }
    7329             : /* Reached when the default character was substituted or the call failed
                     :  * with ERROR_NO_UNICODE_TRANSLATION: ask the error handler for a
                     :  * replacement for the character at pos. */
    7330             :         rep = unicode_encode_call_errorhandler(
    7331             :                   errors, &errorHandler, encoding, reason,
    7332             :                   unicode, &exc,
    7333             :                   pos, pos + 1, &newpos);
    7334             :         if (rep == NULL)
    7335             :             goto error;
    7336             : /* After the replacement length is added below, morebytes > 0 means the
                     :  * replacement is longer than the newpos - pos characters it stands
                     :  * for, so the output buffer is grown by that amount. */
    7337             :         Py_ssize_t morebytes = pos - newpos;
    7338             :         if (PyBytes_Check(rep)) {
    7339             :             outsize = PyBytes_GET_SIZE(rep);
    7340             :             morebytes += outsize;
    7341             :             if (morebytes > 0) {
    7342             :                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
    7343             :                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
    7344             :                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
    7345             :                     Py_DECREF(rep);
    7346             :                     goto error;
    7347             :                 }
    7348             :                 out = PyBytes_AS_STRING(*outbytes) + offset;
    7349             :             }
    7350             :             memcpy(out, PyBytes_AS_STRING(rep), outsize);
    7351             :             out += outsize;
    7352             :         }
    7353             :         else {
    7354             :             Py_ssize_t i;
    7355             :             int kind;
    7356             :             const void *data;
    7357             : /* Replacement that is not bytes: it is read as a str and must
                     :  * contain only ASCII characters; each is copied as one byte. */
    7358             :             outsize = PyUnicode_GET_LENGTH(rep);
    7359             :             morebytes += outsize;
    7360             :             if (morebytes > 0) {
    7361             :                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
    7362             :                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
    7363             :                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
    7364             :                     Py_DECREF(rep);
    7365             :                     goto error;
    7366             :                 }
    7367             :                 out = PyBytes_AS_STRING(*outbytes) + offset;
    7368             :             }
    7369             :             kind = PyUnicode_KIND(rep);
    7370             :             data = PyUnicode_DATA(rep);
    7371             :             for (i=0; i < outsize; i++) {
    7372             :                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    7373             :                 if (ch > 127) {
    7374             :                     raise_encode_exception(&exc,
    7375             :                         encoding, unicode,
    7376             :                         pos, pos + 1,
    7377             :                         "unable to encode error handler result to ASCII");
    7378             :                     Py_DECREF(rep);
    7379             :                     goto error;
    7380             :                 }
    7381             :                 *out = (unsigned char)ch;
    7382             :                 out++;
    7383             :             }
    7384             :         }
    7385             :         pos = newpos;
    7386             :         Py_DECREF(rep);
    7387             :     }
    7388             :     /* write a NUL byte */
    7389             :     *out = 0;
    7390             :     outsize = out - PyBytes_AS_STRING(*outbytes);
    7391             :     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    7392             :     if (_PyBytes_Resize(outbytes, outsize) < 0)
    7393             :         goto error;
    7394             :     ret = 0;
    7395             : 
    7396             : error:
    7397             :     Py_XDECREF(encoding_obj);
    7398             :     Py_XDECREF(errorHandler);
    7399             :     Py_XDECREF(exc);
    7400             :     return ret;
    7401             : }
    7402             : /*
                     :  * Encode the str object unicode to a bytes object using the Windows code
                     :  * page code_page and the error handler name errors.
                     :  *
                     :  * Each chunk is first passed to encode_code_page_strict(); when that
                     :  * returns -2 the same chunk is encoded again with
                     :  * encode_code_page_errors().  When NEED_RETRY is defined, input longer
                     :  * than DECODING_CHUNK_SIZE is processed in chunks of that size.
                     :  *
                     :  * Returns a new bytes object, or NULL with an exception set (unicode is
                     :  * not a str, code_page is negative, or encoding failed).
                     :  */
    7403             : static PyObject *
    7404             : encode_code_page(int code_page,
    7405             :                  PyObject *unicode,
    7406             :                  const char *errors)
    7407             : {
    7408             :     Py_ssize_t len;
    7409             :     PyObject *outbytes = NULL;
    7410             :     Py_ssize_t offset;
    7411             :     int chunk_len, ret, done;
    7412             : 
    7413             :     if (!PyUnicode_Check(unicode)) {
    7414             :         PyErr_BadArgument();
    7415             :         return NULL;
    7416             :     }
    7417             : 
    7418             :     len = PyUnicode_GET_LENGTH(unicode);
    7419             : 
    7420             :     if (code_page < 0) {
    7421             :         PyErr_SetString(PyExc_ValueError, "invalid code page number");
    7422             :         return NULL;
    7423             :     }
    7424             : 
    7425             :     if (len == 0)
    7426             :         return PyBytes_FromStringAndSize(NULL, 0);
    7427             : /* Encode chunk by chunk; len is the number of characters still to do and
                     :  * offset is where the next chunk starts. */
    7428             :     offset = 0;
    7429             :     do
    7430             :     {
    7431             : #ifdef NEED_RETRY
    7432             :         if (len > DECODING_CHUNK_SIZE) {
    7433             :             chunk_len = DECODING_CHUNK_SIZE;
    7434             :             done = 0;
    7435             :         }
    7436             :         else
    7437             : #endif
    7438             :         {
    7439             :             chunk_len = (int)len;
    7440             :             done = 1;
    7441             :         }
    7442             : /* The strict encoder returns -2 when a character could not be
                     :  * encoded; in that case redo the chunk with the error handler. */
    7443             :         ret = encode_code_page_strict(code_page, &outbytes,
    7444             :                                       unicode, offset, chunk_len,
    7445             :                                       errors);
    7446             :         if (ret == -2)
    7447             :             ret = encode_code_page_errors(code_page, &outbytes,
    7448             :                                           unicode, offset,
    7449             :                                           chunk_len, errors);
    7450             :         if (ret < 0) {
    7451             :             Py_XDECREF(outbytes);
    7452             :             return NULL;
    7453             :         }
    7454             : 
    7455             :         offset += chunk_len;
    7456             :         len -= chunk_len;
    7457             :     } while (!done);
    7458             : 
    7459             :     return outbytes;
    7460             : }
    7461             : /*
                     :  * Public entry point: encode unicode with the Windows code page code_page.
                     :  * Forwards its arguments unchanged to encode_code_page() and returns that
                     :  * function's result (a new bytes object, or NULL with an exception set).
                     :  */
    7462             : PyObject *
    7463             : PyUnicode_EncodeCodePage(int code_page,
    7464             :                          PyObject *unicode,
    7465             :                          const char *errors)
    7466             : {
    7467             :     return encode_code_page(code_page, unicode, errors);
    7468             : }
    7469             : /*
                     :  * Encode unicode with the CP_ACP code page.  errors is passed as NULL,
                     :  * which encode_code_page_errors() treats the same as "strict".
                     :  * Returns the result of PyUnicode_EncodeCodePage().
                     :  */
    7470             : PyObject *
    7471             : PyUnicode_AsMBCSString(PyObject *unicode)
    7472             : {
    7473             :     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    7474             : }
    7475             : 
    7476             : #undef NEED_RETRY
    7477             : 
    7478             : #endif /* MS_WINDOWS */
    7479             : 
    7480             : /* --- Character Mapping Codec -------------------------------------------- */
    7481             : /*
                     :  * Decode the size bytes at s through mapping, a str object used as a
                     :  * lookup table indexed by byte value, writing the result to writer.
                     :  *
                     :  * A byte whose value is not below the table length, or whose table entry
                     :  * is 0xFFFE, is undefined and is handed to the error handler named by
                     :  * errors.
                     :  *
                     :  * Returns 0 on success, -1 on error.
                     :  */
    7482             : static int
    7483       22774 : charmap_decode_string(const char *s,
    7484             :                       Py_ssize_t size,
    7485             :                       PyObject *mapping,
    7486             :                       const char *errors,
    7487             :                       _PyUnicodeWriter *writer)
    7488             : {
    7489       22774 :     const char *starts = s;
    7490             :     const char *e;
    7491             :     Py_ssize_t startinpos, endinpos;
    7492       22774 :     PyObject *errorHandler = NULL, *exc = NULL;
    7493             :     Py_ssize_t maplen;
    7494             :     int mapkind;
    7495             :     const void *mapdata;
    7496             :     Py_UCS4 x;
    7497             :     unsigned char ch;
    7498             : 
    7499       22774 :     maplen = PyUnicode_GET_LENGTH(mapping);
    7500       22774 :     mapdata = PyUnicode_DATA(mapping);
    7501       22774 :     mapkind = PyUnicode_KIND(mapping);
    7502             : 
    7503       22774 :     e = s + size;
    7504             : 
    7505       22774 :     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
    7506             :         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
    7507             :          * is disabled in encoding aliases, latin1 is preferred because
    7508             :          * its implementation is faster. */
    7509         112 :         const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
    7510         112 :         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
    7511         112 :         Py_UCS4 maxchar = writer->maxchar;
    7512             : 
    7513         112 :         assert (writer->kind == PyUnicode_1BYTE_KIND);
    7514       12812 :         while (s < e) {
    7515       12700 :             ch = *s;
    7516       12700 :             x = mapdata_ucs1[ch];
    7517       12700 :             if (x > maxchar) {
    7518           4 :                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
    7519           0 :                     goto onError;
    7520           4 :                 maxchar = writer->maxchar;
    7521           4 :                 outdata = (Py_UCS1 *)writer->data;
    7522             :             }
    7523       12700 :             outdata[writer->pos] = x;
    7524       12700 :             writer->pos++;
    7525       12700 :             ++s;
    7526             :         }
    7527         112 :         return 0;
    7528             :     }
    7529             : /* General path.  For a 2-byte table with at least 256 entries the inner
                     :  * loops below copy characters directly into the writer's buffer and
                     :  * jump to Error as soon as a value cannot be stored that way (it
                     :  * exceeds maxchar for a 1-byte writer, or is 0xFFFE for a 2-byte one). */
    7530       23899 :     while (s < e) {
    7531       23864 :         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
    7532       23578 :             int outkind = writer->kind;
    7533       23578 :             const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
    7534       23578 :             if (outkind == PyUnicode_1BYTE_KIND) {
    7535       22824 :                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
    7536       22824 :                 Py_UCS4 maxchar = writer->maxchar;
    7537     1016750 :                 while (s < e) {
    7538      994321 :                     ch = *s;
    7539      994321 :                     x = mapdata_ucs2[ch];
    7540      994321 :                     if (x > maxchar)
    7541         399 :                         goto Error;
    7542      993922 :                     outdata[writer->pos] = x;
    7543      993922 :                     writer->pos++;
    7544      993922 :                     ++s;
    7545             :                 }
    7546       22425 :                 break;
    7547             :             }
    7548         754 :             else if (outkind == PyUnicode_2BYTE_KIND) {
    7549         754 :                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
    7550       43059 :                 while (s < e) {
    7551       42859 :                     ch = *s;
    7552       42859 :                     x = mapdata_ucs2[ch];
    7553       42859 :                     if (x == 0xFFFE)
    7554         554 :                         goto Error;
    7555       42305 :                     outdata[writer->pos] = x;
    7556       42305 :                     writer->pos++;
    7557       42305 :                     ++s;
    7558             :                 }
    7559         200 :                 break;
    7560             :             }
    7561             :         }
    7562         286 :         ch = *s;
    7563             : /* Generic lookup: a byte value not below the table length has no
                     :  * entry and is treated as undefined. */
    7564         286 :         if (ch < maplen)
    7565          26 :             x = PyUnicode_READ(mapkind, mapdata, ch);
    7566             :         else
    7567         260 :             x = 0xfffe; /* invalid value */
    7568        1239 : Error:
    7569        1239 :         if (x == 0xfffe)
    7570             :         {
    7571             :             /* undefined mapping */
    7572         876 :             startinpos = s-starts;
    7573         876 :             endinpos = startinpos+1;
    7574         876 :             if (unicode_decode_call_errorhandler_writer(
    7575             :                     errors, &errorHandler,
    7576             :                     "charmap", "character maps to <undefined>",
    7577             :                     &starts, &e, &startinpos, &endinpos, &exc, &s,
    7578             :                     writer)) {
    7579           2 :                 goto onError;
    7580             :             }
    7581         874 :             continue;
    7582             :         }
    7583             : /* x is a defined character: let the writer store it, then move on to
                     :  * the next byte. */
    7584         363 :         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
    7585           0 :             goto onError;
    7586         363 :         ++s;
    7587             :     }
    7588       22660 :     Py_XDECREF(errorHandler);
    7589       22660 :     Py_XDECREF(exc);
    7590       22660 :     return 0;
    7591             : 
    7592           2 : onError:
    7593           2 :     Py_XDECREF(errorHandler);
    7594           2 :     Py_XDECREF(exc);
    7595           2 :     return -1;
    7596             : }
    7597             : /*
                     :  * Decode the size bytes at s through mapping, an arbitrary object that is
                     :  * indexed with PyObject_GetItem() using the byte value as an int key, and
                     :  * write the result to writer.
                     :  *
                     :  * Accepted lookup results:
                     :  *   - int: a code point in 0..MAX_UNICODE (TypeError when out of range)
                     :  *   - str: a single character, or a longer string written as a whole
                     :  *   - None, the value 0xFFFE, or a LookupError from the lookup: the byte
                     :  *     is undefined and is handed to the error handler named by errors
                     :  * Any other result type raises TypeError.
                     :  *
                     :  * Returns 0 on success, -1 with an exception set on error.
                     :  */
    7598             : static int
    7599          45 : charmap_decode_mapping(const char *s,
    7600             :                        Py_ssize_t size,
    7601             :                        PyObject *mapping,
    7602             :                        const char *errors,
    7603             :                        _PyUnicodeWriter *writer)
    7604             : {
    7605          45 :     const char *starts = s;
    7606             :     const char *e;
    7607             :     Py_ssize_t startinpos, endinpos;
    7608          45 :     PyObject *errorHandler = NULL, *exc = NULL;
    7609             :     unsigned char ch;
    7610          45 :     PyObject *key, *item = NULL;
    7611             : 
    7612          45 :     e = s + size;
    7613             : 
    7614         420 :     while (s < e) {
    7615         387 :         ch = *s;
    7616             : 
    7617             :         /* Get mapping (char ordinal -> integer, Unicode char or None) */
    7618         387 :         key = PyLong_FromLong((long)ch);
    7619         387 :         if (key == NULL)
    7620           0 :             goto onError;
    7621             : 
    7622         387 :         item = PyObject_GetItem(mapping, key);
    7623         387 :         Py_DECREF(key);
    7624         387 :         if (item == NULL) {
    7625         267 :             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    7626             :                 /* No mapping found means: mapping is undefined. */
    7627         266 :                 PyErr_Clear();
    7628         266 :                 goto Undefined;
    7629             :             } else
    7630           1 :                 goto onError;
    7631             :         }
    7632             : 
    7633             :         /* Apply mapping */
    7634         120 :         if (item == Py_None)
    7635           6 :             goto Undefined;
    7636         114 :         if (PyLong_Check(item)) {
    7637          59 :             long value = PyLong_AS_LONG(item);
    7638          59 :             if (value == 0xFFFE)
    7639           4 :                 goto Undefined;
    7640          55 :             if (value < 0 || value > MAX_UNICODE) {
    7641           4 :                 PyErr_Format(PyExc_TypeError,
    7642             :                              "character mapping must be in range(0x%x)",
    7643             :                              (unsigned long)MAX_UNICODE + 1);
    7644           4 :                 goto onError;
    7645             :             }
    7646             : 
    7647          51 :             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
    7648           0 :                 goto onError;
    7649             :         }
    7650          55 :         else if (PyUnicode_Check(item)) {
    7651          55 :             if (PyUnicode_GET_LENGTH(item) == 1) {
    7652          39 :                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
    7653          39 :                 if (value == 0xFFFE)
    7654           4 :                     goto Undefined;
    7655          35 :                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
    7656           0 :                     goto onError;
    7657             :             }
    7658             :             else {
    7659          16 :                 writer->overallocate = 1;
    7660          16 :                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
    7661           0 :                     goto onError;
    7662             :             }
    7663             :         }
    7664             :         else {
    7665             :             /* wrong return value */
    7666           0 :             PyErr_SetString(PyExc_TypeError,
    7667             :                             "character mapping must return integer, None or str");
    7668           0 :             goto onError;
    7669             :         }
    7670         102 :         Py_CLEAR(item);
    7671         102 :         ++s;
    7672         102 :         continue;
    7673             : /* Reached only by goto.  s is not advanced here; the error handler
                     :  * call receives &s and is what moves the input position. */
    7674         280 : Undefined:
    7675             :         /* undefined mapping */
    7676         280 :         Py_CLEAR(item);
    7677         280 :         startinpos = s-starts;
    7678         280 :         endinpos = startinpos+1;
    7679         280 :         if (unicode_decode_call_errorhandler_writer(
    7680             :                 errors, &errorHandler,
    7681             :                 "charmap", "character maps to <undefined>",
    7682             :                 &starts, &e, &startinpos, &endinpos, &exc, &s,
    7683             :                 writer)) {
    7684           7 :             goto onError;
    7685             :         }
    7686             :     }
    7687          33 :     Py_XDECREF(errorHandler);
    7688          33 :     Py_XDECREF(exc);
    7689          33 :     return 0;
    7690             : 
    7691          12 : onError:
    7692          12 :     Py_XDECREF(item);
    7693          12 :     Py_XDECREF(errorHandler);
    7694          12 :     Py_XDECREF(exc);
    7695          12 :     return -1;
    7696             : }
    7697             : /*
                     :  * Decode the size bytes at s to a str object using the character mapping
                     :  * mapping and the error handler name errors.
                     :  *
                     :  *   - mapping == NULL: decode as Latin-1 via PyUnicode_DecodeLatin1()
                     :  *   - mapping is exactly a str: use it as a lookup table
                     :  *     (charmap_decode_string)
                     :  *   - anything else: look each byte up with the mapping protocol
                     :  *     (charmap_decode_mapping)
                     :  *
                     :  * Returns a new str object (the empty string when size is 0), or NULL
                     :  * on error.
                     :  */
    7698             : PyObject *
    7699       23728 : PyUnicode_DecodeCharmap(const char *s,
    7700             :                         Py_ssize_t size,
    7701             :                         PyObject *mapping,
    7702             :                         const char *errors)
    7703             : {
    7704             :     _PyUnicodeWriter writer;
    7705             : 
    7706             :     /* Default to Latin-1 */
    7707       23728 :     if (mapping == NULL)
    7708          67 :         return PyUnicode_DecodeLatin1(s, size, errors);
    7709             : /* The writer is pre-sized for one output character per input byte and
                     :  * starts with a maximum character of 127. */
    7710       23661 :     if (size == 0)
    7711         842 :         _Py_RETURN_UNICODE_EMPTY();
    7712       22819 :     _PyUnicodeWriter_Init(&writer);
    7713       22819 :     writer.min_length = size;
    7714       22819 :     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
    7715           0 :         goto onError;
    7716             : 
    7717       22819 :     if (PyUnicode_CheckExact(mapping)) {
    7718       22774 :         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
    7719           2 :             goto onError;
    7720             :     }
    7721             :     else {
    7722          45 :         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
    7723          12 :             goto onError;
    7724             :     }
    7725       22805 :     return _PyUnicodeWriter_Finish(&writer);
    7726             : 
    7727          14 :   onError:
    7728          14 :     _PyUnicodeWriter_Dealloc(&writer);
    7729          14 :     return NULL;
    7730             : }
    7731             : 
    7732             : /* Charmap encoding: the lookup table */
    7733             : 
    7734             : /*[clinic input]
    7735             : class EncodingMap "struct encoding_map *" "&EncodingMapType"
    7736             : [clinic start generated code]*/
    7737             : /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
    7738             : /*
                     :  * Three-level lookup table built by PyUnicode_BuildEncodingMap().
                     :  *
                     :  *   level1   indexed by ch >> 11; holds a level-2 block number, or 0xFF
                     :  *            when the block is unused
                     :  *   count2   number of 16-byte level-2 blocks stored in level23
                     :  *   count3   number of 128-byte level-3 blocks stored in level23
                     :  *   level23  first byte of the variable-length tail allocated past the
                     :  *            struct: the level-2 blocks followed by the level-3 blocks
                     :  */
    7739             : struct encoding_map {
    7740             :     PyObject_HEAD
    7741             :     unsigned char level1[32];
    7742             :     int count2, count3;
    7743             :     unsigned char level23[1];
    7744             : };
    7745             : 
    7746             : /*[clinic input]
    7747             : EncodingMap.size
    7748             : 
    7749             : Return the size (in bytes) of this object.
    7750             : [clinic start generated code]*/
    7751             : /* Returns the object's size in bytes as a Python int: the fixed part of
                     :  * the struct without the 1-byte level23 placeholder, plus 16 bytes per
                     :  * level-2 block and 128 bytes per level-3 block.  This is the same
                     :  * expression PyUnicode_BuildEncodingMap() uses for the allocation. */
    7752             : static PyObject *
    7753           0 : EncodingMap_size_impl(struct encoding_map *self)
    7754             : /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
    7755             : {
    7756           0 :     return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
    7757           0 :                            128*self->count3);
    7758             : }
    7759             : /* Method table for EncodingMapType: the single entry supplied by
                     :  * ENCODINGMAP_SIZE_METHODDEF, followed by the {NULL, NULL} sentinel. */
    7760             : static PyMethodDef encoding_map_methods[] = {
    7761             :     ENCODINGMAP_SIZE_METHODDEF
    7762             :     {NULL, NULL}
    7763             : };
    7764             : /* Type object for struct encoding_map instances.  Only the name, basic
                     :  * size, default flags and method table are set here; instances are
                     :  * allocated and initialized by PyUnicode_BuildEncodingMap(). */
    7765             : static PyTypeObject EncodingMapType = {
    7766             :     PyVarObject_HEAD_INIT(NULL, 0)
    7767             :     .tp_name = "EncodingMap",
    7768             :     .tp_basicsize = sizeof(struct encoding_map),
    7769             :     /* methods */
    7770             :     .tp_flags = Py_TPFLAGS_DEFAULT,
    7771             :     .tp_methods = encoding_map_methods,
    7772             : };
    7773             : /*
                     :  * Build an encoding lookup object from string, a decoding table in which
                     :  * the character at index i is what byte value i decodes to.  Only the
                     :  * first 256 characters are used; 0xFFFE entries mean "unmapped".
                     :  *
                     :  * The result is normally an EncodingMap object: a three-level trie keyed
                     :  * on the character (bits 11 and up, then 4 bits, then the low 7 bits)
                     :  * whose leaves hold the index i.  A plain dict {code point: index} is
                     :  * returned instead when the trie cannot be used: index 0 does not hold
                     :  * character 0, character 0 appears at another index, a character is
                     :  * above 0xFFFF, or 0xFF or more blocks would be needed at a level.
                     :  *
                     :  * Returns a new object, or NULL with an exception set (string is not a
                     :  * str, string is empty, or out of memory).
                     :  */
    7774             : PyObject*
    7775         218 : PyUnicode_BuildEncodingMap(PyObject* string)
    7776             : {
    7777             :     PyObject *result;
    7778             :     struct encoding_map *mresult;
    7779             :     int i;
    7780         218 :     int need_dict = 0;
    7781             :     unsigned char level1[32];
    7782             :     unsigned char level2[512];
    7783             :     unsigned char *mlevel1, *mlevel2, *mlevel3;
    7784         218 :     int count2 = 0, count3 = 0;
    7785             :     int kind;
    7786             :     const void *data;
    7787             :     Py_ssize_t length;
    7788             :     Py_UCS4 ch;
    7789             : 
    7790         218 :     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
    7791           0 :         PyErr_BadArgument();
    7792           0 :         return NULL;
    7793             :     }
    7794         218 :     kind = PyUnicode_KIND(string);
    7795         218 :     data = PyUnicode_DATA(string);
    7796         218 :     length = PyUnicode_GET_LENGTH(string);
    7797         218 :     length = Py_MIN(length, 256);
    7798         218 :     memset(level1, 0xFF, sizeof level1);
    7799         218 :     memset(level2, 0xFF, sizeof level2);
    7800             : /* First pass: find out which level-2 and level-3 blocks are needed and
                     :  * count them.  0xFF in level1/level2 marks a block not seen yet. */
    7801             :     /* If there isn't a one-to-one mapping of NULL to \0,
    7802             :        or if there are non-BMP characters, we need to use
    7803             :        a mapping dictionary. */
    7804         218 :     if (PyUnicode_READ(kind, data, 0) != 0)
    7805           0 :         need_dict = 1;
    7806       55808 :     for (i = 1; i < length; i++) {
    7807             :         int l1, l2;
    7808       55590 :         ch = PyUnicode_READ(kind, data, i);
    7809       55590 :         if (ch == 0 || ch > 0xFFFF) {
    7810           0 :             need_dict = 1;
    7811           0 :             break;
    7812             :         }
    7813       55590 :         if (ch == 0xFFFE)
    7814             :             /* unmapped character */
    7815        1340 :             continue;
    7816       54250 :         l1 = ch >> 11;
    7817       54250 :         l2 = ch >> 7;
    7818       54250 :         if (level1[l1] == 0xFF)
    7819         417 :             level1[l1] = count2++;
    7820       54250 :         if (level2[l2] == 0xFF)
    7821        1306 :             level2[l2] = count3++;
    7822             :     }
    7823             : /* Block numbers are stored in unsigned char with 0xFF meaning "unused",
                     :  * so a level with 0xFF or more blocks falls back to the dict. */
    7824         218 :     if (count2 >= 0xFF || count3 >= 0xFF)
    7825           0 :         need_dict = 1;
    7826             : 
    7827         218 :     if (need_dict) {
    7828           0 :         PyObject *result = PyDict_New();
    7829             :         PyObject *key, *value;
    7830           0 :         if (!result)
    7831           0 :             return NULL;
    7832           0 :         for (i = 0; i < length; i++) {
    7833           0 :             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
    7834           0 :             value = PyLong_FromLong(i);
    7835           0 :             if (!key || !value)
    7836           0 :                 goto failed1;
    7837           0 :             if (PyDict_SetItem(result, key, value) == -1)
    7838           0 :                 goto failed1;
    7839           0 :             Py_DECREF(key);
    7840           0 :             Py_DECREF(value);
    7841             :         }
    7842           0 :         return result;
    7843           0 :       failed1:
    7844           0 :         Py_XDECREF(key);
    7845           0 :         Py_XDECREF(value);
    7846           0 :         Py_DECREF(result);
    7847           0 :         return NULL;
    7848             :     }
    7849             : 
    7850             :     /* Create a three-level trie */
    7851         218 :     result = PyObject_Malloc(sizeof(struct encoding_map) +
    7852         218 :                              16*count2 + 128*count3 - 1);
    7853         218 :     if (!result) {
    7854           0 :         return PyErr_NoMemory();
    7855             :     }
    7856             : /* The tail starting at level23 holds count2 level-2 blocks of 16 bytes
                     :  * followed by count3 level-3 blocks of 128 bytes.  Level-2 entries start
                     :  * as 0xFF (unused) and level-3 entries start as 0.  count3 is then
                     :  * reset and reused to hand out level-3 block numbers in the second
                     :  * pass, which stores the index i of each character in its leaf. */
    7857         218 :     _PyObject_Init(result, &EncodingMapType);
    7858         218 :     mresult = (struct encoding_map*)result;
    7859         218 :     mresult->count2 = count2;
    7860         218 :     mresult->count3 = count3;
    7861         218 :     mlevel1 = mresult->level1;
    7862         218 :     mlevel2 = mresult->level23;
    7863         218 :     mlevel3 = mresult->level23 + 16*count2;
    7864         218 :     memcpy(mlevel1, level1, 32);
    7865         218 :     memset(mlevel2, 0xFF, 16*count2);
    7866         218 :     memset(mlevel3, 0, 128*count3);
    7867         218 :     count3 = 0;
    7868       55808 :     for (i = 1; i < length; i++) {
    7869             :         int o1, o2, o3, i2, i3;
    7870       55590 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    7871       55590 :         if (ch == 0xFFFE)
    7872             :             /* unmapped character */
    7873        1340 :             continue;
    7874       54250 :         o1 = ch>>11;
    7875       54250 :         o2 = (ch>>7) & 0xF;
    7876       54250 :         i2 = 16*mlevel1[o1] + o2;
    7877       54250 :         if (mlevel2[i2] == 0xFF)
    7878        1306 :             mlevel2[i2] = count3++;
    7879       54250 :         o3 = ch & 0x7F;
    7880       54250 :         i3 = 128*mlevel2[i2] + o3;
    7881       54250 :         mlevel3[i3] = i;
    7882             :     }
    7883         218 :     return result;
    7884             : }
    7885             : 
    7886             : static int
    7887      220485 : encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
    7888             : {
    7889      220485 :     struct encoding_map *map = (struct encoding_map*)mapping;
    7890      220485 :     int l1 = c>>11;
    7891      220485 :     int l2 = (c>>7) & 0xF;
    7892      220485 :     int l3 = c & 0x7F;
    7893             :     int i;
    7894             : 
    7895      220485 :     if (c > 0xFFFF)
    7896          10 :         return -1;
    7897      220475 :     if (c == 0)
    7898          39 :         return 0;
    7899             :     /* level 1*/
    7900      220436 :     i = map->level1[l1];
    7901      220436 :     if (i == 0xFF) {
    7902       12074 :         return -1;
    7903             :     }
    7904             :     /* level 2*/
    7905      208362 :     i = map->level23[16*i+l2];
    7906      208362 :     if (i == 0xFF) {
    7907           9 :         return -1;
    7908             :     }
    7909             :     /* level 3 */
    7910      208353 :     i = map->level23[16*map->count2 + 128*i + l3];
    7911      208353 :     if (i == 0) {
    7912          14 :         return -1;
    7913             :     }
    7914      208339 :     return i;
    7915             : }
    7916             : 
    7917             : /* Lookup the character ch in the mapping. If the character
    7918             :    can't be found, Py_None is returned (or NULL, if another
    7919             :    error occurred). */
    7920             : static PyObject *
    7921       28125 : charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
    7922             : {
    7923       28125 :     PyObject *w = PyLong_FromLong((long)c);
    7924             :     PyObject *x;
    7925             : 
    7926       28125 :     if (w == NULL)
    7927           0 :         return NULL;
    7928       28125 :     x = PyObject_GetItem(mapping, w);
    7929       28125 :     Py_DECREF(w);
    7930       28125 :     if (x == NULL) {
    7931          42 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    7932             :             /* No mapping found means: mapping is undefined. */
    7933          36 :             PyErr_Clear();
    7934          36 :             Py_RETURN_NONE;
    7935             :         } else
    7936           6 :             return NULL;
    7937             :     }
    7938       28083 :     else if (x == Py_None)
    7939           6 :         return x;
    7940       28077 :     else if (PyLong_Check(x)) {
    7941       28056 :         long value = PyLong_AS_LONG(x);
    7942       28056 :         if (value < 0 || value > 255) {
    7943           6 :             PyErr_SetString(PyExc_TypeError,
    7944             :                             "character mapping must be in range(256)");
    7945           6 :             Py_DECREF(x);
    7946           6 :             return NULL;
    7947             :         }
    7948       28050 :         return x;
    7949             :     }
    7950          21 :     else if (PyBytes_Check(x))
    7951          20 :         return x;
    7952             :     else {
    7953             :         /* wrong return value */
    7954           1 :         PyErr_Format(PyExc_TypeError,
    7955             :                      "character mapping must return integer, bytes or None, not %.400s",
    7956           1 :                      Py_TYPE(x)->tp_name);
    7957           1 :         Py_DECREF(x);
    7958           1 :         return NULL;
    7959             :     }
    7960             : }
    7961             : 
    7962             : static int
    7963          74 : charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
    7964             : {
    7965          74 :     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    7966             :     /* exponentially overallocate to minimize reallocations */
    7967          74 :     if (requiredsize < 2*outsize)
    7968          74 :         requiredsize = 2*outsize;
    7969          74 :     if (_PyBytes_Resize(outobj, requiredsize))
    7970           0 :         return -1;
    7971          74 :     return 0;
    7972             : }
    7973             : 
    7974             : typedef enum charmapencode_result {
    7975             :     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
    7976             : } charmapencode_result;
    7977             : /* lookup the character, put the result in the output string and adjust
    7978             :    various state variables. Resize the output bytes object if not enough
    7979             :    space is available. Return a new reference to the object that
    7980             :    was put in the output buffer, or Py_None, if the mapping was undefined
    7981             :    (in which case no character was written) or NULL, if a
    7982             :    reallocation error occurred. The caller must decref the result */
    7983             : static charmapencode_result
    7984      236558 : charmapencode_output(Py_UCS4 c, PyObject *mapping,
    7985             :                      PyObject **outobj, Py_ssize_t *outpos)
    7986             : {
    7987             :     PyObject *rep;
    7988             :     char *outstart;
    7989      236558 :     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    7990             : 
    7991      236558 :     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
    7992      208461 :         int res = encoding_map_lookup(c, mapping);
    7993      208461 :         Py_ssize_t requiredsize = *outpos+1;
    7994      208461 :         if (res == -1)
    7995          98 :             return enc_FAILED;
    7996      208363 :         if (outsize<requiredsize)
    7997          60 :             if (charmapencode_resize(outobj, outpos, requiredsize))
    7998           0 :                 return enc_EXCEPTION;
    7999      208363 :         outstart = PyBytes_AS_STRING(*outobj);
    8000      208363 :         outstart[(*outpos)++] = (char)res;
    8001      208363 :         return enc_SUCCESS;
    8002             :     }
    8003             : 
    8004       28097 :     rep = charmapencode_lookup(c, mapping);
    8005       28097 :     if (rep==NULL)
    8006          13 :         return enc_EXCEPTION;
    8007       28084 :     else if (rep==Py_None) {
    8008          38 :         Py_DECREF(rep);
    8009          38 :         return enc_FAILED;
    8010             :     } else {
    8011       28046 :         if (PyLong_Check(rep)) {
    8012       28026 :             Py_ssize_t requiredsize = *outpos+1;
    8013       28026 :             if (outsize<requiredsize)
    8014           8 :                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
    8015           0 :                     Py_DECREF(rep);
    8016           0 :                     return enc_EXCEPTION;
    8017             :                 }
    8018       28026 :             outstart = PyBytes_AS_STRING(*outobj);
    8019       28026 :             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
    8020             :         }
    8021             :         else {
    8022          20 :             const char *repchars = PyBytes_AS_STRING(rep);
    8023          20 :             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
    8024          20 :             Py_ssize_t requiredsize = *outpos+repsize;
    8025          20 :             if (outsize<requiredsize)
    8026           6 :                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
    8027           0 :                     Py_DECREF(rep);
    8028           0 :                     return enc_EXCEPTION;
    8029             :                 }
    8030          20 :             outstart = PyBytes_AS_STRING(*outobj);
    8031          20 :             memcpy(outstart + *outpos, repchars, repsize);
    8032          20 :             *outpos += repsize;
    8033             :         }
    8034             :     }
    8035       28046 :     Py_DECREF(rep);
    8036       28046 :     return enc_SUCCESS;
    8037             : }
    8038             : 
    8039             : /* handle an error in PyUnicode_EncodeCharmap
    8040             :    Return 0 on success, -1 on error */
    8041             : static int
    8042         129 : charmap_encoding_error(
    8043             :     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    8044             :     PyObject **exceptionObject,
    8045             :     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    8046             :     PyObject **res, Py_ssize_t *respos)
    8047             : {
    8048         129 :     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    8049             :     Py_ssize_t size, repsize;
    8050             :     Py_ssize_t newpos;
    8051             :     int kind;
    8052             :     const void *data;
    8053             :     Py_ssize_t index;
    8054             :     /* startpos for collecting unencodable chars */
    8055         129 :     Py_ssize_t collstartpos = *inpos;
    8056         129 :     Py_ssize_t collendpos = *inpos+1;
    8057             :     Py_ssize_t collpos;
    8058         129 :     const char *encoding = "charmap";
    8059         129 :     const char *reason = "character maps to <undefined>";
    8060             :     charmapencode_result x;
    8061             :     Py_UCS4 ch;
    8062             :     int val;
    8063             : 
    8064         129 :     size = PyUnicode_GET_LENGTH(unicode);
    8065             :     /* find all unencodable characters */
    8066       12142 :     while (collendpos < size) {
    8067             :         PyObject *rep;
    8068       12052 :         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
    8069       12024 :             ch = PyUnicode_READ_CHAR(unicode, collendpos);
    8070       12024 :             val = encoding_map_lookup(ch, mapping);
    8071       12024 :             if (val != -1)
    8072          15 :                 break;
    8073       12009 :             ++collendpos;
    8074       12009 :             continue;
    8075             :         }
    8076             : 
    8077          28 :         ch = PyUnicode_READ_CHAR(unicode, collendpos);
    8078          28 :         rep = charmapencode_lookup(ch, mapping);
    8079          28 :         if (rep==NULL)
    8080           0 :             return -1;
    8081          28 :         else if (rep!=Py_None) {
    8082          24 :             Py_DECREF(rep);
    8083          24 :             break;
    8084             :         }
    8085           4 :         Py_DECREF(rep);
    8086           4 :         ++collendpos;
    8087             :     }
    8088             :     /* cache callback name lookup
    8089             :      * (if not done yet, i.e. it's the first error) */
    8090         129 :     if (*error_handler == _Py_ERROR_UNKNOWN)
    8091          62 :         *error_handler = _Py_GetErrorHandler(errors);
    8092             : 
    8093         129 :     switch (*error_handler) {
    8094           5 :     case _Py_ERROR_STRICT:
    8095           5 :         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8096           5 :         return -1;
    8097             : 
    8098           8 :     case _Py_ERROR_REPLACE:
    8099        1015 :         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
    8100        1009 :             x = charmapencode_output('?', mapping, res, respos);
    8101        1009 :             if (x==enc_EXCEPTION) {
    8102           1 :                 return -1;
    8103             :             }
    8104        1008 :             else if (x==enc_FAILED) {
    8105           1 :                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8106           1 :                 return -1;
    8107             :             }
    8108             :         }
    8109             :         /* fall through */
    8110             :     case _Py_ERROR_IGNORE:
    8111          11 :         *inpos = collendpos;
    8112          11 :         break;
    8113             : 
    8114           9 :     case _Py_ERROR_XMLCHARREFREPLACE:
    8115             :         /* generate replacement (temporarily (mis)uses p) */
    8116        1023 :         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
    8117             :             char buffer[2+29+1+1];
    8118             :             char *cp;
    8119        1015 :             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
    8120        9115 :             for (cp = buffer; *cp; ++cp) {
    8121        8101 :                 x = charmapencode_output(*cp, mapping, res, respos);
    8122        8101 :                 if (x==enc_EXCEPTION)
    8123           1 :                     return -1;
    8124        8101 :                 else if (x==enc_FAILED) {
    8125           1 :                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8126           1 :                     return -1;
    8127             :                 }
    8128             :             }
    8129             :         }
    8130           8 :         *inpos = collendpos;
    8131           8 :         break;
    8132             : 
    8133         102 :     default:
    8134         102 :         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
    8135             :                                                       encoding, reason, unicode, exceptionObject,
    8136             :                                                       collstartpos, collendpos, &newpos);
    8137         102 :         if (repunicode == NULL)
    8138          12 :             return -1;
    8139          90 :         if (PyBytes_Check(repunicode)) {
    8140             :             /* Directly copy bytes result to output. */
    8141           2 :             Py_ssize_t outsize = PyBytes_Size(*res);
    8142             :             Py_ssize_t requiredsize;
    8143           2 :             repsize = PyBytes_Size(repunicode);
    8144           2 :             requiredsize = *respos + repsize;
    8145           2 :             if (requiredsize > outsize)
    8146             :                 /* Make room for all additional bytes. */
    8147           0 :                 if (charmapencode_resize(res, respos, requiredsize)) {
    8148           0 :                     Py_DECREF(repunicode);
    8149           0 :                     return -1;
    8150             :                 }
    8151           2 :             memcpy(PyBytes_AsString(*res) + *respos,
    8152           2 :                    PyBytes_AsString(repunicode),  repsize);
    8153           2 :             *respos += repsize;
    8154           2 :             *inpos = newpos;
    8155           2 :             Py_DECREF(repunicode);
    8156           2 :             break;
    8157             :         }
    8158             :         /* generate replacement  */
    8159          88 :         repsize = PyUnicode_GET_LENGTH(repunicode);
    8160          88 :         data = PyUnicode_DATA(repunicode);
    8161          88 :         kind = PyUnicode_KIND(repunicode);
    8162       63480 :         for (index = 0; index < repsize; index++) {
    8163       63397 :             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
    8164       63397 :             x = charmapencode_output(repch, mapping, res, respos);
    8165       63397 :             if (x==enc_EXCEPTION) {
    8166           0 :                 Py_DECREF(repunicode);
    8167           0 :                 return -1;
    8168             :             }
    8169       63397 :             else if (x==enc_FAILED) {
    8170           5 :                 Py_DECREF(repunicode);
    8171           5 :                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8172           5 :                 return -1;
    8173             :             }
    8174             :         }
    8175          83 :         *inpos = newpos;
    8176          83 :         Py_DECREF(repunicode);
    8177             :     }
    8178         104 :     return 0;
    8179             : }
    8180             : 
    8181             : PyObject *
    8182        4689 : _PyUnicode_EncodeCharmap(PyObject *unicode,
    8183             :                          PyObject *mapping,
    8184             :                          const char *errors)
    8185             : {
    8186             :     /* output object */
    8187        4689 :     PyObject *res = NULL;
    8188             :     /* current input position */
    8189        4689 :     Py_ssize_t inpos = 0;
    8190             :     Py_ssize_t size;
    8191             :     /* current output position */
    8192        4689 :     Py_ssize_t respos = 0;
    8193        4689 :     PyObject *error_handler_obj = NULL;
    8194        4689 :     PyObject *exc = NULL;
    8195        4689 :     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    8196             :     const void *data;
    8197             :     int kind;
    8198             : 
    8199        4689 :     size = PyUnicode_GET_LENGTH(unicode);
    8200        4689 :     data = PyUnicode_DATA(unicode);
    8201        4689 :     kind = PyUnicode_KIND(unicode);
    8202             : 
    8203             :     /* Default to Latin-1 */
    8204        4689 :     if (mapping == NULL)
    8205          58 :         return unicode_encode_ucs1(unicode, errors, 256);
    8206             : 
    8207             :     /* allocate enough for a simple encoding without
    8208             :        replacements, if we need more, we'll resize */
    8209        4631 :     res = PyBytes_FromStringAndSize(NULL, size);
    8210        4631 :     if (res == NULL)
    8211           0 :         goto onError;
    8212        4631 :     if (size == 0)
    8213         385 :         return res;
    8214             : 
    8215      168260 :     while (inpos<size) {
    8216      164051 :         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
    8217             :         /* try to encode it */
    8218      164051 :         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
    8219      164051 :         if (x==enc_EXCEPTION) /* error */
    8220          12 :             goto onError;
    8221      164039 :         if (x==enc_FAILED) { /* unencodable character */
    8222         129 :             if (charmap_encoding_error(unicode, &inpos, mapping,
    8223             :                                        &exc,
    8224             :                                        &error_handler, &error_handler_obj, errors,
    8225             :                                        &res, &respos)) {
    8226          25 :                 goto onError;
    8227             :             }
    8228             :         }
    8229             :         else
    8230             :             /* done with this character => adjust input position */
    8231      163910 :             ++inpos;
    8232             :     }
    8233             : 
    8234             :     /* Resize if we allocated to much */
    8235        4209 :     if (respos<PyBytes_GET_SIZE(res))
    8236          33 :         if (_PyBytes_Resize(&res, respos) < 0)
    8237           0 :             goto onError;
    8238             : 
    8239        4209 :     Py_XDECREF(exc);
    8240        4209 :     Py_XDECREF(error_handler_obj);
    8241        4209 :     return res;
    8242             : 
    8243          37 :   onError:
    8244          37 :     Py_XDECREF(res);
    8245          37 :     Py_XDECREF(exc);
    8246          37 :     Py_XDECREF(error_handler_obj);
    8247          37 :     return NULL;
    8248             : }
    8249             : 
    8250             : PyObject *
    8251           0 : PyUnicode_AsCharmapString(PyObject *unicode,
    8252             :                           PyObject *mapping)
    8253             : {
    8254           0 :     if (!PyUnicode_Check(unicode) || mapping == NULL) {
    8255           0 :         PyErr_BadArgument();
    8256           0 :         return NULL;
    8257             :     }
    8258           0 :     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    8259             : }
    8260             : 
    8261             : /* create or adjust a UnicodeTranslateError */
    8262             : static void
    8263           0 : make_translate_exception(PyObject **exceptionObject,
    8264             :                          PyObject *unicode,
    8265             :                          Py_ssize_t startpos, Py_ssize_t endpos,
    8266             :                          const char *reason)
    8267             : {
    8268           0 :     if (*exceptionObject == NULL) {
    8269           0 :         *exceptionObject = _PyUnicodeTranslateError_Create(
    8270             :             unicode, startpos, endpos, reason);
    8271             :     }
    8272             :     else {
    8273           0 :         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
    8274           0 :             goto onError;
    8275           0 :         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
    8276           0 :             goto onError;
    8277           0 :         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
    8278           0 :             goto onError;
    8279           0 :         return;
    8280           0 :       onError:
    8281           0 :         Py_CLEAR(*exceptionObject);
    8282             :     }
    8283             : }
    8284             : 
    8285             : /* error handling callback helper:
    8286             :    build arguments, call the callback and check the arguments,
    8287             :    put the result into newpos and return the replacement string, which
    8288             :    has to be freed by the caller */
    8289             : static PyObject *
    8290           0 : unicode_translate_call_errorhandler(const char *errors,
    8291             :                                     PyObject **errorHandler,
    8292             :                                     const char *reason,
    8293             :                                     PyObject *unicode, PyObject **exceptionObject,
    8294             :                                     Py_ssize_t startpos, Py_ssize_t endpos,
    8295             :                                     Py_ssize_t *newpos)
    8296             : {
    8297             :     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
    8298             : 
    8299             :     Py_ssize_t i_newpos;
    8300             :     PyObject *restuple;
    8301             :     PyObject *resunicode;
    8302             : 
    8303           0 :     if (*errorHandler == NULL) {
    8304           0 :         *errorHandler = PyCodec_LookupError(errors);
    8305           0 :         if (*errorHandler == NULL)
    8306           0 :             return NULL;
    8307             :     }
    8308             : 
    8309           0 :     make_translate_exception(exceptionObject,
    8310             :                              unicode, startpos, endpos, reason);
    8311           0 :     if (*exceptionObject == NULL)
    8312           0 :         return NULL;
    8313             : 
    8314           0 :     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    8315           0 :     if (restuple == NULL)
    8316           0 :         return NULL;
    8317           0 :     if (!PyTuple_Check(restuple)) {
    8318           0 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    8319           0 :         Py_DECREF(restuple);
    8320           0 :         return NULL;
    8321             :     }
    8322           0 :     if (!PyArg_ParseTuple(restuple, argparse,
    8323             :                           &resunicode, &i_newpos)) {
    8324           0 :         Py_DECREF(restuple);
    8325           0 :         return NULL;
    8326             :     }
    8327           0 :     if (i_newpos<0)
    8328           0 :         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
    8329             :     else
    8330           0 :         *newpos = i_newpos;
    8331           0 :     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
    8332           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    8333           0 :         Py_DECREF(restuple);
    8334           0 :         return NULL;
    8335             :     }
    8336           0 :     Py_INCREF(resunicode);
    8337           0 :     Py_DECREF(restuple);
    8338           0 :     return resunicode;
    8339             : }
    8340             : 
    8341             : /* Lookup the character ch in the mapping and put the result in result,
    8342             :    which must be decrefed by the caller.
    8343             :    Return 0 on success, -1 on error */
    8344             : static int
    8345      342333 : charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
    8346             : {
    8347      342333 :     PyObject *w = PyLong_FromLong((long)c);
    8348             :     PyObject *x;
    8349             : 
    8350      342333 :     if (w == NULL)
    8351           0 :         return -1;
    8352      342333 :     x = PyObject_GetItem(mapping, w);
    8353      342333 :     Py_DECREF(w);
    8354      342333 :     if (x == NULL) {
    8355      199266 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    8356             :             /* No mapping found means: use 1:1 mapping. */
    8357      199266 :             PyErr_Clear();
    8358      199266 :             *result = NULL;
    8359      199266 :             return 0;
    8360             :         } else
    8361           0 :             return -1;
    8362             :     }
    8363      143067 :     else if (x == Py_None) {
    8364          19 :         *result = x;
    8365          19 :         return 0;
    8366             :     }
    8367      143048 :     else if (PyLong_Check(x)) {
    8368        2885 :         long value = PyLong_AS_LONG(x);
    8369        2885 :         if (value < 0 || value > MAX_UNICODE) {
    8370           5 :             PyErr_Format(PyExc_ValueError,
    8371             :                          "character mapping must be in range(0x%x)",
    8372             :                          MAX_UNICODE+1);
    8373           5 :             Py_DECREF(x);
    8374           5 :             return -1;
    8375             :         }
    8376        2880 :         *result = x;
    8377        2880 :         return 0;
    8378             :     }
    8379      140163 :     else if (PyUnicode_Check(x)) {
    8380      140162 :         *result = x;
    8381      140162 :         return 0;
    8382             :     }
    8383             :     else {
    8384             :         /* wrong return value */
    8385           1 :         PyErr_SetString(PyExc_TypeError,
    8386             :                         "character mapping must return integer, None or str");
    8387           1 :         Py_DECREF(x);
    8388           1 :         return -1;
    8389             :     }
    8390             : }
    8391             : 
    8392             : /* lookup the character, write the result into the writer.
    8393             :    Return 1 if the result was written into the writer, return 0 if the mapping
    8394             :    was undefined, raise an exception return -1 on error. */
    8395             : static int
    8396      170259 : charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
    8397             :                         _PyUnicodeWriter *writer)
    8398             : {
    8399             :     PyObject *item;
    8400             : 
    8401      170259 :     if (charmaptranslate_lookup(ch, mapping, &item))
    8402           5 :         return -1;
    8403             : 
    8404      170254 :     if (item == NULL) {
    8405             :         /* not found => default to 1:1 mapping */
    8406       80548 :         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
    8407           0 :             return -1;
    8408             :         }
    8409       80548 :         return 1;
    8410             :     }
    8411             : 
    8412       89706 :     if (item == Py_None) {
    8413           5 :         Py_DECREF(item);
    8414           5 :         return 0;
    8415             :     }
    8416             : 
    8417       89701 :     if (PyLong_Check(item)) {
    8418          31 :         long ch = (Py_UCS4)PyLong_AS_LONG(item);
    8419             :         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
    8420             :            used it */
    8421          31 :         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
    8422           0 :             Py_DECREF(item);
    8423           0 :             return -1;
    8424             :         }
    8425          31 :         Py_DECREF(item);
    8426          31 :         return 1;
    8427             :     }
    8428             : 
    8429       89670 :     if (!PyUnicode_Check(item)) {
    8430           0 :         Py_DECREF(item);
    8431           0 :         return -1;
    8432             :     }
    8433             : 
    8434       89670 :     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
    8435           0 :         Py_DECREF(item);
    8436           0 :         return -1;
    8437             :     }
    8438             : 
    8439       89670 :     Py_DECREF(item);
    8440       89670 :     return 1;
    8441             : }
    8442             : /* Fast-path helper: resolve the translation of the single character `ch`
                     :    through `mapping` and cache the outcome in translate[ch].
                     :
                     :    Cache encoding: 0xfe marks a deletion (the mapping gave None); any
                     :    other value stored here is the replacement character, which is `ch`
                     :    itself when the lookup produced no item.
                     :
                     :    Returns 1 when translate[ch] was filled in, 0 when the mapping result
                     :    cannot be stored in the table (integer above 127, string whose length
                     :    is not 1 or whose character is above 127, or any other object), and
                     :    -1 when charmaptranslate_lookup() returns non-zero. */
    8443             : static int
    8444      172069 : unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
    8445             :                               Py_UCS1 *translate)
    8446             : {
    8447      172069 :     PyObject *item = NULL;
    8448      172069 :     int ret = 0;
    8449             : 
    8450      172069 :     if (charmaptranslate_lookup(ch, mapping, &item)) {
    8451           1 :         return -1;
    8452             :     }
    8453             : 
    8454      172068 :     if (item == Py_None) {
    8455             :         /* deletion */
    8456          14 :         translate[ch] = 0xfe;
    8457             :     }
    8458      172054 :     else if (item == NULL) {
    8459             :         /* not found => default to 1:1 mapping */
    8460      118717 :         translate[ch] = ch;
    8461      118717 :         return 1;   /* item is NULL: skip the Py_DECREF at exit */
    8462             :     }
    8463       53337 :     else if (PyLong_Check(item)) {
    8464        2849 :         long replace = PyLong_AS_LONG(item);
    8465             :         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
    8466             :            used it */
    8467        2849 :         if (127 < replace) {
    8468             :             /* invalid character or character outside ASCII:
    8469             :                skip the fast translate */
    8470           0 :             goto exit;
    8471             :         }
    8472        2849 :         translate[ch] = (Py_UCS1)replace;
    8473             :     }
    8474       50488 :     else if (PyUnicode_Check(item)) {
    8475             :         Py_UCS4 replace;
    8476             : 
    8477       50488 :         if (PyUnicode_GET_LENGTH(item) != 1)
    8478       49994 :             goto exit;   /* only 1:1 replacements fit in the table */
    8479             : 
    8480         494 :         replace = PyUnicode_READ_CHAR(item, 0);
    8481         494 :         if (replace > 127)
    8482           2 :             goto exit;   /* non-ASCII replacement: leave it to the caller */
    8483         492 :         translate[ch] = (Py_UCS1)replace;
    8484             :     }
    8485             :     else {
    8486             :         /* not None, NULL, long or unicode */
    8487           0 :         goto exit;
    8488             :     }
    8489        3355 :     ret = 1;   /* reached only by the branches that stored translate[ch] */
    8490             : 
    8491       53351 :   exit:
    8492       53351 :     Py_DECREF(item);   /* item is non-NULL on every path that gets here */
    8493       53351 :     return ret;
    8494             : }
    8495             : 
    8496             : /* Fast path for ascii => ascii translation. Return 1 if the whole string
    8497             :    was translated into writer, return 0 if the input string was partially
    8498             :    translated into writer, raise an exception and return -1 on error.
                     :
                     :    Translations are memoized in a 128-entry table indexed by the input
                     :    character: 0xff means "not looked up yet", 0xfe means "delete this
                     :    character", any other value is the replacement character.
                     :
                     :    When 0 or 1 is returned, writer->pos is set to the number of
                     :    characters written and *input_pos to the index of the first input
                     :    character that was not consumed. On -1 neither is updated.
                     :
                     :    NOTE(review): the table is indexed with raw input characters, so the
                     :    input has to be ASCII; nothing in this function checks that. Output is
                     :    written straight into writer->buffer, which is asserted to be ASCII
                     :    and exactly as long as the input. */
    8499             : static int
    8500       98517 : unicode_fast_translate(PyObject *input, PyObject *mapping,
    8501             :                        _PyUnicodeWriter *writer, int ignore,
    8502             :                        Py_ssize_t *input_pos)
    8503             : {
    8504             :     Py_UCS1 ascii_table[128], ch, ch2;
    8505             :     Py_ssize_t len;
    8506             :     const Py_UCS1 *in, *end;
    8507             :     Py_UCS1 *out;
    8508       98517 :     int res = 0;
    8509             : 
    8510       98517 :     len = PyUnicode_GET_LENGTH(input);
    8511             : 
    8512       98517 :     memset(ascii_table, 0xff, 128);   /* every entry starts as "unknown" */
    8513             : 
    8514       98517 :     in = PyUnicode_1BYTE_DATA(input);
    8515       98517 :     end = in + len;
    8516             : 
    8517       98517 :     assert(PyUnicode_IS_ASCII(writer->buffer));
    8518       98517 :     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
    8519       98517 :     out = PyUnicode_1BYTE_DATA(writer->buffer);
    8520             : 
    8521      269283 :     for (; in < end; in++) {
    8522      220763 :         ch = *in;
    8523      220763 :         ch2 = ascii_table[ch];
    8524      220763 :         if (ch2 == 0xff) {
    8525      172069 :             int translate = unicode_fast_translate_lookup(mapping, ch,
    8526             :                                                           ascii_table);
    8527      172069 :             if (translate < 0)
    8528           1 :                 return -1;
    8529      172068 :             if (translate == 0)
    8530       49996 :                 goto exit;   /* not representable here: stop, res stays 0 */
    8531      122072 :             ch2 = ascii_table[ch];   /* re-read the entry the lookup just stored */
    8532             :         }
    8533      170766 :         if (ch2 == 0xfe) {
    8534          25 :             if (ignore)
    8535          25 :                 continue;   /* deletion in "ignore" mode: drop the character */
    8536           0 :             goto exit;   /* otherwise stop at this character, res stays 0 */
    8537             :         }
    8538      170741 :         assert(ch2 < 128);
    8539      170741 :         *out = ch2;
    8540      170741 :         out++;
    8541             :     }
    8542       48520 :     res = 1;
    8543             : 
    8544       98516 : exit:
    8545       98516 :     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
    8546       98516 :     *input_pos = in - PyUnicode_1BYTE_DATA(input);
    8547       98516 :     return res;
    8548             : }
    8549             : /* Translate `input` through `mapping` and return the string produced by a
                     :    _PyUnicodeWriter, or NULL on failure (a NULL `mapping` is reported
                     :    with PyErr_BadArgument()).
                     :
                     :    An empty input is returned through PyUnicode_FromObject(). ASCII input
                     :    is first handed to unicode_fast_translate(); the generic loop below
                     :    then resumes at the index where the fast path stopped.
                     :
                     :    Characters for which charmaptranslate_output() returns 0 are treated
                     :    as untranslatable: with errors == "ignore" they are skipped, otherwise
                     :    unicode_translate_call_errorhandler() supplies a replacement string
                     :    and the position at which to continue. */
    8550             : static PyObject *
    8551       99461 : _PyUnicode_TranslateCharmap(PyObject *input,
    8552             :                             PyObject *mapping,
    8553             :                             const char *errors)
    8554             : {
    8555             :     /* input object */
    8556             :     const void *data;
    8557             :     Py_ssize_t size, i;
    8558             :     int kind;
    8559             :     /* output buffer */
    8560             :     _PyUnicodeWriter writer;
    8561             :     /* error handler */
    8562       99461 :     const char *reason = "character maps to <undefined>";
    8563       99461 :     PyObject *errorHandler = NULL;
    8564       99461 :     PyObject *exc = NULL;
    8565             :     int ignore;
    8566             :     int res;
    8567             : 
    8568       99461 :     if (mapping == NULL) {
    8569           0 :         PyErr_BadArgument();
    8570           0 :         return NULL;
    8571             :     }
    8572             : 
    8573       99461 :     data = PyUnicode_DATA(input);
    8574       99461 :     kind = PyUnicode_KIND(input);
    8575       99461 :     size = PyUnicode_GET_LENGTH(input);
    8576             : 
    8577       99461 :     if (size == 0)
    8578          77 :         return PyUnicode_FromObject(input);
    8579             : 
    8580             :     /* allocate enough for a simple 1:1 translation without
    8581             :        replacements, if we need more, we'll resize */
    8582       99384 :     _PyUnicodeWriter_Init(&writer);
    8583       99384 :     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
    8584           0 :         goto onError;
    8585             : 
    8586       99384 :     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
    8587             : 
    8588       99384 :     if (PyUnicode_IS_ASCII(input)) {
    8589       98517 :         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
    8590       98517 :         if (res < 0) {
    8591           1 :             _PyUnicodeWriter_Dealloc(&writer);
    8592           1 :             return NULL;   /* exc and errorHandler are still NULL here */
    8593             :         }
    8594       98516 :         if (res == 1)
    8595       48520 :             return _PyUnicodeWriter_Finish(&writer);
    8596             :     }
    8597             :     else {
    8598         867 :         i = 0;
    8599             :     }
    8600             : 
    8601      221117 :     while (i<size) {
    8602             :         /* try to encode it */
    8603             :         int translate;
    8604      170259 :         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    8605             :         Py_ssize_t newpos;
    8606             :         /* startpos for collecting untranslatable chars */
    8607             :         Py_ssize_t collstart;
    8608             :         Py_ssize_t collend;
    8609             :         Py_UCS4 ch;
    8610             : 
    8611      170259 :         ch = PyUnicode_READ(kind, data, i);
    8612      170259 :         translate = charmaptranslate_output(ch, mapping, &writer);
    8613      170259 :         if (translate < 0)
    8614           5 :             goto onError;
    8615             : 
    8616      170254 :         if (translate != 0) {
    8617             :             /* it worked => adjust input pointer */
    8618      170249 :             ++i;
    8619      170249 :             continue;
    8620             :         }
    8621             : 
    8622             :         /* untranslatable character */
    8623           5 :         collstart = i;
    8624           5 :         collend = i+1;
    8625             : 
    8626             :         /* find all untranslatable characters */
    8627           5 :         while (collend < size) {
    8628             :             PyObject *x;
    8629           5 :             ch = PyUnicode_READ(kind, data, collend);
    8630           5 :             if (charmaptranslate_lookup(ch, mapping, &x))
    8631           0 :                 goto onError;
    8632           5 :             Py_XDECREF(x);
    8633           5 :             if (x != Py_None)   /* pointer comparison only; x is not dereferenced */
    8634           5 :                 break;
    8635           0 :             ++collend;
    8636             :         }
    8637             : 
    8638           5 :         if (ignore) {
    8639           5 :             i = collend;   /* drop the whole run [collstart, collend) */
    8640             :         }
    8641             :         else {
    8642           0 :             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
    8643             :                                                              reason, input, &exc,
    8644             :                                                              collstart, collend, &newpos);
    8645           0 :             if (repunicode == NULL)
    8646           0 :                 goto onError;
    8647           0 :             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
    8648           0 :                 Py_DECREF(repunicode);
    8649           0 :                 goto onError;
    8650             :             }
    8651           0 :             Py_DECREF(repunicode);
    8652           0 :             i = newpos;   /* resume where the error handler said to */
    8653             :         }
    8654             :     }
    8655       50858 :     Py_XDECREF(exc);
    8656       50858 :     Py_XDECREF(errorHandler);
    8657       50858 :     return _PyUnicodeWriter_Finish(&writer);
    8658             : 
    8659           5 :   onError:
    8660           5 :     _PyUnicodeWriter_Dealloc(&writer);
    8661           5 :     Py_XDECREF(exc);
    8662           5 :     Py_XDECREF(errorHandler);
    8663           5 :     return NULL;
    8664             : }
    8665             : /* Public entry point: check `str` with ensure_unicode() and, if that
                     :    succeeds, delegate to _PyUnicode_TranslateCharmap(). Returns NULL when
                     :    the check fails or when the translation itself returns NULL. */
    8666             : PyObject *
    8667           0 : PyUnicode_Translate(PyObject *str,
    8668             :                     PyObject *mapping,
    8669             :                     const char *errors)
    8670             : {
    8671           0 :     if (ensure_unicode(str) < 0)
    8672           0 :         return NULL;
    8673           0 :     return _PyUnicode_TranslateCharmap(str, mapping, errors);
    8674             : }
    8675             : /* Build an ASCII version of `unicode` in which characters recognised by
                     :    Py_UNICODE_ISSPACE() become ' ' and characters with a decimal value
                     :    (Py_UNICODE_TODECIMAL() >= 0) become the matching digit '0'..'9'.
                     :
                     :    An ASCII input is returned as-is with its reference count incremented.
                     :    Otherwise a new string is created; the first character that is neither
                     :    below 127, nor a space, nor a decimal is written as '?' and the result
                     :    is cut off right after it.
                     :
                     :    Returns NULL if `unicode` is not a str (PyErr_BadInternalCall()) or if
                     :    PyUnicode_New() fails. */
    8676             : PyObject *
    8677     1265150 : _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
    8678             : {
    8679     1265150 :     if (!PyUnicode_Check(unicode)) {
    8680           0 :         PyErr_BadInternalCall();
    8681           0 :         return NULL;
    8682             :     }
    8683     1265150 :     if (PyUnicode_IS_ASCII(unicode)) {
    8684             :         /* If the string is already ASCII, just return the same string */
    8685     1265090 :         Py_INCREF(unicode);
    8686     1265090 :         return unicode;
    8687             :     }
    8688             : 
    8689          52 :     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    8690          52 :     PyObject *result = PyUnicode_New(len, 127);
    8691          52 :     if (result == NULL) {
    8692           0 :         return NULL;
    8693             :     }
    8694             : 
    8695          52 :     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    8696          52 :     int kind = PyUnicode_KIND(unicode);
    8697          52 :     const void *data = PyUnicode_DATA(unicode);
    8698             :     Py_ssize_t i;
    8699         143 :     for (i = 0; i < len; ++i) {
    8700         133 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    8701         133 :         if (ch < 127) {   /* note: code point 127 itself is not copied here */
    8702          66 :             out[i] = ch;
    8703             :         }
    8704          67 :         else if (Py_UNICODE_ISSPACE(ch)) {
    8705           6 :             out[i] = ' ';
    8706             :         }
    8707             :         else {
    8708          61 :             int decimal = Py_UNICODE_TODECIMAL(ch);
    8709          61 :             if (decimal < 0) {
    8710             :                 /* not a decimal: emit '?', terminate and truncate here */
    8711          42 :                 out[i] = '?';
    8712          42 :                 out[i+1] = '\0';
    8713          42 :                 _PyUnicode_LENGTH(result) = i + 1;
    8714          42 :                 break;
    8715             :             }
    8716          19 :             out[i] = '0' + decimal;
    8717             :         }
    8718             :     }
    8719             : 
    8720          52 :     assert(_PyUnicode_CheckConsistency(result, 1));
    8721          52 :     return result;
    8722             : }
    8722             : 
    8723             : /* --- Helpers ------------------------------------------------------------ */
    8724             : 
    8725             : /* helper macro to fixup start/end slice values:
                     :    - end is capped at len; a negative end has len added and is then
                     :      floored at 0
                     :    - a negative start has len added and is then floored at 0 (start is
                     :      not capped at len)
                     :    start and end are assigned to, and every argument is evaluated more
                     :    than once. The expansion is a bare sequence of if statements; it is
                     :    not wrapped in do { } while (0). */
    8726             : #define ADJUST_INDICES(start, end, len)         \
    8727             :     if (end > len)                              \
    8728             :         end = len;                              \
    8729             :     else if (end < 0) {                         \
    8730             :         end += len;                             \
    8731             :         if (end < 0)                            \
    8732             :             end = 0;                            \
    8733             :     }                                           \
    8734             :     if (start < 0) {                            \
    8735             :         start += len;                           \
    8736             :         if (start < 0)                          \
    8737             :             start = 0;                          \
    8738             :     }
    8739             : /* Search for s2 inside s1[start:end]; forwards when direction > 0,
                     :    backwards otherwise.
                     :
                     :    Returns the index of the match in s1, -1 when there is no match, and
                     :    -2 when the temporary copy of s2 made by unicode_askind() could not be
                     :    obtained.
                     :
                     :    Shortcuts taken before the generic search: -1 straight away when s2
                     :    has a larger kind than s1 or when the adjusted slice is shorter than
                     :    s2; a single-character s2 goes through findchar(). */
    8740             : static Py_ssize_t
    8741     2357480 : any_find_slice(PyObject* s1, PyObject* s2,
    8742             :                Py_ssize_t start,
    8743             :                Py_ssize_t end,
    8744             :                int direction)
    8745             : {
    8746             :     int kind1, kind2;
    8747             :     const void *buf1, *buf2;
    8748             :     Py_ssize_t len1, len2, result;
    8749             : 
    8750     2357480 :     kind1 = PyUnicode_KIND(s1);
    8751     2357480 :     kind2 = PyUnicode_KIND(s2);
    8752     2357480 :     if (kind1 < kind2)
    8753          24 :         return -1;
    8754             : 
    8755     2357460 :     len1 = PyUnicode_GET_LENGTH(s1);
    8756     2357460 :     len2 = PyUnicode_GET_LENGTH(s2);
    8757     2357460 :     ADJUST_INDICES(start, end, len1);
    8758     2357460 :     if (end - start < len2)
    8759      171983 :         return -1;
    8760             : 
    8761     2185470 :     buf1 = PyUnicode_DATA(s1);
    8762     2185470 :     buf2 = PyUnicode_DATA(s2);
    8763     2185470 :     if (len2 == 1) {
    8764     1629120 :         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
    8765     1629120 :         result = findchar((const char *)buf1 + kind1*start,
    8766             :                           kind1, end - start, ch, direction);
    8767     1629120 :         if (result == -1)
    8768      472319 :             return -1;
    8769             :         else
    8770     1156800 :             return start + result;   /* findchar() is relative to the slice start */
    8771             :     }
    8772             : 
    8773      556359 :     if (kind2 != kind1) {
    8774             :         /* convert s2's data to kind1; the copy is released below */
    8775          14 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
    8776          14 :         if (!buf2)
    8777           0 :             return -2;
    8778             :     }
    8779             : 
    8780      556359 :     if (direction > 0) {
    8781      359507 :         switch (kind1) {
    8782      359499 :         case PyUnicode_1BYTE_KIND:
    8783      359499 :             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
    8784      359499 :                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
    8785             :             else
    8786           0 :                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
    8787      359499 :             break;
    8788           4 :         case PyUnicode_2BYTE_KIND:
    8789           4 :             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
    8790           4 :             break;
    8791           4 :         case PyUnicode_4BYTE_KIND:
    8792           4 :             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
    8793           4 :             break;
    8794           0 :         default:
    8795           0 :             Py_UNREACHABLE();
    8796             :         }
    8797             :     }
    8798             :     else {
    8799      196852 :         switch (kind1) {
    8800      196846 :         case PyUnicode_1BYTE_KIND:
    8801      196846 :             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
    8802      196846 :                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8803             :             else
    8804           0 :                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8805      196846 :             break;
    8806           2 :         case PyUnicode_2BYTE_KIND:
    8807           2 :             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8808           2 :             break;
    8809           4 :         case PyUnicode_4BYTE_KIND:
    8810           4 :             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8811           4 :             break;
    8812           0 :         default:
    8813           0 :             Py_UNREACHABLE();
    8814             :         }
    8815             :     }
    8816             : 
    8817      556359 :     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
    8818      556359 :     if (kind2 != kind1)
    8819          14 :         PyMem_Free((void *)buf2);   /* release the converted copy of s2 */
    8820             : 
    8821      556359 :     return result;
    8822             : }
    8822             : 
    8823             : /* _PyUnicode_InsertThousandsGrouping() helper functions */
    8824             : #include "stringlib/localeutil.h"
    8825             : 
    8826             : /**
    8827             :  * InsertThousandsGrouping:
    8828             :  * @writer: Unicode writer, or NULL to select counting mode.
    8829             :  * @n_buffer: Offset added to writer->pos to form the initial buffer
                     :  *            position handed to InsertThousandsGrouping_fill(); used
                     :  *            as-is in counting mode.
    8830             :  * @digits: Digits we're reading from. Must be NULL in counting mode.
    8831             :  * @d_pos: Start of digits string.
    8832             :  * @n_digits: The number of digits in the string, in which we want
    8833             :  *            to put the grouping chars.
    8834             :  * @min_width: The minimum width of the digits in the output string.
    8835             :  *             Output will be zero-padded on the left to fill.
                     :  *             Negative values are treated as 0.
    8836             :  * @grouping: see definition in localeconv().
    8837             :  * @thousands_sep: see definition in localeconv().
                     :  * @maxchar: Counting mode only (must be NULL when filling): set to 127
                     :  *           here and then passed on to InsertThousandsGrouping_fill().
    8838             :  *
    8839             :  * There are 2 modes: counting and filling. If @writer is NULL,
    8840             :  *  we are in counting mode, else filling mode.
    8841             :  * If counting, the required buffer size is returned.
    8842             :  * If filling, we know the buffer will be large enough, so we don't
    8843             :  *  need to pass in the buffer size.
    8844             :  * Inserts thousand grouping characters (as defined by grouping and
    8845             :  *  thousands_sep) into @writer.
    8846             :  *
    8847             :  * Return value: -1 on error, number of characters otherwise.
                     :  * NOTE(review): no path in this function body returns -1.
    8848             :  **/
    8849             : Py_ssize_t
    8850      949228 : _PyUnicode_InsertThousandsGrouping(
    8851             :     _PyUnicodeWriter *writer,
    8852             :     Py_ssize_t n_buffer,
    8853             :     PyObject *digits,
    8854             :     Py_ssize_t d_pos,
    8855             :     Py_ssize_t n_digits,
    8856             :     Py_ssize_t min_width,
    8857             :     const char *grouping,
    8858             :     PyObject *thousands_sep,
    8859             :     Py_UCS4 *maxchar)
    8860             : {
    8861      949228 :     min_width = Py_MAX(0, min_width);
    8862      949228 :     if (writer) {
    8863      474611 :         assert(digits != NULL);
    8864      474611 :         assert(maxchar == NULL);
    8865             :     }
    8866             :     else {
    8867      474617 :         assert(digits == NULL);
    8868      474617 :         assert(maxchar != NULL);
    8869             :     }
    8870      949228 :     assert(0 <= d_pos);
    8871      949228 :     assert(0 <= n_digits);
    8872      949228 :     assert(grouping != NULL);
    8873             : 
    8874      949228 :     Py_ssize_t count = 0;
    8875             :     Py_ssize_t n_zeros;
    8876      949228 :     int loop_broken = 0;
    8877      949228 :     int use_separator = 0; /* First time through, don't append the
    8878             :                               separator. They only go between
    8879             :                               groups. */
    8880             :     Py_ssize_t buffer_pos;
    8881             :     Py_ssize_t digits_pos;
    8882             :     Py_ssize_t len;
    8883             :     Py_ssize_t n_chars;
    8884      949228 :     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
    8885             :                                         be looked at */
    8886             :     /* A generator that returns all of the grouping widths, until it
    8887             :        returns 0. */
    8888             :     GroupGenerator groupgen;
    8889      949228 :     GroupGenerator_init(&groupgen, grouping);
    8890      949228 :     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    8891             : 
    8892             :     /* if digits are not grouped, thousands separator
    8893             :        should be an empty string */
    8894      949228 :     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
    8895             : 
    8896      949228 :     digits_pos = d_pos + n_digits;   /* one past the last digit */
    8897      949228 :     if (writer) {
    8898      474611 :         buffer_pos = writer->pos + n_buffer;
    8899      474611 :         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
    8900      474611 :         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    8901             :     }
    8902             :     else {
    8903      474617 :         buffer_pos = n_buffer;
    8904             :     }
    8905             : 
    8906      949228 :     if (!writer) {
    8907      474617 :         *maxchar = 127;
    8908             :     }
    8909             : 
    8910      950054 :     while ((len = GroupGenerator_next(&groupgen)) > 0) {
    8911             :         /* clamp the group to what is left (digits or padding), at least 1 */
    8912        1322 :         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
    8913        1322 :         n_zeros = Py_MAX(0, len - remaining);
    8914        1322 :         n_chars = Py_MAX(0, Py_MIN(remaining, len));
    8915             : 
    8916             :         /* Use n_zero zero's and n_chars chars */
    8917             : 
    8918             :         /* Count only, don't do anything. */
    8919        1322 :         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
    8920             : 
    8921             :         /* Copy into the writer. */
    8922        1322 :         InsertThousandsGrouping_fill(writer, &buffer_pos,
    8923             :                                      digits, &digits_pos,
    8924             :                                      n_chars, n_zeros,
    8925             :                                      use_separator ? thousands_sep : NULL,
    8926             :                                      thousands_sep_len, maxchar);
    8927             : 
    8928             :         /* Use a separator next time. */
    8929        1322 :         use_separator = 1;
    8930             : 
    8931        1322 :         remaining -= n_chars;
    8932        1322 :         min_width -= len;
    8933             : 
    8934        1322 :         if (remaining <= 0 && min_width <= 0) {
    8935         496 :             loop_broken = 1;
    8936         496 :             break;
    8937             :         }
    8938             :         /* the separator written before the next group also uses up width */
    8939         826 :         min_width -= thousands_sep_len;
    8940             :     }
    8941      949228 :     if (!loop_broken) {
    8942             :         /* We left the loop without using a break statement. */
    8943             : 
    8944      948732 :         len = Py_MAX(Py_MAX(remaining, min_width), 1);
    8945      948732 :         n_zeros = Py_MAX(0, len - remaining);
    8946      948732 :         n_chars = Py_MAX(0, Py_MIN(remaining, len));
    8947             : 
    8948             :         /* Use n_zero zero's and n_chars chars */
    8949      948732 :         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
    8950             : 
    8951             :         /* Copy into the writer. */
    8952      948732 :         InsertThousandsGrouping_fill(writer, &buffer_pos,
    8953             :                                      digits, &digits_pos,
    8954             :                                      n_chars, n_zeros,
    8955             :                                      use_separator ? thousands_sep : NULL,
    8956             :                                      thousands_sep_len, maxchar);
    8957             :     }
    8958      949228 :     return count;
    8959             : }
    8958             : 
    8959             : /* Count the occurrences of `substr` in str[start:end] using the
                     :    kind-specific *_count() helpers.
                     :
                     :    Returns the count, or -1 when ensure_unicode() rejects an argument or
                     :    when unicode_askind() cannot provide the converted copy of `substr`.
                     :    Returns 0 without searching when `substr` has a larger kind than `str`
                     :    or when the adjusted slice is shorter than `substr`. */
    8960             : Py_ssize_t
    8961           0 : PyUnicode_Count(PyObject *str,
    8962             :                 PyObject *substr,
    8963             :                 Py_ssize_t start,
    8964             :                 Py_ssize_t end)
    8965             : {
    8966             :     Py_ssize_t result;
    8967             :     int kind1, kind2;
    8968           0 :     const void *buf1 = NULL, *buf2 = NULL;
    8969             :     Py_ssize_t len1, len2;
    8970             : 
    8971           0 :     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
    8972           0 :         return -1;
    8973             : 
    8974           0 :     kind1 = PyUnicode_KIND(str);
    8975           0 :     kind2 = PyUnicode_KIND(substr);
    8976           0 :     if (kind1 < kind2)
    8977           0 :         return 0;
    8978             : 
    8979           0 :     len1 = PyUnicode_GET_LENGTH(str);
    8980           0 :     len2 = PyUnicode_GET_LENGTH(substr);
    8981           0 :     ADJUST_INDICES(start, end, len1);
    8982           0 :     if (end - start < len2)
    8983           0 :         return 0;
    8984             : 
    8985           0 :     buf1 = PyUnicode_DATA(str);
    8986           0 :     buf2 = PyUnicode_DATA(substr);
    8987           0 :     if (kind2 != kind1) {
    8988             :         /* convert substr's data to kind1; the copy is released below */
    8989           0 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
    8990           0 :         if (!buf2)
    8991           0 :             goto onError;
    8992             :     }
    8993             : 
    8994             :     /* each helper gets a pointer to the slice start and the slice length */
    8995           0 :     switch (kind1) {
    8996           0 :     case PyUnicode_1BYTE_KIND:
    8997           0 :         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
    8998           0 :             result = asciilib_count(
    8999             :                 ((const Py_UCS1*)buf1) + start, end - start,
    9000             :                 buf2, len2, PY_SSIZE_T_MAX
    9001             :                 );
    9002             :         else
    9003           0 :             result = ucs1lib_count(
    9004             :                 ((const Py_UCS1*)buf1) + start, end - start,
    9005             :                 buf2, len2, PY_SSIZE_T_MAX
    9006             :                 );
    9007           0 :         break;
    9008           0 :     case PyUnicode_2BYTE_KIND:
    9009           0 :         result = ucs2lib_count(
    9010           0 :             ((const Py_UCS2*)buf1) + start, end - start,
    9011             :             buf2, len2, PY_SSIZE_T_MAX
    9012             :             );
    9013           0 :         break;
    9014           0 :     case PyUnicode_4BYTE_KIND:
    9015           0 :         result = ucs4lib_count(
    9016           0 :             ((const Py_UCS4*)buf1) + start, end - start,
    9017             :             buf2, len2, PY_SSIZE_T_MAX
    9018             :             );
    9019           0 :         break;
    9020           0 :     default:
    9021           0 :         Py_UNREACHABLE();
    9022             :     }
    9023             : 
    9024           0 :     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
    9025           0 :     if (kind2 != kind1)
    9026           0 :         PyMem_Free((void *)buf2);
    9027             : 
    9028           0 :     return result;
    9029           0 :   onError:
    9030             :     /* only reached with buf2 == NULL after a failed unicode_askind() */
    9031           0 :     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
    9032           0 :     if (kind2 != kind1)
    9033           0 :         PyMem_Free((void *)buf2);
    9034           0 :     return -1;
    9035             : }
    9033             : /* Public entry point: check both arguments with ensure_unicode() and
                     :    delegate to any_find_slice(). Returns -2 when either check fails,
                     :    otherwise whatever any_find_slice() returns. */
    9034             : Py_ssize_t
    9035         119 : PyUnicode_Find(PyObject *str,
    9036             :                PyObject *substr,
    9037             :                Py_ssize_t start,
    9038             :                Py_ssize_t end,
    9039             :                int direction)
    9040             : {
    9041         119 :     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
    9042           0 :         return -2;
    9043             : 
    9044         119 :     return any_find_slice(str, substr, start, end, direction);
    9045             : }
    9046             : /* Search for the character `ch` in str[start:end] with findchar(),
                     :    passing `direction` through unchanged.
                     :
                     :    Returns the index of the match in `str`, or -1 when the adjusted slice
                     :    is empty or findchar() returns -1.
                     :
                     :    NOTE(review): `str` is not type-checked in this body, and no path here
                     :    returns an error value. */
    9047             : Py_ssize_t
    9048     5765070 : PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
    9049             :                    Py_ssize_t start, Py_ssize_t end,
    9050             :                    int direction)
    9051             : {
    9052             :     int kind;
    9053             :     Py_ssize_t len, result;
    9054     5765070 :     len = PyUnicode_GET_LENGTH(str);
    9055     5765070 :     ADJUST_INDICES(start, end, len);
    9056     5765070 :     if (end - start < 1)
    9057        2280 :         return -1;
    9058     5762790 :     kind = PyUnicode_KIND(str);
    9059             :     /* byte pointer advanced by kind*start to reach the slice start */
    9060     5762790 :     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
    9061             :                       kind, end-start, ch, direction);
    9062     5762790 :     if (result == -1)
    9063     4690160 :         return -1;
    9064             :     else
    9065     1072630 :         return start + result;   /* findchar() is relative to the slice start */
    9066             : }
    9066             : 
    9067             : static int
    9068    10921600 : tailmatch(PyObject *self,
    9069             :           PyObject *substring,
    9070             :           Py_ssize_t start,
    9071             :           Py_ssize_t end,
    9072             :           int direction)
    9073             : {
    9074             :     int kind_self;
    9075             :     int kind_sub;
    9076             :     const void *data_self;
    9077             :     const void *data_sub;
    9078             :     Py_ssize_t offset;
    9079             :     Py_ssize_t i;
    9080             :     Py_ssize_t end_sub;
    9081             : 
    9082    10921600 :     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
    9083    10921600 :     end -= PyUnicode_GET_LENGTH(substring);
    9084    10921600 :     if (end < start)
    9085      332154 :         return 0;
    9086             : 
    9087    10589500 :     if (PyUnicode_GET_LENGTH(substring) == 0)
    9088        3320 :         return 1;
    9089             : 
    9090    10586200 :     kind_self = PyUnicode_KIND(self);
    9091    10586200 :     data_self = PyUnicode_DATA(self);
    9092    10586200 :     kind_sub = PyUnicode_KIND(substring);
    9093    10586200 :     data_sub = PyUnicode_DATA(substring);
    9094    10586200 :     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
    9095             : 
    9096    10586200 :     if (direction > 0)
    9097     1772780 :         offset = end;
    9098             :     else
    9099     8813390 :         offset = start;
    9100             : 
    9101    21172400 :     if (PyUnicode_READ(kind_self, data_self, offset) ==
    9102    13449500 :         PyUnicode_READ(kind_sub, data_sub, 0) &&
    9103     2863340 :         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
    9104     2863340 :         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
    9105             :         /* If both are of the same kind, memcmp is sufficient */
    9106     1965300 :         if (kind_self == kind_sub) {
    9107     3930240 :             return ! memcmp((char *)data_self +
    9108     1965120 :                                 (offset * PyUnicode_KIND(substring)),
    9109             :                             data_sub,
    9110     1965120 :                             PyUnicode_GET_LENGTH(substring) *
    9111     1965120 :                                 PyUnicode_KIND(substring));
    9112             :         }
    9113             :         /* otherwise we have to compare each character by first accessing it */
    9114             :         else {
    9115             :             /* We do not need to compare 0 and len(substring)-1 because
    9116             :                the if statement above ensured already that they are equal
    9117             :                when we end up here. */
    9118         203 :             for (i = 1; i < end_sub; ++i) {
    9119          50 :                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
    9120          25 :                     PyUnicode_READ(kind_sub, data_sub, i))
    9121           0 :                     return 0;
    9122             :             }
    9123         178 :             return 1;
    9124             :         }
    9125             :     }
    9126             : 
    9127     8620880 :     return 0;
    9128             : }
    9129             : 
    9130             : Py_ssize_t
    9131         352 : PyUnicode_Tailmatch(PyObject *str,
    9132             :                     PyObject *substr,
    9133             :                     Py_ssize_t start,
    9134             :                     Py_ssize_t end,
    9135             :                     int direction)
    9136             : {
    9137         352 :     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
    9138           0 :         return -1;
    9139             : 
    9140         352 :     return tailmatch(str, substr, start, end, direction);
    9141             : }
    9142             : 
    9143             : static PyObject *
    9144      758589 : ascii_upper_or_lower(PyObject *self, int lower)
    9145             : {
    9146      758589 :     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    9147      758589 :     const char *data = PyUnicode_DATA(self);
    9148             :     char *resdata;
    9149             :     PyObject *res;
    9150             : 
    9151      758589 :     res = PyUnicode_New(len, 127);
    9152      758589 :     if (res == NULL)
    9153           0 :         return NULL;
    9154      758589 :     resdata = PyUnicode_DATA(res);
    9155      758589 :     if (lower)
    9156      694645 :         _Py_bytes_lower(resdata, data, len);
    9157             :     else
    9158       63944 :         _Py_bytes_upper(resdata, data, len);
    9159      758589 :     return res;
    9160             : }
    9161             : 
    9162             : static Py_UCS4
    9163          24 : handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
    9164             : {
    9165             :     Py_ssize_t j;
    9166             :     int final_sigma;
    9167          24 :     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
    9168             :     /* U+03A3 is in the Final_Sigma context when, it is found like this:
    9169             : 
    9170             :      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
    9171             : 
    9172             :     where ! is a negation and \p{xxx} is a character with property xxx.
    9173             :     */
    9174          33 :     for (j = i - 1; j >= 0; j--) {
    9175          20 :         c = PyUnicode_READ(kind, data, j);
    9176          20 :         if (!_PyUnicode_IsCaseIgnorable(c))
    9177          11 :             break;
    9178             :     }
    9179          24 :     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
    9180          24 :     if (final_sigma) {
    9181          13 :         for (j = i + 1; j < length; j++) {
    9182           6 :             c = PyUnicode_READ(kind, data, j);
    9183           6 :             if (!_PyUnicode_IsCaseIgnorable(c))
    9184           4 :                 break;
    9185             :         }
    9186          11 :         final_sigma = j == length || !_PyUnicode_IsCased(c);
    9187             :     }
    9188          24 :     return (final_sigma) ? 0x3C2 : 0x3C3;
    9189             : }
    9190             : 
    9191             : static int
    9192    11368900 : lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
    9193             :            Py_UCS4 c, Py_UCS4 *mapped)
    9194             : {
    9195             :     /* Obscure special case. */
    9196    11368900 :     if (c == 0x3A3) {
    9197          24 :         mapped[0] = handle_capital_sigma(kind, data, length, i);
    9198          24 :         return 1;
    9199             :     }
    9200    11368900 :     return _PyUnicode_ToLowerFull(c, mapped);
    9201             : }
    9202             : 
    9203             : static Py_ssize_t
    9204       24224 : do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9205             : {
    9206       24224 :     Py_ssize_t i, k = 0;
    9207             :     int n_res, j;
    9208             :     Py_UCS4 c, mapped[3];
    9209             : 
    9210       24224 :     c = PyUnicode_READ(kind, data, 0);
    9211       24224 :     n_res = _PyUnicode_ToTitleFull(c, mapped);
    9212       48451 :     for (j = 0; j < n_res; j++) {
    9213       24227 :         *maxchar = Py_MAX(*maxchar, mapped[j]);
    9214       24227 :         res[k++] = mapped[j];
    9215             :     }
    9216      200172 :     for (i = 1; i < length; i++) {
    9217      175948 :         c = PyUnicode_READ(kind, data, i);
    9218      175948 :         n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9219      351898 :         for (j = 0; j < n_res; j++) {
    9220      175950 :             *maxchar = Py_MAX(*maxchar, mapped[j]);
    9221      175950 :             res[k++] = mapped[j];
    9222             :         }
    9223             :     }
    9224       24224 :     return k;
    9225             : }
    9226             : 
    9227             : static Py_ssize_t
    9228          29 : do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
    9229          29 :     Py_ssize_t i, k = 0;
    9230             : 
    9231        7251 :     for (i = 0; i < length; i++) {
    9232        7222 :         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
    9233             :         int n_res, j;
    9234        7222 :         if (Py_UNICODE_ISUPPER(c)) {
    9235        1759 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9236             :         }
    9237        5463 :         else if (Py_UNICODE_ISLOWER(c)) {
    9238        2405 :             n_res = _PyUnicode_ToUpperFull(c, mapped);
    9239             :         }
    9240             :         else {
    9241        3058 :             n_res = 1;
    9242        3058 :             mapped[0] = c;
    9243             :         }
    9244       14449 :         for (j = 0; j < n_res; j++) {
    9245        7227 :             *maxchar = Py_MAX(*maxchar, mapped[j]);
    9246        7227 :             res[k++] = mapped[j];
    9247             :         }
    9248             :     }
    9249          29 :     return k;
    9250             : }
    9251             : 
    9252             : static Py_ssize_t
    9253     6703330 : do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
    9254             :                   Py_UCS4 *maxchar, int lower)
    9255             : {
    9256     6703330 :     Py_ssize_t i, k = 0;
    9257             : 
    9258    20104600 :     for (i = 0; i < length; i++) {
    9259    13401300 :         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
    9260             :         int n_res, j;
    9261    13401300 :         if (lower)
    9262     6710060 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9263             :         else
    9264     6691240 :             n_res = _PyUnicode_ToUpperFull(c, mapped);
    9265    26803000 :         for (j = 0; j < n_res; j++) {
    9266    13401700 :             *maxchar = Py_MAX(*maxchar, mapped[j]);
    9267    13401700 :             res[k++] = mapped[j];
    9268             :         }
    9269             :     }
    9270     6703330 :     return k;
    9271             : }
    9272             : 
    9273             : static Py_ssize_t
    9274     3349050 : do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9275             : {
    9276     3349050 :     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
    9277             : }
    9278             : 
    9279             : static Py_ssize_t
    9280     3354280 : do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9281             : {
    9282     3354280 :     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
    9283             : }
    9284             : 
    9285             : static Py_ssize_t
    9286           6 : do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9287             : {
    9288           6 :     Py_ssize_t i, k = 0;
    9289             : 
    9290          15 :     for (i = 0; i < length; i++) {
    9291           9 :         Py_UCS4 c = PyUnicode_READ(kind, data, i);
    9292             :         Py_UCS4 mapped[3];
    9293           9 :         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
    9294          20 :         for (j = 0; j < n_res; j++) {
    9295          11 :             *maxchar = Py_MAX(*maxchar, mapped[j]);
    9296          11 :             res[k++] = mapped[j];
    9297             :         }
    9298             :     }
    9299           6 :     return k;
    9300             : }
    9301             : 
    9302             : static Py_ssize_t
    9303     4458370 : do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9304             : {
    9305     4458370 :     Py_ssize_t i, k = 0;
    9306             :     int previous_is_cased;
    9307             : 
    9308     4458370 :     previous_is_cased = 0;
    9309    15617400 :     for (i = 0; i < length; i++) {
    9310    11159100 :         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
    9311             :         Py_UCS4 mapped[3];
    9312             :         int n_res, j;
    9313             : 
    9314    11159100 :         if (previous_is_cased)
    9315     4481140 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9316             :         else
    9317     6677950 :             n_res = _PyUnicode_ToTitleFull(c, mapped);
    9318             : 
    9319    22318400 :         for (j = 0; j < n_res; j++) {
    9320    11159300 :             *maxchar = Py_MAX(*maxchar, mapped[j]);
    9321    11159300 :             res[k++] = mapped[j];
    9322             :         }
    9323             : 
    9324    11159100 :         previous_is_cased = _PyUnicode_IsCased(c);
    9325             :     }
    9326     4458370 :     return k;
    9327             : }
    9328             : 
    9329             : static PyObject *
    9330    11186000 : case_operation(PyObject *self,
    9331             :                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
    9332             : {
    9333    11186000 :     PyObject *res = NULL;
    9334    11186000 :     Py_ssize_t length, newlength = 0;
    9335             :     int kind, outkind;
    9336             :     const void *data;
    9337             :     void *outdata;
    9338    11186000 :     Py_UCS4 maxchar = 0, *tmp, *tmpend;
    9339             : 
    9340    11186000 :     kind = PyUnicode_KIND(self);
    9341    11186000 :     data = PyUnicode_DATA(self);
    9342    11186000 :     length = PyUnicode_GET_LENGTH(self);
    9343    11186000 :     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
    9344           0 :         PyErr_SetString(PyExc_OverflowError, "string is too long");
    9345           0 :         return NULL;
    9346             :     }
    9347    11186000 :     tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
    9348    11186000 :     if (tmp == NULL)
    9349           0 :         return PyErr_NoMemory();
    9350    11186000 :     newlength = perform(kind, data, length, tmp, &maxchar);
    9351    11186000 :     res = PyUnicode_New(newlength, maxchar);
    9352    11186000 :     if (res == NULL)
    9353           0 :         goto leave;
    9354    11186000 :     tmpend = tmp + newlength;
    9355    11186000 :     outdata = PyUnicode_DATA(res);
    9356    11186000 :     outkind = PyUnicode_KIND(res);
    9357    11186000 :     switch (outkind) {
    9358       28827 :     case PyUnicode_1BYTE_KIND:
    9359      128757 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
    9360       28827 :         break;
    9361      671313 :     case PyUnicode_2BYTE_KIND:
    9362     1343280 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
    9363      671313 :         break;
    9364    10485800 :     case PyUnicode_4BYTE_KIND:
    9365    10485800 :         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
    9366    10485800 :         break;
    9367           0 :     default:
    9368           0 :         Py_UNREACHABLE();
    9369             :     }
    9370    11186000 :   leave:
    9371    11186000 :     PyMem_Free(tmp);
    9372    11186000 :     return res;
    9373             : }
    9374             : 
    9375             : PyObject *
    9376     6960270 : PyUnicode_Join(PyObject *separator, PyObject *seq)
    9377             : {
    9378             :     PyObject *res;
    9379             :     PyObject *fseq;
    9380             :     Py_ssize_t seqlen;
    9381             :     PyObject **items;
    9382             : 
    9383     6960270 :     fseq = PySequence_Fast(seq, "can only join an iterable");
    9384     6960270 :     if (fseq == NULL) {
    9385           6 :         return NULL;
    9386             :     }
    9387             : 
    9388             :     /* NOTE: the following code can't call back into Python code,
    9389             :      * so we are sure that fseq won't be mutated.
    9390             :      */
    9391             : 
    9392     6960260 :     items = PySequence_Fast_ITEMS(fseq);
    9393     6960260 :     seqlen = PySequence_Fast_GET_SIZE(fseq);
    9394     6960260 :     res = _PyUnicode_JoinArray(separator, items, seqlen);
    9395     6960260 :     Py_DECREF(fseq);
    9396     6960260 :     return res;
    9397             : }
    9398             : 
    9399             : PyObject *
    9400     9198380 : _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
    9401             : {
    9402     9198380 :     PyObject *res = NULL; /* the result */
    9403     9198380 :     PyObject *sep = NULL;
    9404             :     Py_ssize_t seplen;
    9405             :     PyObject *item;
    9406             :     Py_ssize_t sz, i, res_offset;
    9407             :     Py_UCS4 maxchar;
    9408             :     Py_UCS4 item_maxchar;
    9409             :     int use_memcpy;
    9410     9198380 :     unsigned char *res_data = NULL, *sep_data = NULL;
    9411             :     PyObject *last_obj;
    9412     9198380 :     int kind = 0;
    9413             : 
    9414             :     /* If empty sequence, return u"". */
    9415     9198380 :     if (seqlen == 0) {
    9416       65979 :         _Py_RETURN_UNICODE_EMPTY();
    9417             :     }
    9418             : 
    9419             :     /* If singleton sequence with an exact Unicode, return that. */
    9420     9132400 :     last_obj = NULL;
    9421     9132400 :     if (seqlen == 1) {
    9422      630227 :         if (PyUnicode_CheckExact(items[0])) {
    9423      612504 :             res = items[0];
    9424      612504 :             Py_INCREF(res);
    9425      612504 :             return res;
    9426             :         }
    9427       17723 :         seplen = 0;
    9428       17723 :         maxchar = 0;
    9429             :     }
    9430             :     else {
    9431             :         /* Set up sep and seplen */
    9432     8502180 :         if (separator == NULL) {
    9433             :             /* fall back to a blank space separator */
    9434           0 :             sep = PyUnicode_FromOrdinal(' ');
    9435           0 :             if (!sep)
    9436           0 :                 goto onError;
    9437           0 :             seplen = 1;
    9438           0 :             maxchar = 32;
    9439             :         }
    9440             :         else {
    9441     8502180 :             if (!PyUnicode_Check(separator)) {
    9442           0 :                 PyErr_Format(PyExc_TypeError,
    9443             :                              "separator: expected str instance,"
    9444             :                              " %.80s found",
    9445           0 :                              Py_TYPE(separator)->tp_name);
    9446           0 :                 goto onError;
    9447             :             }
    9448     8502180 :             sep = separator;
    9449     8502180 :             seplen = PyUnicode_GET_LENGTH(separator);
    9450     8502180 :             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
    9451             :             /* inc refcount to keep this code path symmetric with the
    9452             :                above case of a blank separator */
    9453     8502180 :             Py_INCREF(sep);
    9454             :         }
    9455     8502180 :         last_obj = sep;
    9456             :     }
    9457             : 
    9458             :     /* There are at least two things to join, or else we have a subclass
    9459             :      * of str in the sequence.
    9460             :      * Do a pre-pass to figure out the total amount of space we'll
    9461             :      * need (sz), and see whether all argument are strings.
    9462             :      */
    9463     8519900 :     sz = 0;
    9464             : #ifdef Py_DEBUG
    9465     8519900 :     use_memcpy = 0;
    9466             : #else
    9467             :     use_memcpy = 1;
    9468             : #endif
    9469    88294600 :     for (i = 0; i < seqlen; i++) {
    9470             :         size_t add_sz;
    9471    79774700 :         item = items[i];
    9472    79774700 :         if (!PyUnicode_Check(item)) {
    9473          18 :             PyErr_Format(PyExc_TypeError,
    9474             :                          "sequence item %zd: expected str instance,"
    9475             :                          " %.80s found",
    9476          18 :                          i, Py_TYPE(item)->tp_name);
    9477          18 :             goto onError;
    9478             :         }
    9479    79774700 :         add_sz = PyUnicode_GET_LENGTH(item);
    9480    79774700 :         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
    9481    79774700 :         maxchar = Py_MAX(maxchar, item_maxchar);
    9482    79774700 :         if (i != 0) {
    9483    71254800 :             add_sz += seplen;
    9484             :         }
    9485    79774700 :         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
    9486           0 :             PyErr_SetString(PyExc_OverflowError,
    9487             :                             "join() result is too long for a Python string");
    9488           0 :             goto onError;
    9489             :         }
    9490    79774700 :         sz += add_sz;
    9491    79774700 :         if (use_memcpy && last_obj != NULL) {
    9492           0 :             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
    9493           0 :                 use_memcpy = 0;
    9494             :         }
    9495    79774700 :         last_obj = item;
    9496             :     }
    9497             : 
    9498     8519880 :     res = PyUnicode_New(sz, maxchar);
    9499     8519880 :     if (res == NULL)
    9500           0 :         goto onError;
    9501             : 
    9502             :     /* Catenate everything. */
    9503             : #ifdef Py_DEBUG
    9504     8519880 :     use_memcpy = 0;
    9505             : #else
    9506             :     if (use_memcpy) {
    9507             :         res_data = PyUnicode_1BYTE_DATA(res);
    9508             :         kind = PyUnicode_KIND(res);
    9509             :         if (seplen != 0)
    9510             :             sep_data = PyUnicode_1BYTE_DATA(sep);
    9511             :     }
    9512             : #endif
    9513     8519880 :     if (use_memcpy) {
    9514           0 :         for (i = 0; i < seqlen; ++i) {
    9515             :             Py_ssize_t itemlen;
    9516           0 :             item = items[i];
    9517             : 
    9518             :             /* Copy item, and maybe the separator. */
    9519           0 :             if (i && seplen != 0) {
    9520           0 :                 memcpy(res_data,
    9521             :                           sep_data,
    9522           0 :                           kind * seplen);
    9523           0 :                 res_data += kind * seplen;
    9524             :             }
    9525             : 
    9526           0 :             itemlen = PyUnicode_GET_LENGTH(item);
    9527           0 :             if (itemlen != 0) {
    9528           0 :                 memcpy(res_data,
    9529           0 :                           PyUnicode_DATA(item),
    9530           0 :                           kind * itemlen);
    9531           0 :                 res_data += kind * itemlen;
    9532             :             }
    9533             :         }
    9534           0 :         assert(res_data == PyUnicode_1BYTE_DATA(res)
    9535             :                            + kind * PyUnicode_GET_LENGTH(res));
    9536             :     }
    9537             :     else {
    9538    88294500 :         for (i = 0, res_offset = 0; i < seqlen; ++i) {
    9539             :             Py_ssize_t itemlen;
    9540    79774700 :             item = items[i];
    9541             : 
    9542             :             /* Copy item, and maybe the separator. */
    9543    79774700 :             if (i && seplen != 0) {
    9544     7162680 :                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
    9545     7162680 :                 res_offset += seplen;
    9546             :             }
    9547             : 
    9548    79774700 :             itemlen = PyUnicode_GET_LENGTH(item);
    9549    79774700 :             if (itemlen != 0) {
    9550    77730700 :                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
    9551    77730700 :                 res_offset += itemlen;
    9552             :             }
    9553             :         }
    9554     8519880 :         assert(res_offset == PyUnicode_GET_LENGTH(res));
    9555             :     }
    9556             : 
    9557     8519880 :     Py_XDECREF(sep);
    9558     8519880 :     assert(_PyUnicode_CheckConsistency(res, 1));
    9559     8519880 :     return res;
    9560             : 
    9561          18 :   onError:
    9562          18 :     Py_XDECREF(sep);
    9563          18 :     Py_XDECREF(res);
    9564          18 :     return NULL;
    9565             : }
    9566             : 
    9567             : void
    9568       13948 : _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
    9569             :                     Py_UCS4 fill_char)
    9570             : {
    9571       13948 :     const int kind = PyUnicode_KIND(unicode);
    9572       13948 :     void *data = PyUnicode_DATA(unicode);
    9573       13948 :     assert(unicode_modifiable(unicode));
    9574       13948 :     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
    9575       13948 :     assert(start >= 0);
    9576       13948 :     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
    9577       13948 :     unicode_fill(kind, data, fill_char, start, length);
    9578       13948 : }
    9579             : 
    9580             : Py_ssize_t
    9581        7381 : PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
    9582             :                Py_UCS4 fill_char)
    9583             : {
    9584             :     Py_ssize_t maxlen;
    9585             : 
    9586        7381 :     if (!PyUnicode_Check(unicode)) {
    9587           0 :         PyErr_BadInternalCall();
    9588           0 :         return -1;
    9589             :     }
    9590        7381 :     if (unicode_check_modifiable(unicode))
    9591           0 :         return -1;
    9592             : 
    9593        7381 :     if (start < 0) {
    9594           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    9595           0 :         return -1;
    9596             :     }
    9597        7381 :     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
    9598           0 :         PyErr_SetString(PyExc_ValueError,
    9599             :                          "fill character is bigger than "
    9600             :                          "the string maximum character");
    9601           0 :         return -1;
    9602             :     }
    9603             : 
    9604        7381 :     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
    9605        7381 :     length = Py_MIN(maxlen, length);
    9606        7381 :     if (length <= 0)
    9607           0 :         return 0;
    9608             : 
    9609        7381 :     _PyUnicode_FastFill(unicode, start, length, fill_char);
    9610        7381 :     return length;
    9611             : }
    9612             : 
    9613             : static PyObject *
    9614       68202 : pad(PyObject *self,
    9615             :     Py_ssize_t left,
    9616             :     Py_ssize_t right,
    9617             :     Py_UCS4 fill)
    9618             : {
    9619             :     PyObject *u;
    9620             :     Py_UCS4 maxchar;
    9621             :     int kind;
    9622             :     void *data;
    9623             : 
    9624       68202 :     if (left < 0)
    9625           0 :         left = 0;
    9626       68202 :     if (right < 0)
    9627           0 :         right = 0;
    9628             : 
    9629       68202 :     if (left == 0 && right == 0)
    9630           0 :         return unicode_result_unchanged(self);
    9631             : 
    9632       68202 :     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
    9633       68202 :         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
    9634           0 :         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
    9635           0 :         return NULL;
    9636             :     }
    9637       68202 :     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    9638       68202 :     maxchar = Py_MAX(maxchar, fill);
    9639       68202 :     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
    9640       68202 :     if (!u)
    9641           0 :         return NULL;
    9642             : 
    9643       68202 :     kind = PyUnicode_KIND(u);
    9644       68202 :     data = PyUnicode_DATA(u);
    9645       68202 :     if (left)
    9646       42718 :         unicode_fill(kind, data, fill, 0, left);
    9647       68202 :     if (right)
    9648       26738 :         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
    9649       68202 :     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
    9650       68202 :     assert(_PyUnicode_CheckConsistency(u, 1));
    9651       68202 :     return u;
    9652             : }
    9653             : 
                       /* Build a list from `string` by calling the splitlines helper that
                        * matches the string's storage kind.
                        *
                        *   string    object to split; ensure_unicode() is called on it first and
                        *             NULL is returned when that call reports a negative result.
                        *   keepends  passed through unchanged to the helper.
                        *             NOTE(review): presumably controls whether line terminators
                        *             are kept in the items -- the handling lives in the
                        *             *_splitlines helpers, which are not visible here.
                        *
                        * Returns whatever the selected helper returned.
                        */
    9654             : PyObject *
    9655      330017 : PyUnicode_Splitlines(PyObject *string, int keepends)
    9656             : {
    9657             :     PyObject *list;
    9658             : 
    9659      330017 :     if (ensure_unicode(string) < 0)
    9660           0 :         return NULL;
    9661             : 
                           /* One arm per storage kind.  1-byte strings use the ASCII helper
                              only when PyUnicode_IS_ASCII() holds, otherwise the UCS1 one. */
    9662      330017 :     switch (PyUnicode_KIND(string)) {
    9663      160322 :     case PyUnicode_1BYTE_KIND:
    9664      160322 :         if (PyUnicode_IS_ASCII(string))
    9665      319110 :             list = asciilib_splitlines(
    9666      159555 :                 string, PyUnicode_1BYTE_DATA(string),
    9667             :                 PyUnicode_GET_LENGTH(string), keepends);
    9668             :         else
    9669        1534 :             list = ucs1lib_splitlines(
    9670         767 :                 string, PyUnicode_1BYTE_DATA(string),
    9671             :                 PyUnicode_GET_LENGTH(string), keepends);
    9672      160322 :         break;
    9673      168900 :     case PyUnicode_2BYTE_KIND:
    9674      337800 :         list = ucs2lib_splitlines(
    9675      168900 :             string, PyUnicode_2BYTE_DATA(string),
    9676             :             PyUnicode_GET_LENGTH(string), keepends);
    9677      168900 :         break;
    9678         795 :     case PyUnicode_4BYTE_KIND:
    9679        1590 :         list = ucs4lib_splitlines(
    9680         795 :             string, PyUnicode_4BYTE_DATA(string),
    9681             :             PyUnicode_GET_LENGTH(string), keepends);
    9682         795 :         break;
    9683           0 :     default:
    9684           0 :         Py_UNREACHABLE();
    9685             :     }
    9686      330017 :     return list;
    9687             : }
    9688             : 
                       /* Split `self` and return the pieces as a list.
                        *
                        *   self       string to split.
                        *   substring  separator, or NULL to use the *_split_whitespace helpers.
                        *   maxcount   forwarded to the helpers; a negative value is replaced by
                        *              PY_SSIZE_T_MAX first.
                        *
                        * Returns the list produced by the selected helper, a one-element list
                        * holding `self` when the separator cannot occur in it, or NULL when
                        * PyList_New() or unicode_askind() returns NULL.
                        */
    9689             : static PyObject *
    9690     2370380 : split(PyObject *self,
    9691             :       PyObject *substring,
    9692             :       Py_ssize_t maxcount)
    9693             : {
    9694             :     int kind1, kind2;
    9695             :     const void *buf1, *buf2;
    9696             :     Py_ssize_t len1, len2;
    9697             :     PyObject* out;
    9698             : 
    9699     2370380 :     if (maxcount < 0)
    9700     2293180 :         maxcount = PY_SSIZE_T_MAX;
    9701             : 
                           /* No separator given.  The switch below is the unbraced body of
                              this `if`; every arm returns (or hits Py_UNREACHABLE()), so the
                              code after it only runs when substring is non-NULL. */
    9702     2370380 :     if (substring == NULL)
    9703      478186 :         switch (PyUnicode_KIND(self)) {
    9704      478185 :         case PyUnicode_1BYTE_KIND:
    9705      478185 :             if (PyUnicode_IS_ASCII(self))
    9706      956352 :                 return asciilib_split_whitespace(
    9707      478176 :                     self,  PyUnicode_1BYTE_DATA(self),
    9708             :                     PyUnicode_GET_LENGTH(self), maxcount
    9709             :                     );
    9710             :             else
    9711          18 :                 return ucs1lib_split_whitespace(
    9712           9 :                     self,  PyUnicode_1BYTE_DATA(self),
    9713             :                     PyUnicode_GET_LENGTH(self), maxcount
    9714             :                     );
    9715           1 :         case PyUnicode_2BYTE_KIND:
    9716           2 :             return ucs2lib_split_whitespace(
    9717           1 :                 self,  PyUnicode_2BYTE_DATA(self),
    9718             :                 PyUnicode_GET_LENGTH(self), maxcount
    9719             :                 );
    9720           0 :         case PyUnicode_4BYTE_KIND:
    9721           0 :             return ucs4lib_split_whitespace(
    9722           0 :                 self,  PyUnicode_4BYTE_DATA(self),
    9723             :                 PyUnicode_GET_LENGTH(self), maxcount
    9724             :                 );
    9725           0 :         default:
    9726           0 :             Py_UNREACHABLE();
    9727             :         }
    9728             : 
    9729     1892200 :     kind1 = PyUnicode_KIND(self);
    9730     1892200 :     kind2 = PyUnicode_KIND(substring);
    9731     1892200 :     len1 = PyUnicode_GET_LENGTH(self);
    9732     1892200 :     len2 = PyUnicode_GET_LENGTH(substring);
                           /* Separator has a larger kind than self, or is longer than self:
                              skip the search and return [self].  The Py_INCREF supplies the
                              reference that is handed to the list slot. */
    9733     1892200 :     if (kind1 < kind2 || len1 < len2) {
    9734       18212 :         out = PyList_New(1);
    9735       18212 :         if (out == NULL)
    9736           0 :             return NULL;
    9737       18212 :         Py_INCREF(self);
    9738       18212 :         PyList_SET_ITEM(out, 0, self);
    9739       18212 :         return out;
    9740             :     }
    9741     1873980 :     buf1 = PyUnicode_DATA(self);
    9742     1873980 :     buf2 = PyUnicode_DATA(substring);
                           /* From here kind2 <= kind1.  When they differ, buf2 is replaced by
                              the buffer unicode_askind() returns for kind1; that buffer is
                              released with PyMem_Free() just before returning. */
    9743     1873980 :     if (kind2 != kind1) {
    9744       19275 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
    9745       19275 :         if (!buf2)
    9746           0 :             return NULL;
    9747             :     }
    9748             : 
    9749     1873980 :     switch (kind1) {
    9750     1854700 :     case PyUnicode_1BYTE_KIND:
    9751     1854700 :         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
    9752     1848220 :             out = asciilib_split(
    9753             :                 self,  buf1, len1, buf2, len2, maxcount);
    9754             :         else
    9755        6470 :             out = ucs1lib_split(
    9756             :                 self,  buf1, len1, buf2, len2, maxcount);
    9757     1854700 :         break;
    9758       16849 :     case PyUnicode_2BYTE_KIND:
    9759       16849 :         out = ucs2lib_split(
    9760             :             self,  buf1, len1, buf2, len2, maxcount);
    9761       16849 :         break;
    9762        2440 :     case PyUnicode_4BYTE_KIND:
    9763        2440 :         out = ucs4lib_split(
    9764             :             self,  buf1, len1, buf2, len2, maxcount);
    9765        2440 :         break;
                           /* NOTE(review): this arm yields NULL without setting an exception
                              in this function, unlike the Py_UNREACHABLE() used in the
                              whitespace switch above -- confirm it cannot be reached. */
    9766           0 :     default:
    9767           0 :         out = NULL;
    9768             :     }
    9769     1873980 :     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
    9770     1873980 :     if (kind2 != kind1)
    9771       19275 :         PyMem_Free((void *)buf2);
    9772     1873980 :     return out;
    9773             : }
    9774             : 
                       /* Counterpart of split() that calls the *_rsplit helpers instead.
                        *
                        *   self       string to split.
                        *   substring  separator, or NULL to use the *_rsplit_whitespace helpers.
                        *   maxcount   forwarded to the helpers; a negative value is replaced by
                        *              PY_SSIZE_T_MAX first.
                        *
                        * Returns the list produced by the selected helper, a one-element list
                        * holding `self` when the separator cannot occur in it, or NULL when
                        * PyList_New() or unicode_askind() returns NULL.
                        */
    9775             : static PyObject *
    9776        3824 : rsplit(PyObject *self,
    9777             :        PyObject *substring,
    9778             :        Py_ssize_t maxcount)
    9779             : {
    9780             :     int kind1, kind2;
    9781             :     const void *buf1, *buf2;
    9782             :     Py_ssize_t len1, len2;
    9783             :     PyObject* out;
    9784             : 
    9785        3824 :     if (maxcount < 0)
    9786         112 :         maxcount = PY_SSIZE_T_MAX;
    9787             : 
                           /* No separator given.  The switch below is the unbraced body of
                              this `if`; every arm returns (or hits Py_UNREACHABLE()), so the
                              code after it only runs when substring is non-NULL. */
    9788        3824 :     if (substring == NULL)
    9789          76 :         switch (PyUnicode_KIND(self)) {
    9790          76 :         case PyUnicode_1BYTE_KIND:
    9791          76 :             if (PyUnicode_IS_ASCII(self))
    9792         152 :                 return asciilib_rsplit_whitespace(
    9793          76 :                     self,  PyUnicode_1BYTE_DATA(self),
    9794             :                     PyUnicode_GET_LENGTH(self), maxcount
    9795             :                     );
    9796             :             else
    9797           0 :                 return ucs1lib_rsplit_whitespace(
    9798           0 :                     self,  PyUnicode_1BYTE_DATA(self),
    9799             :                     PyUnicode_GET_LENGTH(self), maxcount
    9800             :                     );
    9801           0 :         case PyUnicode_2BYTE_KIND:
    9802           0 :             return ucs2lib_rsplit_whitespace(
    9803           0 :                 self,  PyUnicode_2BYTE_DATA(self),
    9804             :                 PyUnicode_GET_LENGTH(self), maxcount
    9805             :                 );
    9806           0 :         case PyUnicode_4BYTE_KIND:
    9807           0 :             return ucs4lib_rsplit_whitespace(
    9808           0 :                 self,  PyUnicode_4BYTE_DATA(self),
    9809             :                 PyUnicode_GET_LENGTH(self), maxcount
    9810             :                 );
    9811           0 :         default:
    9812           0 :             Py_UNREACHABLE();
    9813             :         }
    9814             : 
    9815        3748 :     kind1 = PyUnicode_KIND(self);
    9816        3748 :     kind2 = PyUnicode_KIND(substring);
    9817        3748 :     len1 = PyUnicode_GET_LENGTH(self);
    9818        3748 :     len2 = PyUnicode_GET_LENGTH(substring);
                           /* Separator has a larger kind than self, or is longer than self:
                              skip the search and return [self].  The Py_INCREF supplies the
                              reference that is handed to the list slot. */
    9819        3748 :     if (kind1 < kind2 || len1 < len2) {
    9820          13 :         out = PyList_New(1);
    9821          13 :         if (out == NULL)
    9822           0 :             return NULL;
    9823          13 :         Py_INCREF(self);
    9824          13 :         PyList_SET_ITEM(out, 0, self);
    9825          13 :         return out;
    9826             :     }
    9827        3735 :     buf1 = PyUnicode_DATA(self);
    9828        3735 :     buf2 = PyUnicode_DATA(substring);
                           /* From here kind2 <= kind1.  When they differ, buf2 is replaced by
                              the buffer unicode_askind() returns for kind1; that buffer is
                              released with PyMem_Free() just before returning. */
    9829        3735 :     if (kind2 != kind1) {
    9830          12 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
    9831          12 :         if (!buf2)
    9832           0 :             return NULL;
    9833             :     }
    9834             : 
    9835        3735 :     switch (kind1) {
    9836        3709 :     case PyUnicode_1BYTE_KIND:
    9837        3709 :         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
    9838        3706 :             out = asciilib_rsplit(
    9839             :                 self,  buf1, len1, buf2, len2, maxcount);
    9840             :         else
    9841           3 :             out = ucs1lib_rsplit(
    9842             :                 self,  buf1, len1, buf2, len2, maxcount);
    9843        3709 :         break;
    9844          10 :     case PyUnicode_2BYTE_KIND:
    9845          10 :         out = ucs2lib_rsplit(
    9846             :             self,  buf1, len1, buf2, len2, maxcount);
    9847          10 :         break;
    9848          16 :     case PyUnicode_4BYTE_KIND:
    9849          16 :         out = ucs4lib_rsplit(
    9850             :             self,  buf1, len1, buf2, len2, maxcount);
    9851          16 :         break;
                           /* NOTE(review): this arm yields NULL without setting an exception
                              in this function, unlike the Py_UNREACHABLE() used in the
                              whitespace switch above -- confirm it cannot be reached. */
    9852           0 :     default:
    9853           0 :         out = NULL;
    9854             :     }
    9855        3735 :     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
    9856        3735 :     if (kind2 != kind1)
    9857          12 :         PyMem_Free((void *)buf2);
    9858        3735 :     return out;
    9859             : }
    9860             : 
                       /* Kind-dispatching wrapper around the *_find helpers.
                        *
                        *   kind        storage kind shared by buf1 and buf2; selects the helper.
                        *   str1, str2  only consulted for PyUnicode_IS_ASCII(): the ASCII helper
                        *               is used for 1-byte data when both are ASCII.
                        *   buf1/len1   first buffer and length handed to the helper (replace()
                        *               passes the text being searched here).
                        *   buf2/len2   second buffer and length (replace() passes the pattern).
                        *   offset      forwarded unchanged.
                        *               NOTE(review): replace() advances buf1 by `i` characters,
                        *               passes `i` as offset and then uses the result as an index
                        *               into the whole string, so the helper presumably adds
                        *               offset to a match position -- confirm in the find helper.
                        *
                        * Returns the helper's result; replace() treats a negative value as
                        * "no match".  Any other kind reaches Py_UNREACHABLE().
                        */
    9861             : static Py_ssize_t
    9862      576580 : anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
    9863             :             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
    9864             : {
    9865      576580 :     switch (kind) {
    9866      576230 :     case PyUnicode_1BYTE_KIND:
    9867      576230 :         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
    9868      564106 :             return asciilib_find(buf1, len1, buf2, len2, offset);
    9869             :         else
    9870       12124 :             return ucs1lib_find(buf1, len1, buf2, len2, offset);
    9871         329 :     case PyUnicode_2BYTE_KIND:
    9872         329 :         return ucs2lib_find(buf1, len1, buf2, len2, offset);
    9873          21 :     case PyUnicode_4BYTE_KIND:
    9874          21 :         return ucs4lib_find(buf1, len1, buf2, len2, offset);
    9875             :     }
    9876           0 :     Py_UNREACHABLE();
    9877             : }
    9878             : 
                       /* Kind-dispatching wrapper around the *_count helpers.
                        *
                        *   kind        storage kind shared by sbuf and buf1; selects the helper.
                        *   sstr, str1  only consulted for PyUnicode_IS_ASCII(): the ASCII helper
                        *               is used for 1-byte data when both are ASCII.
                        *   sbuf/slen   first buffer and length handed to the helper (replace()
                        *               passes the text being searched here).
                        *   buf1/len1   second buffer and length (replace() passes the pattern).
                        *   maxcount    forwarded unchanged to the helper.
                        *
                        * Returns the helper's result; replace() treats 0 as "nothing to
                        * replace".  Any other kind reaches Py_UNREACHABLE().
                        */
    9879             : static Py_ssize_t
    9880     2356470 : anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
    9881             :              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
    9882             : {
    9883     2356470 :     switch (kind) {
    9884     2353310 :     case PyUnicode_1BYTE_KIND:
    9885     2353310 :         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
    9886     2349960 :             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
    9887             :         else
    9888        3348 :             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
    9889        2344 :     case PyUnicode_2BYTE_KIND:
    9890        2344 :         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
    9891         823 :     case PyUnicode_4BYTE_KIND:
    9892         823 :         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
    9893             :     }
    9894           0 :     Py_UNREACHABLE();
    9895             : }
    9896             : 
                       /* Kind-dispatching wrapper around the *_replace_1char_inplace helpers.
                        *
                        *   u         string whose character data is modified in place.
                        *   pos       index of the first character handed to the helper.
                        *   u1, u2    forwarded to the helper; replace() passes the character to
                        *             look for as u1 and its replacement as u2.
                        *   maxcount  forwarded unchanged to the helper.
                        *
                        * The helper receives the range [data + pos, data + len), with the
                        * pointers cast to the element type of u's kind.  Any kind other than
                        * 1-byte or 2-byte is asserted to be the 4-byte kind.
                        */
    9897             : static void
    9898       35157 : replace_1char_inplace(PyObject *u, Py_ssize_t pos,
    9899             :                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
    9900             : {
    9901       35157 :     int kind = PyUnicode_KIND(u);
    9902       35157 :     void *data = PyUnicode_DATA(u);
    9903       35157 :     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
    9904       35157 :     if (kind == PyUnicode_1BYTE_KIND) {
    9905       35124 :         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
    9906             :                                       (Py_UCS1 *)data + len,
    9907             :                                       u1, u2, maxcount);
    9908             :     }
    9909          33 :     else if (kind == PyUnicode_2BYTE_KIND) {
    9910          14 :         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
    9911          14 :                                       (Py_UCS2 *)data + len,
    9912             :                                       u1, u2, maxcount);
    9913             :     }
    9914             :     else {
    9915          19 :         assert(kind == PyUnicode_4BYTE_KIND);
    9916          19 :         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
    9917          19 :                                       (Py_UCS4 *)data + len,
    9918             :                                       u1, u2, maxcount);
    9919             :     }
    9920       35157 : }
    9921             : 
                       /* Return a string built from `self` with occurrences of `str1`
                        * replaced by `str2`.
                        *
                        *   self      string to search.
                        *   str1      text to look for.
                        *   str2      replacement text.
                        *   maxcount  upper bound on the number of replacements; a negative
                        *             value is replaced by PY_SSIZE_T_MAX.
                        *
                        * Exits:
                        *   done     returns the newly built string `u`.
                        *   nothing  returns unicode_result_unchanged(self).  Taken when str1 is
                        *            longer than self, maxcount is 0, str1 and str2 are the same
                        *            object, str1's max char exceeds self's, both patterns are
                        *            empty, or no match is found.
                        *   error    returns NULL.  Taken when PyUnicode_New(), unicode_askind()
                        *            or unicode_adjust_maxchar() fails, or after OverflowError
                        *            ("replace string is too long") has been set.
                        *
                        * Buffer ownership: sbuf, buf1 and buf2 start out pointing at the data
                        * of self, str1 and str2.  Whenever one of them is replaced by a buffer
                        * from unicode_askind(), the matching srelease / release1 / release2
                        * flag is set, and each exit label frees flagged buffers with
                        * PyMem_Free().
                        *
                        * Three strategies are used: single-character patterns of equal length,
                        * longer patterns of equal length (overwrite in a copy of self), and
                        * patterns of different length (count matches, size the result, then
                        * copy segments).
                        */
    9922             : static PyObject *
    9923     3239390 : replace(PyObject *self, PyObject *str1,
    9924             :         PyObject *str2, Py_ssize_t maxcount)
    9925             : {
    9926             :     PyObject *u;
    9927     3239390 :     const char *sbuf = PyUnicode_DATA(self);
    9928     3239390 :     const void *buf1 = PyUnicode_DATA(str1);
    9929     3239390 :     const void *buf2 = PyUnicode_DATA(str2);
    9930     3239390 :     int srelease = 0, release1 = 0, release2 = 0;
    9931     3239390 :     int skind = PyUnicode_KIND(self);
    9932     3239390 :     int kind1 = PyUnicode_KIND(str1);
    9933     3239390 :     int kind2 = PyUnicode_KIND(str2);
    9934     3239390 :     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    9935     3239390 :     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    9936     3239390 :     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    9937             :     int mayshrink;
    9938             :     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
    9939             : 
                           /* A pattern longer than the text cannot match. */
    9940     3239390 :     if (slen < len1)
    9941      769592 :         goto nothing;
    9942             : 
    9943     2469800 :     if (maxcount < 0)
    9944     2469660 :         maxcount = PY_SSIZE_T_MAX;
    9945         141 :     else if (maxcount == 0)
    9946          33 :         goto nothing;
    9947             : 
                           /* Pointer comparison: pattern and replacement are one object. */
    9948     2469770 :     if (str1 == str2)
    9949       12519 :         goto nothing;
    9950             : 
    9951     2457250 :     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    9952     2457250 :     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    9953     2457250 :     if (maxchar < maxchar_str1)
    9954             :         /* substring too wide to be present */
    9955          36 :         goto nothing;
    9956     2457210 :     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    9957             :     /* Replacing str1 with str2 may cause a maxchar reduction in the
    9958             :        result string. */
    9959     2457210 :     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
                           /* From here on maxchar is the value used to allocate the result. */
    9960     2457210 :     maxchar = Py_MAX(maxchar, maxchar_str2);
    9961             : 
    9962     2457210 :     if (len1 == len2) {
    9963             :         /* same length */
    9964      100741 :         if (len1 == 0)
    9965           0 :             goto nothing;
    9966      100741 :         if (len1 == 1) {
    9967             :             /* replace characters */
    9968             :             Py_UCS4 u1, u2;
    9969             :             Py_ssize_t pos;
    9970             : 
                                   /* Locate one occurrence first so that an unchanged string
                                      never needs a copy; the helper then works from pos on. */
    9971       99432 :             u1 = PyUnicode_READ(kind1, buf1, 0);
    9972       99432 :             pos = findchar(sbuf, skind, slen, u1, 1);
    9973       99432 :             if (pos < 0)
    9974       64275 :                 goto nothing;
    9975       35157 :             u2 = PyUnicode_READ(kind2, buf2, 0);
    9976       35157 :             u = PyUnicode_New(slen, maxchar);
    9977       35157 :             if (!u)
    9978           0 :                 goto error;
    9979             : 
    9980       35157 :             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
    9981       35157 :             replace_1char_inplace(u, pos, u1, u2, maxcount);
    9982             :         }
    9983             :         else {
                                   /* rkind is the kind of the result; it is also used as the
                                      per-character byte multiplier in the memcpy() calls. */
    9984        1309 :             int rkind = skind;
    9985             :             char *res;
    9986             :             Py_ssize_t i;
    9987             : 
    9988        1309 :             if (kind1 < rkind) {
    9989             :                 /* widen substring */
    9990           0 :                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
    9991           0 :                 if (!buf1) goto error;
    9992           0 :                 release1 = 1;
    9993             :             }
    9994        1309 :             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
    9995        1309 :             if (i < 0)
    9996         881 :                 goto nothing;
    9997         428 :             if (rkind > kind2) {
    9998             :                 /* widen replacement */
    9999           0 :                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
   10000           0 :                 if (!buf2) goto error;
   10001           0 :                 release2 = 1;
   10002             :             }
   10003         428 :             else if (rkind < kind2) {
   10004             :                 /* widen self and buf1 */
   10005           0 :                 rkind = kind2;
                                       /* Drop the buffer widened above and start again from
                                          str1's own data before widening to the new rkind. */
   10006           0 :                 if (release1) {
   10007           0 :                     assert(buf1 != PyUnicode_DATA(str1));
   10008           0 :                     PyMem_Free((void *)buf1);
   10009           0 :                     buf1 = PyUnicode_DATA(str1);
   10010           0 :                     release1 = 0;
   10011             :                 }
   10012           0 :                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
   10013           0 :                 if (!sbuf) goto error;
   10014           0 :                 srelease = 1;
   10015           0 :                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
   10016           0 :                 if (!buf1) goto error;
   10017           0 :                 release1 = 1;
   10018             :             }
   10019         428 :             u = PyUnicode_New(slen, maxchar);
   10020         428 :             if (!u)
   10021           0 :                 goto error;
   10022         428 :             assert(PyUnicode_KIND(u) == rkind);
   10023         428 :             res = PyUnicode_DATA(u);
   10024             : 
   10025         428 :             memcpy(res, sbuf, rkind * slen);
   10026             :             /* change everything in-place, starting with this one */
   10027         428 :             memcpy(res + rkind * i,
   10028             :                    buf2,
   10029         428 :                    rkind * len2);
   10030         428 :             i += len1;
   10031             : 
                                   /* One replacement is already done, hence the pre-decrement
                                      before comparing against zero. */
   10032         455 :             while ( --maxcount > 0) {
   10033         449 :                 i = anylib_find(rkind, self,
   10034         449 :                                 sbuf+rkind*i, slen-i,
   10035             :                                 str1, buf1, len1, i);
   10036         449 :                 if (i == -1)
   10037         422 :                     break;
   10038          27 :                 memcpy(res + rkind * i,
   10039             :                        buf2,
   10040          27 :                        rkind * len2);
   10041          27 :                 i += len1;
   10042             :             }
   10043             :         }
   10044             :     }
   10045             :     else {
                               /* n: match count; i: read index in sbuf; j: index of the
                                  current match; ires: write index in res. */
   10046             :         Py_ssize_t n, i, j, ires;
   10047             :         Py_ssize_t new_size;
   10048     2356470 :         int rkind = skind;
   10049             :         char *res;
   10050             : 
   10051     2356470 :         if (kind1 < rkind) {
   10052             :             /* widen substring */
   10053        2831 :             buf1 = unicode_askind(kind1, buf1, len1, rkind);
   10054        2831 :             if (!buf1) goto error;
   10055        2831 :             release1 = 1;
   10056             :         }
   10057     2356470 :         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
   10058     2356470 :         if (n == 0)
   10059     2073350 :             goto nothing;
   10060      283125 :         if (kind2 < rkind) {
   10061             :             /* widen replacement */
   10062         410 :             buf2 = unicode_askind(kind2, buf2, len2, rkind);
   10063         410 :             if (!buf2) goto error;
   10064         410 :             release2 = 1;
   10065             :         }
   10066      282715 :         else if (kind2 > rkind) {
   10067             :             /* widen self and buf1 */
   10068           5 :             rkind = kind2;
   10069           5 :             sbuf = unicode_askind(skind, sbuf, slen, rkind);
   10070           5 :             if (!sbuf) goto error;
   10071           5 :             srelease = 1;
                                   /* Drop the buffer widened above and start again from
                                      str1's own data before widening to the new rkind. */
   10072           5 :             if (release1) {
   10073           1 :                 assert(buf1 != PyUnicode_DATA(str1));
   10074           1 :                 PyMem_Free((void *)buf1);
   10075           1 :                 buf1 = PyUnicode_DATA(str1);
   10076           1 :                 release1 = 0;
   10077             :             }
   10078           5 :             buf1 = unicode_askind(kind1, buf1, len1, rkind);
   10079           5 :             if (!buf1) goto error;
   10080           5 :             release1 = 1;
   10081             :         }
   10082             :         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
   10083             :            PyUnicode_GET_LENGTH(str1)); */
                               /* Reject a result length above PY_SSIZE_T_MAX before it is
                                  computed; n is non-zero here because n == 0 left above. */
   10084      283125 :         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
   10085           0 :                 PyErr_SetString(PyExc_OverflowError,
   10086             :                                 "replace string is too long");
   10087           0 :                 goto error;
   10088             :         }
   10089      283125 :         new_size = slen + n * (len2 - len1);
                               /* An empty result goes straight to `done`, bypassing the
                                  mayshrink adjustment further down. */
   10090      283125 :         if (new_size == 0) {
   10091       10429 :             u = unicode_new_empty();
   10092       10429 :             goto done;
   10093             :         }
                               /* Guards the rkind * length products used as byte counts. */
   10094      272696 :         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
   10095           0 :             PyErr_SetString(PyExc_OverflowError,
   10096             :                             "replace string is too long");
   10097           0 :             goto error;
   10098             :         }
   10099      272696 :         u = PyUnicode_New(new_size, maxchar);
   10100      272696 :         if (!u)
   10101           0 :             goto error;
   10102      272696 :         assert(PyUnicode_KIND(u) == rkind);
   10103      272696 :         res = PyUnicode_DATA(u);
   10104      272696 :         ires = i = 0;
   10105      272696 :         if (len1 > 0) {
   10106      847490 :             while (n-- > 0) {
   10107             :                 /* look for next match */
   10108      574822 :                 j = anylib_find(rkind, self,
   10109      574822 :                                 sbuf + rkind * i, slen-i,
   10110             :                                 str1, buf1, len1, i);
   10111      574822 :                 if (j == -1)
   10112           0 :                     break;
   10113      574822 :                 else if (j > i) {
   10114             :                     /* copy unchanged part [i:j] */
   10115      540160 :                     memcpy(res + rkind * ires,
   10116      540160 :                            sbuf + rkind * i,
   10117      540160 :                            rkind * (j-i));
   10118      540160 :                     ires += j - i;
   10119             :                 }
   10120             :                 /* copy substitution string */
   10121      574822 :                 if (len2 > 0) {
   10122      291333 :                     memcpy(res + rkind * ires,
   10123             :                            buf2,
   10124      291333 :                            rkind * len2);
   10125      291333 :                     ires += len2;
   10126             :                 }
   10127      574822 :                 i = j + len1;
   10128             :             }
   10129      272668 :             if (i < slen)
   10130             :                 /* copy tail [i:] */
   10131       88655 :                 memcpy(res + rkind * ires,
   10132       88655 :                        sbuf + rkind * i,
   10133       88655 :                        rkind * (slen-i));
   10134             :         }
   10135             :         else {
   10136             :             /* interleave */
                                   /* Empty pattern: write str2, then one character of self,
                                      alternating until n copies of str2 have been written;
                                      the remainder of self is appended after the loop. */
   10137          66 :             while (n > 0) {
   10138          66 :                 memcpy(res + rkind * ires,
   10139             :                        buf2,
   10140          66 :                        rkind * len2);
   10141          66 :                 ires += len2;
   10142          66 :                 if (--n <= 0)
   10143          28 :                     break;
   10144          38 :                 memcpy(res + rkind * ires,
   10145          38 :                        sbuf + rkind * i,
   10146             :                        rkind);
   10147          38 :                 ires++;
   10148          38 :                 i++;
   10149             :             }
   10150          28 :             memcpy(res + rkind * ires,
   10151          28 :                    sbuf + rkind * i,
   10152          28 :                    rkind * (slen-i));
   10153             :         }
   10154             :     }
   10155             : 
                           /* unicode_adjust_maxchar() takes &u and may leave u NULL, which is
                              treated as an error. */
   10156      308281 :     if (mayshrink) {
   10157         140 :         unicode_adjust_maxchar(&u);
   10158         140 :         if (u == NULL)
   10159           0 :             goto error;
   10160             :     }
   10161             : 
   10162      308281 :   done:
   10163      318710 :     assert(srelease == (sbuf != PyUnicode_DATA(self)));
   10164      318710 :     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
   10165      318710 :     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
   10166      318710 :     if (srelease)
   10167           5 :         PyMem_Free((void *)sbuf);
   10168      318710 :     if (release1)
   10169         101 :         PyMem_Free((void *)buf1);
   10170      318710 :     if (release2)
   10171         410 :         PyMem_Free((void *)buf2);
   10172      318710 :     assert(_PyUnicode_CheckConsistency(u, 1));
   10173      318710 :     return u;
   10174             : 
   10175     2920680 :   nothing:
   10176             :     /* nothing to replace; return original string (when possible) */
   10177     2920680 :     assert(srelease == (sbuf != PyUnicode_DATA(self)));
   10178     2920680 :     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
   10179     2920680 :     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
   10180     2920680 :     if (srelease)
   10181           0 :         PyMem_Free((void *)sbuf);
   10182     2920680 :     if (release1)
   10183        2734 :         PyMem_Free((void *)buf1);
   10184     2920680 :     if (release2)
   10185           0 :         PyMem_Free((void *)buf2);
   10186     2920680 :     return unicode_result_unchanged(self);
   10187             : 
   10188           0 :   error:
   10189           0 :     assert(srelease == (sbuf != PyUnicode_DATA(self)));
   10190           0 :     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
   10191           0 :     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
   10192           0 :     if (srelease)
   10193           0 :         PyMem_Free((void *)sbuf);
   10194           0 :     if (release1)
   10195           0 :         PyMem_Free((void *)buf1);
   10196           0 :     if (release2)
   10197           0 :         PyMem_Free((void *)buf2);
   10198           0 :     return NULL;
   10199             : }
   10200             : 
   10201             : /* --- Unicode Object Methods --------------------------------------------- */
   10202             : 
   10203             : /*[clinic input]
   10204             : str.title as unicode_title
   10205             : 
   10206             : Return a version of the string where each word is titlecased.
   10207             : 
   10208             : More specifically, words start with uppercased characters and all remaining
   10209             : cased characters have lower case.
   10210             : [clinic start generated code]*/
   10211             : 
   10212             : static PyObject *
   10213     4458370 : unicode_title_impl(PyObject *self)
   10214             : /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
   10215             : {
                           /* All the work is delegated: the result of
                              case_operation(self, do_title) is returned as is. */
   10216     4458370 :     return case_operation(self, do_title);
   10217             : }
   10218             : 
   10219             : /*[clinic input]
   10220             : str.capitalize as unicode_capitalize
   10221             : 
   10222             : Return a capitalized version of the string.
   10223             : 
   10224             : More specifically, make the first character have upper case and the rest lower
   10225             : case.
   10226             : [clinic start generated code]*/
   10227             : 
   10228             : static PyObject *
   10229       24227 : unicode_capitalize_impl(PyObject *self)
   10230             : /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
   10231             : {
                           /* An empty string is returned through unicode_result_unchanged()
                              without calling case_operation(). */
   10232       24227 :     if (PyUnicode_GET_LENGTH(self) == 0)
   10233           3 :         return unicode_result_unchanged(self);
                           /* Otherwise the result of case_operation(self, do_capitalize) is
                              returned as is. */
   10234       24224 :     return case_operation(self, do_capitalize);
   10235             : }
   10236             : 
   10237             : /*[clinic input]
   10238             : str.casefold as unicode_casefold
   10239             : 
   10240             : Return a version of the string suitable for caseless comparisons.
   10241             : [clinic start generated code]*/
   10242             : 
   10243             : static PyObject *
   10244         700 : unicode_casefold_impl(PyObject *self)
   10245             : /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
   10246             : {
                           /* ASCII strings take a separate path through ascii_upper_or_lower().
                              NOTE(review): the second argument (1) presumably selects
                              lowercasing -- confirm against ascii_upper_or_lower(). */
   10247         700 :     if (PyUnicode_IS_ASCII(self))
   10248         694 :         return ascii_upper_or_lower(self, 1);
                           /* Every other string goes through case_operation() with
                              do_casefold. */
   10249           6 :     return case_operation(self, do_casefold);
   10250             : }
   10251             : 
   10252             : 
   10253             : /* Argument converter. Accepts a single Unicode character. */
                       /*   obj   object to convert; must pass PyUnicode_Check() and have a
                        *         length of exactly 1.
                        *   addr  treated as a Py_UCS4 *; receives the character on success.
                        *
                        * Returns 1 on success.  Returns 0 with TypeError set when obj is not a
                        * str or its length is not 1; *addr is not written in that case.
                        */
   10254             : 
   10255             : static int
   10256          63 : convert_uc(PyObject *obj, void *addr)
   10257             : {
   10258          63 :     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
   10259             : 
   10260          63 :     if (!PyUnicode_Check(obj)) {
   10261           0 :         PyErr_Format(PyExc_TypeError,
   10262             :                      "The fill character must be a unicode character, "
   10263           0 :                      "not %.100s", Py_TYPE(obj)->tp_name);
   10264           0 :         return 0;
   10265             :     }
   10266          63 :     if (PyUnicode_GET_LENGTH(obj) != 1) {
   10267           0 :         PyErr_SetString(PyExc_TypeError,
   10268             :                         "The fill character must be exactly one character long");
   10269           0 :         return 0;
   10270             :     }
   10271          63 :     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
   10272          63 :     return 1;
   10273             : }
   10274             : 
   10275             : /*[clinic input]
   10276             : str.center as unicode_center
   10277             : 
   10278             :     width: Py_ssize_t
   10279             :     fillchar: Py_UCS4 = ' '
   10280             :     /
   10281             : 
   10282             : Return a centered string of length width.
   10283             : 
   10284             : Padding is done using the specified fill character (default is a space).
   10285             : [clinic start generated code]*/
   10286             : 
   10287             : static PyObject *
   10288        7863 : unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   10289             : /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
   10290             : {
   10291             :     Py_ssize_t marg, left;
   10292             : 
   10293        7863 :     if (PyUnicode_GET_LENGTH(self) >= width)
   10294        6240 :         return unicode_result_unchanged(self);
   10295             : 
   10296        1623 :     marg = width - PyUnicode_GET_LENGTH(self);
   10297        1623 :     left = marg / 2 + (marg & width & 1);
   10298             : 
   10299        1623 :     return pad(self, left, marg - left, fillchar);
   10300             : }
   10301             : 
   10302             : /* This function assumes that str1 and str2 are readied by the caller. */
   10303             : 
   10304             : static int
   10305    30614000 : unicode_compare(PyObject *str1, PyObject *str2)
   10306             : {
   10307             : #define COMPARE(TYPE1, TYPE2) \
   10308             :     do { \
   10309             :         TYPE1* p1 = (TYPE1 *)data1; \
   10310             :         TYPE2* p2 = (TYPE2 *)data2; \
   10311             :         TYPE1* end = p1 + len; \
   10312             :         Py_UCS4 c1, c2; \
   10313             :         for (; p1 != end; p1++, p2++) { \
   10314             :             c1 = *p1; \
   10315             :             c2 = *p2; \
   10316             :             if (c1 != c2) \
   10317             :                 return (c1 < c2) ? -1 : 1; \
   10318             :         } \
   10319             :     } \
   10320             :     while (0)
   10321             : 
   10322             :     int kind1, kind2;
   10323             :     const void *data1, *data2;
   10324             :     Py_ssize_t len1, len2, len;
   10325             : 
   10326    30614000 :     kind1 = PyUnicode_KIND(str1);
   10327    30614000 :     kind2 = PyUnicode_KIND(str2);
   10328    30614000 :     data1 = PyUnicode_DATA(str1);
   10329    30614000 :     data2 = PyUnicode_DATA(str2);
   10330    30614000 :     len1 = PyUnicode_GET_LENGTH(str1);
   10331    30614000 :     len2 = PyUnicode_GET_LENGTH(str2);
   10332    30614000 :     len = Py_MIN(len1, len2);
   10333             : 
   10334    30614000 :     switch(kind1) {
   10335    30530900 :     case PyUnicode_1BYTE_KIND:
   10336             :     {
   10337             :         switch(kind2) {
   10338    30526100 :         case PyUnicode_1BYTE_KIND:
   10339             :         {
   10340    30526100 :             int cmp = memcmp(data1, data2, len);
   10341             :             /* normalize result of memcmp() into the range [-1; 1] */
   10342    30526100 :             if (cmp < 0)
   10343    16260100 :                 return -1;
   10344    14266000 :             if (cmp > 0)
   10345    13518500 :                 return 1;
   10346      747565 :             break;
   10347             :         }
   10348        4728 :         case PyUnicode_2BYTE_KIND:
   10349        4728 :             COMPARE(Py_UCS1, Py_UCS2);
   10350           0 :             break;
   10351           4 :         case PyUnicode_4BYTE_KIND:
   10352           4 :             COMPARE(Py_UCS1, Py_UCS4);
   10353           0 :             break;
   10354           0 :         default:
   10355           0 :             Py_UNREACHABLE();
   10356             :         }
   10357      747565 :         break;
   10358             :     }
   10359       82264 :     case PyUnicode_2BYTE_KIND:
   10360             :     {
   10361             :         switch(kind2) {
   10362        1306 :         case PyUnicode_1BYTE_KIND:
   10363        1306 :             COMPARE(Py_UCS2, Py_UCS1);
   10364           0 :             break;
   10365       80956 :         case PyUnicode_2BYTE_KIND:
   10366             :         {
   10367       92533 :             COMPARE(Py_UCS2, Py_UCS2);
   10368        2485 :             break;
   10369             :         }
   10370           2 :         case PyUnicode_4BYTE_KIND:
   10371           2 :             COMPARE(Py_UCS2, Py_UCS4);
   10372           0 :             break;
   10373           0 :         default:
   10374           0 :             Py_UNREACHABLE();
   10375             :         }
   10376        2485 :         break;
   10377             :     }
   10378         899 :     case PyUnicode_4BYTE_KIND:
   10379             :     {
   10380             :         switch(kind2) {
   10381           8 :         case PyUnicode_1BYTE_KIND:
   10382           8 :             COMPARE(Py_UCS4, Py_UCS1);
   10383           0 :             break;
   10384           7 :         case PyUnicode_2BYTE_KIND:
   10385           7 :             COMPARE(Py_UCS4, Py_UCS2);
   10386           0 :             break;
   10387         884 :         case PyUnicode_4BYTE_KIND:
   10388             :         {
   10389             : #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
   10390         884 :             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
   10391             :             /* normalize result of wmemcmp() into the range [-1; 1] */
   10392         884 :             if (cmp < 0)
   10393           2 :                 return -1;
   10394         882 :             if (cmp > 0)
   10395           0 :                 return 1;
   10396             : #else
   10397             :             COMPARE(Py_UCS4, Py_UCS4);
   10398             : #endif
   10399         882 :             break;
   10400             :         }
   10401           0 :         default:
   10402           0 :             Py_UNREACHABLE();
   10403             :         }
   10404         882 :         break;
   10405             :     }
   10406           0 :     default:
   10407           0 :         Py_UNREACHABLE();
   10408             :     }
   10409             : 
   10410      750932 :     if (len1 == len2)
   10411      220696 :         return 0;
   10412      530236 :     if (len1 < len2)
   10413      161664 :         return -1;
   10414             :     else
   10415      368572 :         return 1;
   10416             : 
   10417             : #undef COMPARE
   10418             : }
   10419             : 
   10420             : static int
   10421    69537100 : unicode_compare_eq(PyObject *str1, PyObject *str2)
   10422             : {
   10423             :     int kind;
   10424             :     const void *data1, *data2;
   10425             :     Py_ssize_t len;
   10426             :     int cmp;
   10427             : 
   10428    69537100 :     len = PyUnicode_GET_LENGTH(str1);
   10429    69537100 :     if (PyUnicode_GET_LENGTH(str2) != len)
   10430    21455700 :         return 0;
   10431    48081400 :     kind = PyUnicode_KIND(str1);
   10432    48081400 :     if (PyUnicode_KIND(str2) != kind)
   10433       17725 :         return 0;
   10434    48063600 :     data1 = PyUnicode_DATA(str1);
   10435    48063600 :     data2 = PyUnicode_DATA(str2);
   10436             : 
   10437    48063600 :     cmp = memcmp(data1, data2, len * kind);
   10438    48063600 :     return (cmp == 0);
   10439             : }
   10440             : 
   10441             : int
   10442    64157100 : _PyUnicode_Equal(PyObject *str1, PyObject *str2)
   10443             : {
   10444    64157100 :     assert(PyUnicode_CheckExact(str1));
   10445    64157100 :     assert(PyUnicode_CheckExact(str2));
   10446    64157100 :     if (str1 == str2) {
   10447    21125600 :         return 1;
   10448             :     }
   10449    43031500 :     return unicode_compare_eq(str1, str2);
   10450             : }
   10451             : 
   10452             : 
   10453             : int
   10454     7262160 : PyUnicode_Compare(PyObject *left, PyObject *right)
   10455             : {
   10456     7262160 :     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
   10457             :         /* a string is equal to itself */
   10458     7262160 :         if (left == right)
   10459     6659940 :             return 0;
   10460             : 
   10461      602221 :         return unicode_compare(left, right);
   10462             :     }
   10463           0 :     PyErr_Format(PyExc_TypeError,
   10464             :                  "Can't compare %.100s and %.100s",
   10465           0 :                  Py_TYPE(left)->tp_name,
   10466           0 :                  Py_TYPE(right)->tp_name);
   10467           0 :     return -1;
   10468             : }
   10469             : 
   10470             : int
   10471    14630900 : PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
   10472             : {
   10473             :     Py_ssize_t i;
   10474             :     int kind;
   10475             :     Py_UCS4 chr;
   10476             : 
   10477    14630900 :     assert(_PyUnicode_CHECK(uni));
   10478    14630900 :     kind = PyUnicode_KIND(uni);
   10479    14630900 :     if (kind == PyUnicode_1BYTE_KIND) {
   10480    14630900 :         const void *data = PyUnicode_1BYTE_DATA(uni);
   10481    14630900 :         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
   10482    14630900 :         size_t len, len2 = strlen(str);
   10483             :         int cmp;
   10484             : 
   10485    14630900 :         len = Py_MIN(len1, len2);
   10486    14630900 :         cmp = memcmp(data, str, len);
   10487    14630900 :         if (cmp != 0) {
   10488     8457760 :             if (cmp < 0)
   10489     2161990 :                 return -1;
   10490             :             else
   10491     6295770 :                 return 1;
   10492             :         }
   10493     6173110 :         if (len1 > len2)
   10494           5 :             return 1; /* uni is longer */
   10495     6173110 :         if (len1 < len2)
   10496          24 :             return -1; /* str is longer */
   10497     6173080 :         return 0;
   10498             :     }
   10499             :     else {
   10500           2 :         const void *data = PyUnicode_DATA(uni);
   10501             :         /* Compare Unicode string and source character set string */
   10502           2 :         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
   10503           2 :             if (chr != (unsigned char)str[i])
   10504           2 :                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
   10505             :         /* This check keeps Python strings that end in '\0' from comparing equal
   10506             :          to C strings identical up to that point. */
   10507           0 :         if (PyUnicode_GET_LENGTH(uni) != i || chr)
   10508           0 :             return 1; /* uni is longer */
   10509           0 :         if (str[i])
   10510           0 :             return -1; /* str is longer */
   10511           0 :         return 0;
   10512             :     }
   10513             : }
   10514             : 
   10515             : int
   10516    51805700 : _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
   10517             : {
   10518             :     size_t len;
   10519    51805700 :     assert(_PyUnicode_CHECK(unicode));
   10520    51805700 :     assert(str);
   10521             : #ifndef NDEBUG
   10522   334988000 :     for (const char *p = str; *p; p++) {
   10523   283182000 :         assert((unsigned char)*p < 128);
   10524             :     }
   10525             : #endif
   10526    51805700 :     if (!PyUnicode_IS_ASCII(unicode))
   10527        1738 :         return 0;
   10528    51804000 :     len = (size_t)PyUnicode_GET_LENGTH(unicode);
   10529    59538400 :     return strlen(str) == len &&
   10530     7734440 :            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
   10531             : }
   10532             : 
   10533             : int
   10534           0 : _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
   10535             : {
   10536             :     PyObject *right_uni;
   10537             : 
   10538           0 :     assert(_PyUnicode_CHECK(left));
   10539           0 :     assert(right->string);
   10540             : #ifndef NDEBUG
   10541           0 :     for (const char *p = right->string; *p; p++) {
   10542           0 :         assert((unsigned char)*p < 128);
   10543             :     }
   10544             : #endif
   10545             : 
   10546           0 :     if (!PyUnicode_IS_ASCII(left))
   10547           0 :         return 0;
   10548             : 
   10549           0 :     right_uni = _PyUnicode_FromId(right);       /* borrowed */
   10550           0 :     if (right_uni == NULL) {
   10551             :         /* memory error or bad data */
   10552           0 :         PyErr_Clear();
   10553           0 :         return _PyUnicode_EqualToASCIIString(left, right->string);
   10554             :     }
   10555             : 
   10556           0 :     if (left == right_uni)
   10557           0 :         return 1;
   10558             : 
   10559           0 :     if (PyUnicode_CHECK_INTERNED(left))
   10560           0 :         return 0;
   10561             : 
   10562           0 :     assert(_PyUnicode_HASH(right_uni) != -1);
   10563           0 :     Py_hash_t hash = _PyUnicode_HASH(left);
   10564           0 :     if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
   10565           0 :         return 0;
   10566             :     }
   10567             : 
   10568           0 :     return unicode_compare_eq(left, right_uni);
   10569             : }
   10570             : 
   10571             : PyObject *
   10572    65401300 : PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
   10573             : {
   10574             :     int result;
   10575             : 
   10576    65401300 :     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
   10577     2320810 :         Py_RETURN_NOTIMPLEMENTED;
   10578             : 
   10579    63080500 :     if (left == right) {
   10580     6563040 :         switch (op) {
   10581     5656930 :         case Py_EQ:
   10582             :         case Py_LE:
   10583             :         case Py_GE:
   10584             :             /* a string is equal to itself */
   10585     5656930 :             Py_RETURN_TRUE;
   10586      906113 :         case Py_NE:
   10587             :         case Py_LT:
   10588             :         case Py_GT:
   10589      906113 :             Py_RETURN_FALSE;
   10590           0 :         default:
   10591           0 :             PyErr_BadArgument();
   10592           0 :             return NULL;
   10593             :         }
   10594             :     }
   10595    56517400 :     else if (op == Py_EQ || op == Py_NE) {
   10596    26505600 :         result = unicode_compare_eq(left, right);
   10597    26505600 :         result ^= (op == Py_NE);
   10598    26505600 :         return PyBool_FromLong(result);
   10599             :     }
   10600             :     else {
   10601    30011800 :         result = unicode_compare(left, right);
   10602    30011800 :         Py_RETURN_RICHCOMPARE(result, 0, op);
   10603             :     }
   10604             : }
   10605             : 
   10606             : int
   10607     1200200 : _PyUnicode_EQ(PyObject *aa, PyObject *bb)
   10608             : {
   10609     1200200 :     return unicode_eq(aa, bb);
   10610             : }
   10611             : 
   10612             : int
   10613    29567200 : PyUnicode_Contains(PyObject *str, PyObject *substr)
   10614             : {
   10615             :     int kind1, kind2;
   10616             :     const void *buf1, *buf2;
   10617             :     Py_ssize_t len1, len2;
   10618             :     int result;
   10619             : 
   10620    29567200 :     if (!PyUnicode_Check(substr)) {
   10621           2 :         PyErr_Format(PyExc_TypeError,
   10622             :                      "'in <string>' requires string as left operand, not %.100s",
   10623           2 :                      Py_TYPE(substr)->tp_name);
   10624           2 :         return -1;
   10625             :     }
   10626    29567200 :     if (ensure_unicode(str) < 0)
   10627           0 :         return -1;
   10628             : 
   10629    29567200 :     kind1 = PyUnicode_KIND(str);
   10630    29567200 :     kind2 = PyUnicode_KIND(substr);
   10631    29567200 :     if (kind1 < kind2)
   10632        2080 :         return 0;
   10633    29565200 :     len1 = PyUnicode_GET_LENGTH(str);
   10634    29565200 :     len2 = PyUnicode_GET_LENGTH(substr);
   10635    29565200 :     if (len1 < len2)
   10636      195974 :         return 0;
   10637    29369200 :     buf1 = PyUnicode_DATA(str);
   10638    29369200 :     buf2 = PyUnicode_DATA(substr);
   10639    29369200 :     if (len2 == 1) {
   10640    27842000 :         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
   10641    27842000 :         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
   10642    27842000 :         return result;
   10643             :     }
   10644     1527190 :     if (kind2 != kind1) {
   10645          83 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
   10646          83 :         if (!buf2)
   10647           0 :             return -1;
   10648             :     }
   10649             : 
   10650     1527190 :     switch (kind1) {
   10651     1527090 :     case PyUnicode_1BYTE_KIND:
   10652     1527090 :         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
   10653     1527090 :         break;
   10654          75 :     case PyUnicode_2BYTE_KIND:
   10655          75 :         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
   10656          75 :         break;
   10657          23 :     case PyUnicode_4BYTE_KIND:
   10658          23 :         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
   10659          23 :         break;
   10660           0 :     default:
   10661           0 :         Py_UNREACHABLE();
   10662             :     }
   10663             : 
   10664     1527190 :     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
   10665     1527190 :     if (kind2 != kind1)
   10666          83 :         PyMem_Free((void *)buf2);
   10667             : 
   10668     1527190 :     return result;
   10669             : }
   10670             : 
   10671             : /* Concat to string or Unicode object giving a new Unicode object. */
   10672             : 
   10673             : PyObject *
   10674    29969500 : PyUnicode_Concat(PyObject *left, PyObject *right)
   10675             : {
   10676             :     PyObject *result;
   10677             :     Py_UCS4 maxchar, maxchar2;
   10678             :     Py_ssize_t left_len, right_len, new_len;
   10679             : 
   10680    29969500 :     if (ensure_unicode(left) < 0)
   10681           0 :         return NULL;
   10682             : 
   10683    29969500 :     if (!PyUnicode_Check(right)) {
   10684          93 :         PyErr_Format(PyExc_TypeError,
   10685             :                      "can only concatenate str (not \"%.200s\") to str",
   10686          93 :                      Py_TYPE(right)->tp_name);
   10687          93 :         return NULL;
   10688             :     }
   10689             : 
   10690             :     /* Shortcuts */
   10691    29969400 :     PyObject *empty = unicode_get_empty();  // Borrowed reference
   10692    29969400 :     if (left == empty) {
   10693      720259 :         return PyUnicode_FromObject(right);
   10694             :     }
   10695    29249100 :     if (right == empty) {
   10696      505230 :         return PyUnicode_FromObject(left);
   10697             :     }
   10698             : 
   10699    28743900 :     left_len = PyUnicode_GET_LENGTH(left);
   10700    28743900 :     right_len = PyUnicode_GET_LENGTH(right);
   10701    28743900 :     if (left_len > PY_SSIZE_T_MAX - right_len) {
   10702           0 :         PyErr_SetString(PyExc_OverflowError,
   10703             :                         "strings are too large to concat");
   10704           0 :         return NULL;
   10705             :     }
   10706    28743900 :     new_len = left_len + right_len;
   10707             : 
   10708    28743900 :     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   10709    28743900 :     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   10710    28743900 :     maxchar = Py_MAX(maxchar, maxchar2);
   10711             : 
   10712             :     /* Concat the two Unicode strings */
   10713    28743900 :     result = PyUnicode_New(new_len, maxchar);
   10714    28743900 :     if (result == NULL)
   10715           0 :         return NULL;
   10716    28743900 :     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
   10717    28743900 :     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
   10718    28743900 :     assert(_PyUnicode_CheckConsistency(result, 1));
   10719    28743900 :     return result;
   10720             : }
   10721             : 
   10722             : void
   10723     3891540 : PyUnicode_Append(PyObject **p_left, PyObject *right)
   10724             : {
   10725             :     PyObject *left, *res;
   10726             :     Py_UCS4 maxchar, maxchar2;
   10727             :     Py_ssize_t left_len, right_len, new_len;
   10728             : 
   10729     3891540 :     if (p_left == NULL) {
   10730           0 :         if (!PyErr_Occurred())
   10731           0 :             PyErr_BadInternalCall();
   10732           0 :         return;
   10733             :     }
   10734     3891540 :     left = *p_left;
   10735     3891540 :     if (right == NULL || left == NULL
   10736     3891540 :         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
   10737           0 :         if (!PyErr_Occurred())
   10738           0 :             PyErr_BadInternalCall();
   10739           0 :         goto error;
   10740             :     }
   10741             : 
   10742             :     /* Shortcuts */
   10743     3891540 :     PyObject *empty = unicode_get_empty();  // Borrowed reference
   10744     3891540 :     if (left == empty) {
   10745      306928 :         Py_DECREF(left);
   10746      306928 :         Py_INCREF(right);
   10747      306928 :         *p_left = right;
   10748      306928 :         return;
   10749             :     }
   10750     3584610 :     if (right == empty) {
   10751       29028 :         return;
   10752             :     }
   10753             : 
   10754     3555580 :     left_len = PyUnicode_GET_LENGTH(left);
   10755     3555580 :     right_len = PyUnicode_GET_LENGTH(right);
   10756     3555580 :     if (left_len > PY_SSIZE_T_MAX - right_len) {
   10757           0 :         PyErr_SetString(PyExc_OverflowError,
   10758             :                         "strings are too large to concat");
   10759           0 :         goto error;
   10760             :     }
   10761     3555580 :     new_len = left_len + right_len;
   10762             : 
   10763     3555580 :     if (unicode_modifiable(left)
   10764     1403580 :         && PyUnicode_CheckExact(right)
   10765     1403580 :         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
   10766             :         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
   10767             :            to change the structure size, but characters are stored just after
   10768             :            the structure, and so it requires to move all characters which is
   10769             :            not so different than duplicating the string. */
   10770     1403180 :         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
   10771             :     {
   10772             :         /* append inplace */
   10773     1402650 :         if (unicode_resize(p_left, new_len) != 0)
   10774           0 :             goto error;
   10775             : 
   10776             :         /* copy 'right' into the newly allocated area of 'left' */
   10777     1402650 :         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
   10778             :     }
   10779             :     else {
   10780     2152940 :         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   10781     2152940 :         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   10782     2152940 :         maxchar = Py_MAX(maxchar, maxchar2);
   10783             : 
   10784             :         /* Concat the two Unicode strings */
   10785     2152940 :         res = PyUnicode_New(new_len, maxchar);
   10786     2152940 :         if (res == NULL)
   10787           0 :             goto error;
   10788     2152940 :         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
   10789     2152940 :         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
   10790     2152940 :         Py_DECREF(left);
   10791     2152940 :         *p_left = res;
   10792             :     }
   10793     3555580 :     assert(_PyUnicode_CheckConsistency(*p_left, 1));
   10794     3555580 :     return;
   10795             : 
   10796           0 : error:
   10797           0 :     Py_CLEAR(*p_left);
   10798             : }
   10799             : 
   10800             : void
   10801       83705 : PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
   10802             : {
   10803       83705 :     PyUnicode_Append(pleft, right);
   10804       83705 :     Py_XDECREF(right);
   10805       83705 : }
   10806             : 
   10807             : /*
   10808             : Wraps asciilib_parse_args_finds() and additionally ensures that the
   10809             : first argument is a unicode object.
   10810             : */
   10811             : 
   10812             : static inline int
   10813     2893870 : parse_args_finds_unicode(const char * function_name, PyObject *args,
   10814             :                          PyObject **substring,
   10815             :                          Py_ssize_t *start, Py_ssize_t *end)
   10816             : {
   10817     2893870 :     if (asciilib_parse_args_finds(function_name, args, substring, start, end)) {
   10818     2893850 :         if (ensure_unicode(*substring) < 0)
   10819          11 :             return 0;
   10820     2893840 :         return 1;
   10821             :     }
   10822          16 :     return 0;
   10823             : }
   10824             : 
   10825             : PyDoc_STRVAR(count__doc__,
   10826             :              "S.count(sub[, start[, end]]) -> int\n\
   10827             : \n\
   10828             : Return the number of non-overlapping occurrences of substring sub in\n\
   10829             : string S[start:end].  Optional arguments start and end are\n\
   10830             : interpreted as in slice notation.");
   10831             : 
   10832             : static PyObject *
   10833      536482 : unicode_count(PyObject *self, PyObject *args)
   10834             : {
   10835      536482 :     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
   10836      536482 :     Py_ssize_t start = 0;
   10837      536482 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   10838             :     PyObject *result;
   10839             :     int kind1, kind2;
   10840             :     const void *buf1, *buf2;
   10841             :     Py_ssize_t len1, len2, iresult;
   10842             : 
   10843      536482 :     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
   10844           5 :         return NULL;
   10845             : 
   10846      536477 :     kind1 = PyUnicode_KIND(self);
   10847      536477 :     kind2 = PyUnicode_KIND(substring);
   10848      536477 :     if (kind1 < kind2)
   10849           6 :         return PyLong_FromLong(0);
   10850             : 
   10851      536471 :     len1 = PyUnicode_GET_LENGTH(self);
   10852      536471 :     len2 = PyUnicode_GET_LENGTH(substring);
   10853      536471 :     ADJUST_INDICES(start, end, len1);
   10854      536471 :     if (end - start < len2)
   10855      104561 :         return PyLong_FromLong(0);
   10856             : 
   10857      431910 :     buf1 = PyUnicode_DATA(self);
   10858      431910 :     buf2 = PyUnicode_DATA(substring);
   10859      431910 :     if (kind2 != kind1) {
   10860       11889 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
   10861       11889 :         if (!buf2)
   10862           0 :             return NULL;
   10863             :     }
   10864      431910 :     switch (kind1) {
   10865      420017 :     case PyUnicode_1BYTE_KIND:
   10866      420017 :         iresult = ucs1lib_count(
   10867             :             ((const Py_UCS1*)buf1) + start, end - start,
   10868             :             buf2, len2, PY_SSIZE_T_MAX
   10869             :             );
   10870      420017 :         break;
   10871       11883 :     case PyUnicode_2BYTE_KIND:
   10872       11883 :         iresult = ucs2lib_count(
   10873       11883 :             ((const Py_UCS2*)buf1) + start, end - start,
   10874             :             buf2, len2, PY_SSIZE_T_MAX
   10875             :             );
   10876       11883 :         break;
   10877          10 :     case PyUnicode_4BYTE_KIND:
   10878          10 :         iresult = ucs4lib_count(
   10879          10 :             ((const Py_UCS4*)buf1) + start, end - start,
   10880             :             buf2, len2, PY_SSIZE_T_MAX
   10881             :             );
   10882          10 :         break;
   10883           0 :     default:
   10884           0 :         Py_UNREACHABLE();
   10885             :     }
   10886             : 
   10887      431910 :     result = PyLong_FromSsize_t(iresult);
   10888             : 
   10889      431910 :     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
   10890      431910 :     if (kind2 != kind1)
   10891       11889 :         PyMem_Free((void *)buf2);
   10892             : 
   10893      431910 :     return result;
   10894             : }
   10895             : 
   10896             : /*[clinic input]
   10897             : str.encode as unicode_encode
   10898             : 
   10899             :     encoding: str(c_default="NULL") = 'utf-8'
   10900             :         The encoding in which to encode the string.
   10901             :     errors: str(c_default="NULL") = 'strict'
   10902             :         The error handling scheme to use for encoding errors.
   10903             :         The default is 'strict' meaning that encoding errors raise a
   10904             :         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
   10905             :         'xmlcharrefreplace' as well as any other name registered with
   10906             :         codecs.register_error that can handle UnicodeEncodeErrors.
   10907             : 
   10908             : Encode the string using the codec registered for encoding.
   10909             : [clinic start generated code]*/
   10910             : 
   10911             : static PyObject *
   10912     5307600 : unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
   10913             : /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
   10914             : {
   10915     5307600 :     return PyUnicode_AsEncodedString(self, encoding, errors);
   10916             : }
   10917             : 
   10918             : /*[clinic input]
   10919             : str.expandtabs as unicode_expandtabs
   10920             : 
   10921             :     tabsize: int = 8
   10922             : 
   10923             : Return a copy where all tab characters are expanded using spaces.
   10924             : 
   10925             : If tabsize is not given, a tab size of 8 characters is assumed.
   10926             : [clinic start generated code]*/
   10927             : 
   10928             : static PyObject *
   10929       56851 : unicode_expandtabs_impl(PyObject *self, int tabsize)
   10930             : /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
   10931             : {
   10932             :     Py_ssize_t i, j, line_pos, src_len, incr;
   10933             :     Py_UCS4 ch;
   10934             :     PyObject *u;
   10935             :     const void *src_data;
   10936             :     void *dest_data;
   10937             :     int kind;
   10938             :     int found;
   10939             : 
   10940             :     /* First pass: determine size of output string */
   10941       56851 :     src_len = PyUnicode_GET_LENGTH(self);
   10942       56851 :     i = j = line_pos = 0;
   10943       56851 :     kind = PyUnicode_KIND(self);
   10944       56851 :     src_data = PyUnicode_DATA(self);
   10945       56851 :     found = 0;
   10946     3292480 :     for (; i < src_len; i++) {
   10947     3235630 :         ch = PyUnicode_READ(kind, src_data, i);
   10948     3235630 :         if (ch == '\t') {
   10949         797 :             found = 1;
   10950         797 :             if (tabsize > 0) {
   10951         797 :                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
   10952         797 :                 if (j > PY_SSIZE_T_MAX - incr)
   10953           0 :                     goto overflow;
   10954         797 :                 line_pos += incr;
   10955         797 :                 j += incr;
   10956             :             }
   10957             :         }
   10958             :         else {
   10959     3234840 :             if (j > PY_SSIZE_T_MAX - 1)
   10960           0 :                 goto overflow;
   10961     3234840 :             line_pos++;
   10962     3234840 :             j++;
   10963     3234840 :             if (ch == '\n' || ch == '\r')
   10964       89605 :                 line_pos = 0;
   10965             :         }
   10966             :     }
   10967       56851 :     if (!found)
   10968       56779 :         return unicode_result_unchanged(self);
   10969             : 
   10970             :     /* Second pass: create output string and fill it */
   10971          72 :     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
   10972          72 :     if (!u)
   10973           0 :         return NULL;
   10974          72 :     dest_data = PyUnicode_DATA(u);
   10975             : 
   10976          72 :     i = j = line_pos = 0;
   10977             : 
   10978        7049 :     for (; i < src_len; i++) {
   10979        6977 :         ch = PyUnicode_READ(kind, src_data, i);
   10980        6977 :         if (ch == '\t') {
   10981         797 :             if (tabsize > 0) {
   10982         797 :                 incr = tabsize - (line_pos % tabsize);
   10983         797 :                 line_pos += incr;
   10984         797 :                 unicode_fill(kind, dest_data, ' ', j, incr);
   10985         797 :                 j += incr;
   10986             :             }
   10987             :         }
   10988             :         else {
   10989        6180 :             line_pos++;
   10990        6180 :             PyUnicode_WRITE(kind, dest_data, j, ch);
   10991        6180 :             j++;
   10992        6180 :             if (ch == '\n' || ch == '\r')
   10993         118 :                 line_pos = 0;
   10994             :         }
   10995             :     }
   10996          72 :     assert (j == PyUnicode_GET_LENGTH(u));
   10997          72 :     return unicode_result(u);
   10998             : 
   10999           0 :   overflow:
   11000           0 :     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   11001           0 :     return NULL;
   11002             : }
   11003             : 
   11004             : PyDoc_STRVAR(find__doc__,
   11005             :              "S.find(sub[, start[, end]]) -> int\n\
   11006             : \n\
   11007             : Return the lowest index in S where substring sub is found,\n\
   11008             : such that sub is contained within S[start:end].  Optional\n\
   11009             : arguments start and end are interpreted as in slice notation.\n\
   11010             : \n\
   11011             : Return -1 on failure.");
   11012             : 
   11013             : static PyObject *
   11014      872222 : unicode_find(PyObject *self, PyObject *args)
   11015             : {
   11016             :     /* initialize variables to prevent gcc warning */
   11017      872222 :     PyObject *substring = NULL;
   11018      872222 :     Py_ssize_t start = 0;
   11019      872222 :     Py_ssize_t end = 0;
   11020             :     Py_ssize_t result;
   11021             : 
   11022      872222 :     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
   11023           7 :         return NULL;
   11024             : 
   11025      872215 :     result = any_find_slice(self, substring, start, end, 1);
   11026             : 
   11027      872215 :     if (result == -2)
   11028           0 :         return NULL;
   11029             : 
   11030      872215 :     return PyLong_FromSsize_t(result);
   11031             : }
   11032             : 
   11033             : static PyObject *
   11034    55387500 : unicode_getitem(PyObject *self, Py_ssize_t index)
   11035             : {
   11036             :     const void *data;
   11037             :     int kind;
   11038             :     Py_UCS4 ch;
   11039             : 
   11040    55387500 :     if (!PyUnicode_Check(self)) {
   11041           0 :         PyErr_BadArgument();
   11042           0 :         return NULL;
   11043             :     }
   11044    55387500 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
   11045       56334 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
   11046       56334 :         return NULL;
   11047             :     }
   11048    55331200 :     kind = PyUnicode_KIND(self);
   11049    55331200 :     data = PyUnicode_DATA(self);
   11050    55331200 :     ch = PyUnicode_READ(kind, data, index);
   11051    55331200 :     return unicode_char(ch);
   11052             : }
   11053             : 
   11054             : /* Believe it or not, this produces the same value for ASCII strings
   11055             :    as bytes_hash(). */
   11056             : static Py_hash_t
   11057   197312000 : unicode_hash(PyObject *self)
   11058             : {
   11059             :     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
   11060             : 
   11061             : #ifdef Py_DEBUG
   11062   197312000 :     assert(_Py_HashSecret_Initialized);
   11063             : #endif
   11064   197312000 :     if (_PyUnicode_HASH(self) != -1)
   11065    74337400 :         return _PyUnicode_HASH(self);
   11066             : 
   11067   122974000 :     x = _Py_HashBytes(PyUnicode_DATA(self),
   11068   122974000 :                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
   11069   122974000 :     _PyUnicode_HASH(self) = x;
   11070   122974000 :     return x;
   11071             : }
   11072             : 
   11073             : PyDoc_STRVAR(index__doc__,
   11074             :              "S.index(sub[, start[, end]]) -> int\n\
   11075             : \n\
   11076             : Return the lowest index in S where substring sub is found,\n\
   11077             : such that sub is contained within S[start:end].  Optional\n\
   11078             : arguments start and end are interpreted as in slice notation.\n\
   11079             : \n\
   11080             : Raises ValueError when the substring is not found.");
   11081             : 
   11082             : static PyObject *
   11083        5486 : unicode_index(PyObject *self, PyObject *args)
   11084             : {
   11085             :     /* initialize variables to prevent gcc warning */
   11086             :     Py_ssize_t result;
   11087        5486 :     PyObject *substring = NULL;
   11088        5486 :     Py_ssize_t start = 0;
   11089        5486 :     Py_ssize_t end = 0;
   11090             : 
   11091        5486 :     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
   11092           5 :         return NULL;
   11093             : 
   11094        5481 :     result = any_find_slice(self, substring, start, end, 1);
   11095             : 
   11096        5481 :     if (result == -2)
   11097           0 :         return NULL;
   11098             : 
   11099        5481 :     if (result < 0) {
   11100        1429 :         PyErr_SetString(PyExc_ValueError, "substring not found");
   11101        1429 :         return NULL;
   11102             :     }
   11103             : 
   11104        4052 :     return PyLong_FromSsize_t(result);
   11105             : }
   11106             : 
   11107             : /*[clinic input]
   11108             : str.isascii as unicode_isascii
   11109             : 
   11110             : Return True if all characters in the string are ASCII, False otherwise.
   11111             : 
   11112             : ASCII characters have code points in the range U+0000-U+007F.
   11113             : Empty string is ASCII too.
   11114             : [clinic start generated code]*/
   11115             : 
   11116             : static PyObject *
   11117       39854 : unicode_isascii_impl(PyObject *self)
   11118             : /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
   11119             : {
   11120       39854 :     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
   11121             : }
   11122             : 
   11123             : /*[clinic input]
   11124             : str.islower as unicode_islower
   11125             : 
   11126             : Return True if the string is a lowercase string, False otherwise.
   11127             : 
   11128             : A string is lowercase if all cased characters in the string are lowercase and
   11129             : there is at least one cased character in the string.
   11130             : [clinic start generated code]*/
   11131             : 
   11132             : static PyObject *
   11133     2233430 : unicode_islower_impl(PyObject *self)
   11134             : /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
   11135             : {
   11136             :     Py_ssize_t i, length;
   11137             :     int kind;
   11138             :     const void *data;
   11139             :     int cased;
   11140             : 
   11141     2233430 :     length = PyUnicode_GET_LENGTH(self);
   11142     2233430 :     kind = PyUnicode_KIND(self);
   11143     2233430 :     data = PyUnicode_DATA(self);
   11144             : 
   11145             :     /* Shortcut for single character strings */
   11146     2233430 :     if (length == 1)
   11147     1115020 :         return PyBool_FromLong(
   11148     1115020 :             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
   11149             : 
   11150             :     /* Special case for empty strings */
   11151     1118410 :     if (length == 0)
   11152           2 :         Py_RETURN_FALSE;
   11153             : 
   11154     1118410 :     cased = 0;
   11155     5604060 :     for (i = 0; i < length; i++) {
   11156     4488230 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11157             : 
   11158     4488230 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   11159        2578 :             Py_RETURN_FALSE;
   11160     4485650 :         else if (!cased && Py_UNICODE_ISLOWER(ch))
   11161     1115900 :             cased = 1;
   11162             :     }
   11163     1115830 :     return PyBool_FromLong(cased);
   11164             : }
   11165             : 
   11166             : /*[clinic input]
   11167             : str.isupper as unicode_isupper
   11168             : 
   11169             : Return True if the string is an uppercase string, False otherwise.
   11170             : 
   11171             : A string is uppercase if all cased characters in the string are uppercase and
   11172             : there is at least one cased character in the string.
   11173             : [clinic start generated code]*/
   11174             : 
   11175             : static PyObject *
   11176     3595200 : unicode_isupper_impl(PyObject *self)
   11177             : /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
   11178             : {
   11179             :     Py_ssize_t i, length;
   11180             :     int kind;
   11181             :     const void *data;
   11182             :     int cased;
   11183             : 
   11184     3595200 :     length = PyUnicode_GET_LENGTH(self);
   11185     3595200 :     kind = PyUnicode_KIND(self);
   11186     3595200 :     data = PyUnicode_DATA(self);
   11187             : 
   11188             :     /* Shortcut for single character strings */
   11189     3595200 :     if (length == 1)
   11190     1116190 :         return PyBool_FromLong(
   11191     1116190 :             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
   11192             : 
   11193             :     /* Special case for empty strings */
   11194     2479010 :     if (length == 0)
   11195           2 :         Py_RETURN_FALSE;
   11196             : 
   11197     2479010 :     cased = 0;
   11198    21729700 :     for (i = 0; i < length; i++) {
   11199    19475500 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11200             : 
   11201    19475500 :         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   11202      224833 :             Py_RETURN_FALSE;
   11203    19250700 :         else if (!cased && Py_UNICODE_ISUPPER(ch))
   11204     2274270 :             cased = 1;
   11205             :     }
   11206     2254180 :     return PyBool_FromLong(cased);
   11207             : }
   11208             : 
   11209             : /*[clinic input]
   11210             : str.istitle as unicode_istitle
   11211             : 
   11212             : Return True if the string is a title-cased string, False otherwise.
   11213             : 
   11214             : In a title-cased string, upper- and title-case characters may only
   11215             : follow uncased characters and lowercase characters only cased ones.
   11216             : [clinic start generated code]*/
   11217             : 
   11218             : static PyObject *
   11219     2228270 : unicode_istitle_impl(PyObject *self)
   11220             : /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
   11221             : {
   11222             :     Py_ssize_t i, length;
   11223             :     int kind;
   11224             :     const void *data;
   11225             :     int cased, previous_is_cased;
   11226             : 
   11227     2228270 :     length = PyUnicode_GET_LENGTH(self);
   11228     2228270 :     kind = PyUnicode_KIND(self);
   11229     2228270 :     data = PyUnicode_DATA(self);
   11230             : 
   11231             :     /* Shortcut for single character strings */
   11232     2228270 :     if (length == 1) {
   11233     1114130 :         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11234     2228220 :         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
   11235     1114100 :                                (Py_UNICODE_ISUPPER(ch) != 0));
   11236             :     }
   11237             : 
   11238             :     /* Special case for empty strings */
   11239     1114140 :     if (length == 0)
   11240           2 :         Py_RETURN_FALSE;
   11241             : 
   11242     1114140 :     cased = 0;
   11243     1114140 :     previous_is_cased = 0;
   11244     2247370 :     for (i = 0; i < length; i++) {
   11245     2245370 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11246             : 
   11247     2245370 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   11248        2030 :             if (previous_is_cased)
   11249           3 :                 Py_RETURN_FALSE;
   11250        2027 :             previous_is_cased = 1;
   11251        2027 :             cased = 1;
   11252             :         }
   11253     2243340 :         else if (Py_UNICODE_ISLOWER(ch)) {
   11254     1133630 :             if (!previous_is_cased)
   11255     1112140 :                 Py_RETURN_FALSE;
   11256       21487 :             previous_is_cased = 1;
   11257       21487 :             cased = 1;
   11258             :         }
   11259             :         else
   11260     1109720 :             previous_is_cased = 0;
   11261             :     }
   11262        1999 :     return PyBool_FromLong(cased);
   11263             : }
   11264             : 
   11265             : /*[clinic input]
   11266             : str.isspace as unicode_isspace
   11267             : 
   11268             : Return True if the string is a whitespace string, False otherwise.
   11269             : 
   11270             : A string is whitespace if all characters in the string are whitespace and there
   11271             : is at least one character in the string.
   11272             : [clinic start generated code]*/
   11273             : 
   11274             : static PyObject *
   11275     3418290 : unicode_isspace_impl(PyObject *self)
   11276             : /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
   11277             : {
   11278             :     Py_ssize_t i, length;
   11279             :     int kind;
   11280             :     const void *data;
   11281             : 
   11282     3418290 :     length = PyUnicode_GET_LENGTH(self);
   11283     3418290 :     kind = PyUnicode_KIND(self);
   11284     3418290 :     data = PyUnicode_DATA(self);
   11285             : 
   11286             :     /* Shortcut for single character strings */
   11287     3418290 :     if (length == 1)
   11288     2294760 :         return PyBool_FromLong(
   11289     2294760 :             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
   11290             : 
   11291             :     /* Special case for empty strings */
   11292     1123520 :     if (length == 0)
   11293        8693 :         Py_RETURN_FALSE;
   11294             : 
   11295     1127920 :     for (i = 0; i < length; i++) {
   11296     1127880 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11297     1127880 :         if (!Py_UNICODE_ISSPACE(ch))
   11298     1114800 :             Py_RETURN_FALSE;
   11299             :     }
   11300          33 :     Py_RETURN_TRUE;
   11301             : }
   11302             : 
   11303             : /*[clinic input]
   11304             : str.isalpha as unicode_isalpha
   11305             : 
   11306             : Return True if the string is an alphabetic string, False otherwise.
   11307             : 
   11308             : A string is alphabetic if all characters in the string are alphabetic and there
   11309             : is at least one character in the string.
   11310             : [clinic start generated code]*/
   11311             : 
   11312             : static PyObject *
   11313     2274480 : unicode_isalpha_impl(PyObject *self)
   11314             : /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
   11315             : {
   11316             :     Py_ssize_t i, length;
   11317             :     int kind;
   11318             :     const void *data;
   11319             : 
   11320     2274480 :     length = PyUnicode_GET_LENGTH(self);
   11321     2274480 :     kind = PyUnicode_KIND(self);
   11322     2274480 :     data = PyUnicode_DATA(self);
   11323             : 
   11324             :     /* Shortcut for single character strings */
   11325     2274480 :     if (length == 1)
   11326     1160350 :         return PyBool_FromLong(
   11327     1160350 :             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
   11328             : 
   11329             :     /* Special case for empty strings */
   11330     1114130 :     if (length == 0)
   11331           2 :         Py_RETURN_FALSE;
   11332             : 
   11333     1651480 :     for (i = 0; i < length; i++) {
   11334     1519720 :         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
   11335      982368 :             Py_RETURN_FALSE;
   11336             :     }
   11337      131760 :     Py_RETURN_TRUE;
   11338             : }
   11339             : 
   11340             : /*[clinic input]
   11341             : str.isalnum as unicode_isalnum
   11342             : 
   11343             : Return True if the string is an alpha-numeric string, False otherwise.
   11344             : 
   11345             : A string is alpha-numeric if all characters in the string are alpha-numeric and
   11346             : there is at least one character in the string.
   11347             : [clinic start generated code]*/
   11348             : 
   11349             : static PyObject *
   11350     2271170 : unicode_isalnum_impl(PyObject *self)
   11351             : /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
   11352             : {
   11353             :     int kind;
   11354             :     const void *data;
   11355             :     Py_ssize_t len, i;
   11356             : 
   11357     2271170 :     kind = PyUnicode_KIND(self);
   11358     2271170 :     data = PyUnicode_DATA(self);
   11359     2271170 :     len = PyUnicode_GET_LENGTH(self);
   11360             : 
   11361             :     /* Shortcut for single character strings */
   11362     2271170 :     if (len == 1) {
   11363     1156950 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11364     1156950 :         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
   11365             :     }
   11366             : 
   11367             :     /* Special case for empty strings */
   11368     1114220 :     if (len == 0)
   11369           4 :         Py_RETURN_FALSE;
   11370             : 
   11371     1659280 :     for (i = 0; i < len; i++) {
   11372     1525640 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11373     1525640 :         if (!Py_UNICODE_ISALNUM(ch))
   11374      980579 :             Py_RETURN_FALSE;
   11375             :     }
   11376      133637 :     Py_RETURN_TRUE;
   11377             : }
   11378             : 
   11379             : /*[clinic input]
   11380             : str.isdecimal as unicode_isdecimal
   11381             : 
   11382             : Return True if the string is a decimal string, False otherwise.
   11383             : 
   11384             : A string is a decimal string if all characters in the string are decimal and
   11385             : there is at least one character in the string.
   11386             : [clinic start generated code]*/
   11387             : 
   11388             : static PyObject *
   11389     2228410 : unicode_isdecimal_impl(PyObject *self)
   11390             : /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
   11391             : {
   11392             :     Py_ssize_t i, length;
   11393             :     int kind;
   11394             :     const void *data;
   11395             : 
   11396     2228410 :     length = PyUnicode_GET_LENGTH(self);
   11397     2228410 :     kind = PyUnicode_KIND(self);
   11398     2228410 :     data = PyUnicode_DATA(self);
   11399             : 
   11400             :     /* Shortcut for single character strings */
   11401     2228410 :     if (length == 1)
   11402     1114250 :         return PyBool_FromLong(
   11403     1114250 :             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
   11404             : 
   11405             :     /* Special case for empty strings */
   11406     1114160 :     if (length == 0)
   11407           1 :         Py_RETURN_FALSE;
   11408             : 
   11409     1116860 :     for (i = 0; i < length; i++) {
   11410     1116190 :         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
   11411     1113490 :             Py_RETURN_FALSE;
   11412             :     }
   11413         670 :     Py_RETURN_TRUE;
   11414             : }
   11415             : 
   11416             : /*[clinic input]
   11417             : str.isdigit as unicode_isdigit
   11418             : 
   11419             : Return True if the string is a digit string, False otherwise.
   11420             : 
   11421             : A string is a digit string if all characters in the string are digits and there
   11422             : is at least one character in the string.
   11423             : [clinic start generated code]*/
   11424             : 
   11425             : static PyObject *
   11426     2250460 : unicode_isdigit_impl(PyObject *self)
   11427             : /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
   11428             : {
   11429             :     Py_ssize_t i, length;
   11430             :     int kind;
   11431             :     const void *data;
   11432             : 
   11433     2250460 :     length = PyUnicode_GET_LENGTH(self);
   11434     2250460 :     kind = PyUnicode_KIND(self);
   11435     2250460 :     data = PyUnicode_DATA(self);
   11436             : 
   11437             :     /* Shortcut for single character strings */
   11438     2250460 :     if (length == 1) {
   11439     1133870 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11440     1133870 :         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
   11441             :     }
   11442             : 
   11443             :     /* Special case for empty strings */
   11444     1116600 :     if (length == 0)
   11445           6 :         Py_RETURN_FALSE;
   11446             : 
   11447     1134950 :     for (i = 0; i < length; i++) {
   11448     1132480 :         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
   11449     1114120 :             Py_RETURN_FALSE;
   11450             :     }
   11451        2469 :     Py_RETURN_TRUE;
   11452             : }
   11453             : 
   11454             : /*[clinic input]
   11455             : str.isnumeric as unicode_isnumeric
   11456             : 
   11457             : Return True if the string is a numeric string, False otherwise.
   11458             : 
   11459             : A string is numeric if all characters in the string are numeric and there is at
   11460             : least one character in the string.
   11461             : [clinic start generated code]*/
   11462             : 
   11463             : static PyObject *
   11464     2228250 : unicode_isnumeric_impl(PyObject *self)
   11465             : /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
   11466             : {
   11467             :     Py_ssize_t i, length;
   11468             :     int kind;
   11469             :     const void *data;
   11470             : 
   11471     2228250 :     length = PyUnicode_GET_LENGTH(self);
   11472     2228250 :     kind = PyUnicode_KIND(self);
   11473     2228250 :     data = PyUnicode_DATA(self);
   11474             : 
   11475             :     /* Shortcut for single character strings */
   11476     2228250 :     if (length == 1)
   11477     1114130 :         return PyBool_FromLong(
   11478     1114130 :             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
   11479             : 
   11480             :     /* Special case for empty strings */
   11481     1114120 :     if (length == 0)
   11482           1 :         Py_RETURN_FALSE;
   11483             : 
   11484     1121630 :     for (i = 0; i < length; i++) {
   11485     1119760 :         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
   11486     1112250 :             Py_RETURN_FALSE;
   11487             :     }
   11488        1874 :     Py_RETURN_TRUE;
   11489             : }
   11490             : 
   11491             : Py_ssize_t
   11492     6121200 : _PyUnicode_ScanIdentifier(PyObject *self)
   11493             : {
   11494             :     Py_ssize_t i;
   11495     6121200 :     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
   11496     6121200 :     if (len == 0) {
   11497             :         /* an empty string is not a valid identifier */
   11498          13 :         return 0;
   11499             :     }
   11500             : 
   11501     6121190 :     int kind = PyUnicode_KIND(self);
   11502     6121190 :     const void *data = PyUnicode_DATA(self);
   11503     6121190 :     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11504             :     /* PEP 3131 says that the first character must be in
   11505             :        XID_Start and subsequent characters in XID_Continue,
   11506             :        and for the ASCII range, the 2.x rules apply (i.e
   11507             :        start with letters and underscore, continue with
   11508             :        letters, digits, underscore). However, given the current
   11509             :        definition of XID_Start and XID_Continue, it is sufficient
   11510             :        to check just for these, except that _ must be allowed
   11511             :        as starting an identifier.  */
   11512     6121190 :     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
   11513     3294640 :         return 0;
   11514             :     }
   11515             : 
   11516     5057580 :     for (i = 1; i < len; i++) {
   11517     2231550 :         ch = PyUnicode_READ(kind, data, i);
   11518     2231550 :         if (!_PyUnicode_IsXidContinue(ch)) {
   11519         519 :             return i;
   11520             :         }
   11521             :     }
   11522     2826020 :     return i;
   11523             : }
   11524             : 
   11525             : int
   11526     6120990 : PyUnicode_IsIdentifier(PyObject *self)
   11527             : {
   11528     6120990 :     Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
   11529     6120990 :     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
   11530             :     /* an empty string is not a valid identifier */
   11531     6120990 :     return len && i == len;
   11532             : }
   11533             : 
   11534             : /*[clinic input]
   11535             : str.isidentifier as unicode_isidentifier
   11536             : 
   11537             : Return True if the string is a valid Python identifier, False otherwise.
   11538             : 
   11539             : Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
   11540             : such as "def" or "class".
   11541             : [clinic start generated code]*/
   11542             : 
   11543             : static PyObject *
   11544     5976600 : unicode_isidentifier_impl(PyObject *self)
   11545             : /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
   11546             : {
   11547     5976600 :     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
   11548             : }
   11549             : 
   11550             : /*[clinic input]
   11551             : str.isprintable as unicode_isprintable
   11552             : 
   11553             : Return True if the string is printable, False otherwise.
   11554             : 
   11555             : A string is printable if all of its characters are considered printable in
   11556             : repr() or if it is empty.
   11557             : [clinic start generated code]*/
   11558             : 
   11559             : static PyObject *
   11560     1422190 : unicode_isprintable_impl(PyObject *self)
   11561             : /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
   11562             : {
   11563             :     Py_ssize_t i, length;
   11564             :     int kind;
   11565             :     const void *data;
   11566             : 
   11567     1422190 :     length = PyUnicode_GET_LENGTH(self);
   11568     1422190 :     kind = PyUnicode_KIND(self);
   11569     1422190 :     data = PyUnicode_DATA(self);
   11570             : 
   11571             :     /* Shortcut for single character strings */
   11572     1422190 :     if (length == 1)
   11573     1422180 :         return PyBool_FromLong(
   11574     1422180 :             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
   11575             : 
   11576          27 :     for (i = 0; i < length; i++) {
   11577          25 :         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
   11578           7 :             Py_RETURN_FALSE;
   11579             :         }
   11580             :     }
   11581           2 :     Py_RETURN_TRUE;
   11582             : }
   11583             : 
   11584             : /*[clinic input]
   11585             : str.join as unicode_join
   11586             : 
   11587             :     iterable: object
   11588             :     /
   11589             : 
   11590             : Concatenate any number of strings.
   11591             : 
   11592             : The string whose method is called is inserted in between each given string.
   11593             : The result is returned as a new string.
   11594             : 
   11595             : Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
   11596             : [clinic start generated code]*/
   11597             : 
   11598             : static PyObject *
   11599     6856160 : unicode_join(PyObject *self, PyObject *iterable)
   11600             : /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
   11601             : {
   11602     6856160 :     return PyUnicode_Join(self, iterable);
   11603             : }
   11604             : 
   11605             : static Py_ssize_t
   11606    25316800 : unicode_length(PyObject *self)
   11607             : {
   11608    25316800 :     return PyUnicode_GET_LENGTH(self);
   11609             : }
   11610             : 
   11611             : /*[clinic input]
   11612             : str.ljust as unicode_ljust
   11613             : 
   11614             :     width: Py_ssize_t
   11615             :     fillchar: Py_UCS4 = ' '
   11616             :     /
   11617             : 
   11618             : Return a left-justified string of length width.
   11619             : 
   11620             : Padding is done using the specified fill character (default is a space).
   11621             : [clinic start generated code]*/
   11622             : 
   11623             : static PyObject *
   11624       25593 : unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   11625             : /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
   11626             : {
   11627       25593 :     if (PyUnicode_GET_LENGTH(self) >= width)
   11628         110 :         return unicode_result_unchanged(self);
   11629             : 
   11630       25483 :     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
   11631             : }
   11632             : 
   11633             : /*[clinic input]
   11634             : str.lower as unicode_lower
   11635             : 
   11636             : Return a copy of the string converted to lowercase.
   11637             : [clinic start generated code]*/
   11638             : 
   11639             : static PyObject *
   11640     4048230 : unicode_lower_impl(PyObject *self)
   11641             : /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
   11642             : {
   11643     4048230 :     if (PyUnicode_IS_ASCII(self))
   11644      693951 :         return ascii_upper_or_lower(self, 1);
   11645     3354280 :     return case_operation(self, do_lower);
   11646             : }
   11647             : 
   11648             : #define LEFTSTRIP 0
   11649             : #define RIGHTSTRIP 1
   11650             : #define BOTHSTRIP 2
   11651             : 
   11652             : /* Arrays indexed by above */
   11653             : static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
   11654             : 
   11655             : #define STRIPNAME(i) (stripfuncnames[i])
   11656             : 
   11657             : /* externally visible for str.strip(unicode) */
   11658             : PyObject *
   11659     6980030 : _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
   11660             : {
   11661             :     const void *data;
   11662             :     int kind;
   11663             :     Py_ssize_t i, j, len;
   11664             :     BLOOM_MASK sepmask;
   11665             :     Py_ssize_t seplen;
   11666             : 
   11667     6980030 :     kind = PyUnicode_KIND(self);
   11668     6980030 :     data = PyUnicode_DATA(self);
   11669     6980030 :     len = PyUnicode_GET_LENGTH(self);
   11670     6980030 :     seplen = PyUnicode_GET_LENGTH(sepobj);
   11671     6980030 :     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
   11672     6980030 :                               PyUnicode_DATA(sepobj),
   11673             :                               seplen);
   11674             : 
   11675     6980030 :     i = 0;
   11676     6980030 :     if (striptype != RIGHTSTRIP) {
   11677      713557 :         while (i < len) {
   11678      703398 :             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11679      703398 :             if (!BLOOM(sepmask, ch))
   11680      227633 :                 break;
   11681      475765 :             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   11682        2581 :                 break;
   11683      473184 :             i++;
   11684             :         }
   11685             :     }
   11686             : 
   11687     6980030 :     j = len;
   11688     6980030 :     if (striptype != LEFTSTRIP) {
   11689     6778570 :         j--;
   11690     7267610 :         while (j >= i) {
   11691     7254220 :             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   11692     7254220 :             if (!BLOOM(sepmask, ch))
   11693     4890190 :                 break;
   11694     2364030 :             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   11695     1874990 :                 break;
   11696      489043 :             j--;
   11697             :         }
   11698             : 
   11699     6778570 :         j++;
   11700             :     }
   11701             : 
   11702     6980030 :     return PyUnicode_Substring(self, i, j);
   11703             : }
   11704             : 
   11705             : PyObject*
   11706    29569500 : PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
   11707             : {
   11708             :     const unsigned char *data;
   11709             :     int kind;
   11710             :     Py_ssize_t length;
   11711             : 
   11712    29569500 :     length = PyUnicode_GET_LENGTH(self);
   11713    29569500 :     end = Py_MIN(end, length);
   11714             : 
   11715    29569500 :     if (start == 0 && end == length)
   11716     7161340 :         return unicode_result_unchanged(self);
   11717             : 
   11718    22408200 :     if (start < 0 || end < 0) {
   11719           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
   11720           0 :         return NULL;
   11721             :     }
   11722    22408200 :     if (start >= length || end < start)
   11723       29761 :         _Py_RETURN_UNICODE_EMPTY();
   11724             : 
   11725    22378400 :     length = end - start;
   11726    22378400 :     if (PyUnicode_IS_ASCII(self)) {
   11727    22084500 :         data = PyUnicode_1BYTE_DATA(self);
   11728    22084500 :         return _PyUnicode_FromASCII((const char*)(data + start), length);
   11729             :     }
   11730             :     else {
   11731      293914 :         kind = PyUnicode_KIND(self);
   11732      293914 :         data = PyUnicode_1BYTE_DATA(self);
   11733      293914 :         return PyUnicode_FromKindAndData(kind,
   11734      293914 :                                          data + kind * start,
   11735             :                                          length);
   11736             :     }
   11737             : }
   11738             : 
   11739             : static PyObject *
   11740     1205060 : do_strip(PyObject *self, int striptype)
   11741             : {
   11742             :     Py_ssize_t len, i, j;
   11743             : 
   11744     1205060 :     len = PyUnicode_GET_LENGTH(self);
   11745             : 
   11746     1205060 :     if (PyUnicode_IS_ASCII(self)) {
   11747     1204110 :         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
   11748             : 
   11749     1204110 :         i = 0;
   11750     1204110 :         if (striptype != RIGHTSTRIP) {
   11751     3177310 :             while (i < len) {
   11752     3119040 :                 Py_UCS1 ch = data[i];
   11753     3119040 :                 if (!_Py_ascii_whitespace[ch])
   11754     1071490 :                     break;
   11755     2047560 :                 i++;
   11756             :             }
   11757             :         }
   11758             : 
   11759     1204110 :         j = len;
   11760     1204110 :         if (striptype != LEFTSTRIP) {
   11761     1173150 :             j--;
   11762     2080240 :             while (j >= i) {
   11763     2022310 :                 Py_UCS1 ch = data[j];
   11764     2022310 :                 if (!_Py_ascii_whitespace[ch])
   11765     1115220 :                     break;
   11766      907088 :                 j--;
   11767             :             }
   11768     1173150 :             j++;
   11769             :         }
   11770             :     }
   11771             :     else {
   11772         944 :         int kind = PyUnicode_KIND(self);
   11773         944 :         const void *data = PyUnicode_DATA(self);
   11774             : 
   11775         944 :         i = 0;
   11776         944 :         if (striptype != RIGHTSTRIP) {
   11777        1512 :             while (i < len) {
   11778        1510 :                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11779        1510 :                 if (!Py_UNICODE_ISSPACE(ch))
   11780         789 :                     break;
   11781         721 :                 i++;
   11782             :             }
   11783             :         }
   11784             : 
   11785         944 :         j = len;
   11786         944 :         if (striptype != LEFTSTRIP) {
   11787         464 :             j--;
   11788         756 :             while (j >= i) {
   11789         754 :                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   11790         754 :                 if (!Py_UNICODE_ISSPACE(ch))
   11791         462 :                     break;
   11792         292 :                 j--;
   11793             :             }
   11794         464 :             j++;
   11795             :         }
   11796             :     }
   11797             : 
   11798     1205060 :     return PyUnicode_Substring(self, i, j);
   11799             : }
   11800             : 
   11801             : 
   11802             : static PyObject *
   11803     8185090 : do_argstrip(PyObject *self, int striptype, PyObject *sep)
   11804             : {
   11805     8185090 :     if (sep != Py_None) {
   11806     6980030 :         if (PyUnicode_Check(sep))
   11807     6980030 :             return _PyUnicode_XStrip(self, striptype, sep);
   11808             :         else {
   11809           0 :             PyErr_Format(PyExc_TypeError,
   11810             :                          "%s arg must be None or str",
   11811             :                          STRIPNAME(striptype));
   11812           0 :             return NULL;
   11813             :         }
   11814             :     }
   11815             : 
   11816     1205060 :     return do_strip(self, striptype);
   11817             : }
   11818             : 
   11819             : 
   11820             : /*[clinic input]
   11821             : str.strip as unicode_strip
   11822             : 
   11823             :     chars: object = None
   11824             :     /
   11825             : 
   11826             : Return a copy of the string with leading and trailing whitespace removed.
   11827             : 
   11828             : If chars is given and not None, remove characters in chars instead.
   11829             : [clinic start generated code]*/
   11830             : 
   11831             : static PyObject *
   11832     1138010 : unicode_strip_impl(PyObject *self, PyObject *chars)
   11833             : /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
   11834             : {
   11835     1138010 :     return do_argstrip(self, BOTHSTRIP, chars);
   11836             : }
   11837             : 
   11838             : 
   11839             : /*[clinic input]
   11840             : str.lstrip as unicode_lstrip
   11841             : 
   11842             :     chars: object = None
   11843             :     /
   11844             : 
   11845             : Return a copy of the string with leading whitespace removed.
   11846             : 
   11847             : If chars is given and not None, remove characters in chars instead.
   11848             : [clinic start generated code]*/
   11849             : 
   11850             : static PyObject *
   11851      232904 : unicode_lstrip_impl(PyObject *self, PyObject *chars)
   11852             : /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
   11853             : {
   11854      232904 :     return do_argstrip(self, LEFTSTRIP, chars);
   11855             : }
   11856             : 
   11857             : 
   11858             : /*[clinic input]
   11859             : str.rstrip as unicode_rstrip
   11860             : 
   11861             :     chars: object = None
   11862             :     /
   11863             : 
   11864             : Return a copy of the string with trailing whitespace removed.
   11865             : 
   11866             : If chars is given and not None, remove characters in chars instead.
   11867             : [clinic start generated code]*/
   11868             : 
   11869             : static PyObject *
   11870     6814170 : unicode_rstrip_impl(PyObject *self, PyObject *chars)
   11871             : /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
   11872             : {
   11873     6814170 :     return do_argstrip(self, RIGHTSTRIP, chars);
   11874             : }
   11875             : 
   11876             : 
   11877             : static PyObject*
   11878     1670150 : unicode_repeat(PyObject *str, Py_ssize_t len)
   11879             : {
   11880             :     PyObject *u;
   11881             :     Py_ssize_t nchars, n;
   11882             : 
   11883     1670150 :     if (len < 1)
   11884      135511 :         _Py_RETURN_UNICODE_EMPTY();
   11885             : 
   11886             :     /* no repeat, return original string */
   11887     1534640 :     if (len == 1)
   11888      538292 :         return unicode_result_unchanged(str);
   11889             : 
   11890      996345 :     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
   11891           0 :         PyErr_SetString(PyExc_OverflowError,
   11892             :                         "repeated string is too long");
   11893           0 :         return NULL;
   11894             :     }
   11895      996345 :     nchars = len * PyUnicode_GET_LENGTH(str);
   11896             : 
   11897      996345 :     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
   11898      996345 :     if (!u)
   11899           8 :         return NULL;
   11900      996337 :     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
   11901             : 
   11902      996337 :     if (PyUnicode_GET_LENGTH(str) == 1) {
   11903      614039 :         int kind = PyUnicode_KIND(str);
   11904      614039 :         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
   11905      614039 :         if (kind == PyUnicode_1BYTE_KIND) {
   11906      613660 :             void *to = PyUnicode_DATA(u);
   11907      613660 :             memset(to, (unsigned char)fill_char, len);
   11908             :         }
   11909         379 :         else if (kind == PyUnicode_2BYTE_KIND) {
   11910         256 :             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
   11911       69172 :             for (n = 0; n < len; ++n)
   11912       68916 :                 ucs2[n] = fill_char;
   11913             :         } else {
   11914         123 :             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
   11915         123 :             assert(kind == PyUnicode_4BYTE_KIND);
   11916       11587 :             for (n = 0; n < len; ++n)
   11917       11464 :                 ucs4[n] = fill_char;
   11918             :         }
   11919             :     }
   11920             :     else {
   11921      382298 :         Py_ssize_t char_size = PyUnicode_KIND(str);
   11922      382298 :         char *to = (char *) PyUnicode_DATA(u);
   11923      382298 :         _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
   11924      382298 :             PyUnicode_GET_LENGTH(str) * char_size);
   11925             :     }
   11926             : 
   11927      996337 :     assert(_PyUnicode_CheckConsistency(u, 1));
   11928      996337 :     return u;
   11929             : }
   11930             : 
   11931             : PyObject *
   11932         356 : PyUnicode_Replace(PyObject *str,
   11933             :                   PyObject *substr,
   11934             :                   PyObject *replstr,
   11935             :                   Py_ssize_t maxcount)
   11936             : {
   11937         712 :     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
   11938         356 :             ensure_unicode(replstr) < 0)
   11939           0 :         return NULL;
   11940         356 :     return replace(str, substr, replstr, maxcount);
   11941             : }
   11942             : 
   11943             : /*[clinic input]
   11944             : str.replace as unicode_replace
   11945             : 
   11946             :     old: unicode
   11947             :     new: unicode
   11948             :     count: Py_ssize_t = -1
   11949             :         Maximum number of occurrences to replace.
   11950             :         -1 (the default value) means replace all occurrences.
   11951             :     /
   11952             : 
   11953             : Return a copy with all occurrences of substring old replaced by new.
   11954             : 
   11955             : If the optional argument count is given, only the first count occurrences are
   11956             : replaced.
   11957             : [clinic start generated code]*/
   11958             : 
   11959             : static PyObject *
   11960     3239040 : unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
   11961             :                      Py_ssize_t count)
   11962             : /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
   11963             : {
   11964     3239040 :     return replace(self, old, new, count);
   11965             : }
   11966             : 
   11967             : /*[clinic input]
   11968             : str.removeprefix as unicode_removeprefix
   11969             : 
   11970             :     prefix: unicode
   11971             :     /
   11972             : 
   11973             : Return a str with the given prefix string removed if present.
   11974             : 
   11975             : If the string starts with the prefix string, return string[len(prefix):].
   11976             : Otherwise, return a copy of the original string.
   11977             : [clinic start generated code]*/
   11978             : 
   11979             : static PyObject *
   11980       30314 : unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
   11981             : /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
   11982             : {
   11983       30314 :     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
   11984       30314 :     if (match == -1) {
   11985           0 :         return NULL;
   11986             :     }
   11987       30314 :     if (match) {
   11988       30016 :         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
   11989             :                                    PyUnicode_GET_LENGTH(self));
   11990             :     }
   11991         298 :     return unicode_result_unchanged(self);
   11992             : }
   11993             : 
   11994             : /*[clinic input]
   11995             : str.removesuffix as unicode_removesuffix
   11996             : 
   11997             :     suffix: unicode
   11998             :     /
   11999             : 
   12000             : Return a str with the given suffix string removed if present.
   12001             : 
   12002             : If the string ends with the suffix string and that suffix is not empty,
   12003             : return string[:-len(suffix)]. Otherwise, return a copy of the original
   12004             : string.
   12005             : [clinic start generated code]*/
   12006             : 
   12007             : static PyObject *
   12008          28 : unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
   12009             : /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
   12010             : {
   12011          28 :     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
   12012          28 :     if (match == -1) {
   12013           0 :         return NULL;
   12014             :     }
   12015          28 :     if (match) {
   12016          32 :         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
   12017          16 :                                             - PyUnicode_GET_LENGTH(suffix));
   12018             :     }
   12019          12 :     return unicode_result_unchanged(self);
   12020             : }
   12021             : 
   12022             : static PyObject *
   12023      652601 : unicode_repr(PyObject *unicode)
   12024             : {
   12025             :     PyObject *repr;
   12026             :     Py_ssize_t isize;
   12027             :     Py_ssize_t osize, squote, dquote, i, o;
   12028             :     Py_UCS4 max, quote;
   12029             :     int ikind, okind, unchanged;
   12030             :     const void *idata;
   12031             :     void *odata;
   12032             : 
   12033      652601 :     isize = PyUnicode_GET_LENGTH(unicode);
   12034      652601 :     idata = PyUnicode_DATA(unicode);
   12035             : 
   12036             :     /* Compute length of output, quote characters, and
   12037             :        maximum character */
   12038      652601 :     osize = 0;
   12039      652601 :     max = 127;
   12040      652601 :     squote = dquote = 0;
   12041      652601 :     ikind = PyUnicode_KIND(unicode);
   12042    12778300 :     for (i = 0; i < isize; i++) {
   12043    12125700 :         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12044    12125700 :         Py_ssize_t incr = 1;
   12045    12125700 :         switch (ch) {
   12046       37378 :         case '\'': squote++; break;
   12047       40281 :         case '"':  dquote++; break;
   12048       84351 :         case '\\': case '\t': case '\r': case '\n':
   12049       84351 :             incr = 2;
   12050       84351 :             break;
   12051    11963700 :         default:
   12052             :             /* Fast-path ASCII */
   12053    11963700 :             if (ch < ' ' || ch == 0x7f)
   12054        8855 :                 incr = 4; /* \xHH */
   12055    11954900 :             else if (ch < 0x7f)
   12056             :                 ;
   12057       93795 :             else if (Py_UNICODE_ISPRINTABLE(ch))
   12058       54357 :                 max = ch > max ? ch : max;
   12059       39438 :             else if (ch < 0x100)
   12060         319 :                 incr = 4; /* \xHH */
   12061       39119 :             else if (ch < 0x10000)
   12062       28610 :                 incr = 6; /* \uHHHH */
   12063             :             else
   12064       10509 :                 incr = 10; /* \uHHHHHHHH */
   12065             :         }
   12066    12125700 :         if (osize > PY_SSIZE_T_MAX - incr) {
   12067           0 :             PyErr_SetString(PyExc_OverflowError,
   12068             :                             "string is too long to generate repr");
   12069           0 :             return NULL;
   12070             :         }
   12071    12125700 :         osize += incr;
   12072             :     }
   12073             : 
   12074      652601 :     quote = '\'';
   12075      652601 :     unchanged = (osize == isize);
   12076      652601 :     if (squote) {
   12077       21853 :         unchanged = 0;
   12078       21853 :         if (dquote)
   12079             :             /* Both squote and dquote present. Use squote,
   12080             :                and escape them */
   12081        4121 :             osize += squote;
   12082             :         else
   12083       17732 :             quote = '"';
   12084             :     }
   12085      652601 :     osize += 2;   /* quotes */
   12086             : 
   12087      652601 :     repr = PyUnicode_New(osize, max);
   12088      652601 :     if (repr == NULL)
   12089           0 :         return NULL;
   12090      652601 :     okind = PyUnicode_KIND(repr);
   12091      652601 :     odata = PyUnicode_DATA(repr);
   12092             : 
   12093      652601 :     PyUnicode_WRITE(okind, odata, 0, quote);
   12094      652601 :     PyUnicode_WRITE(okind, odata, osize-1, quote);
   12095      652601 :     if (unchanged) {
   12096      607865 :         _PyUnicode_FastCopyCharacters(repr, 1,
   12097             :                                       unicode, 0,
   12098             :                                       isize);
   12099             :     }
   12100             :     else {
   12101     2418650 :         for (i = 0, o = 1; i < isize; i++) {
   12102     2373910 :             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12103             : 
   12104             :             /* Escape quotes and backslashes */
   12105     2373910 :             if ((ch == quote) || (ch == '\\')) {
   12106       20795 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12107       20795 :                 PyUnicode_WRITE(okind, odata, o++, ch);
   12108       20795 :                 continue;
   12109             :             }
   12110             : 
   12111             :             /* Map special whitespace to '\t', \n', '\r' */
   12112     2353120 :             if (ch == '\t') {
   12113       22461 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12114       22461 :                 PyUnicode_WRITE(okind, odata, o++, 't');
   12115             :             }
   12116     2330660 :             else if (ch == '\n') {
   12117       51431 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12118       51431 :                 PyUnicode_WRITE(okind, odata, o++, 'n');
   12119             :             }
   12120     2279220 :             else if (ch == '\r') {
   12121        1153 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12122        1153 :                 PyUnicode_WRITE(okind, odata, o++, 'r');
   12123             :             }
   12124             : 
   12125             :             /* Map non-printable US ASCII to '\xhh' */
   12126     2278070 :             else if (ch < ' ' || ch == 0x7F) {
   12127        8855 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12128        8855 :                 PyUnicode_WRITE(okind, odata, o++, 'x');
   12129        8855 :                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12130        8855 :                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12131             :             }
   12132             : 
   12133             :             /* Copy ASCII characters as-is */
   12134     2269220 :             else if (ch < 0x7F) {
   12135     2206910 :                 PyUnicode_WRITE(okind, odata, o++, ch);
   12136             :             }
   12137             : 
   12138             :             /* Non-ASCII characters */
   12139             :             else {
   12140             :                 /* Map Unicode whitespace and control characters
   12141             :                    (categories Z* and C* except ASCII space)
   12142             :                 */
   12143       62308 :                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
   12144       39438 :                     PyUnicode_WRITE(okind, odata, o++, '\\');
   12145             :                     /* Map 8-bit characters to '\xhh' */
   12146       39438 :                     if (ch <= 0xff) {
   12147         319 :                         PyUnicode_WRITE(okind, odata, o++, 'x');
   12148         319 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12149         319 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12150             :                     }
   12151             :                     /* Map 16-bit characters to '\uxxxx' */
   12152       39119 :                     else if (ch <= 0xffff) {
   12153       28610 :                         PyUnicode_WRITE(okind, odata, o++, 'u');
   12154       28610 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12155       28610 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12156       28610 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12157       28610 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12158             :                     }
   12159             :                     /* Map 21-bit characters to '\U00xxxxxx' */
   12160             :                     else {
   12161       10509 :                         PyUnicode_WRITE(okind, odata, o++, 'U');
   12162       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
   12163       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
   12164       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
   12165       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
   12166       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12167       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12168       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12169       10509 :                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12170             :                     }
   12171             :                 }
   12172             :                 /* Copy characters as-is */
   12173             :                 else {
   12174       22870 :                     PyUnicode_WRITE(okind, odata, o++, ch);
   12175             :                 }
   12176             :             }
   12177             :         }
   12178             :     }
   12179             :     /* Closing quote already added at the beginning */
   12180      652601 :     assert(_PyUnicode_CheckConsistency(repr, 1));
   12181      652601 :     return repr;
   12182             : }
   12183             : 
   12184             : PyDoc_STRVAR(rfind__doc__,
   12185             :              "S.rfind(sub[, start[, end]]) -> int\n\
   12186             : \n\
   12187             : Return the highest index in S where substring sub is found,\n\
   12188             : such that sub is contained within S[start:end].  Optional\n\
   12189             : arguments start and end are interpreted as in slice notation.\n\
   12190             : \n\
   12191             : Return -1 on failure.");
   12192             : 
   12193             : static PyObject *
   12194     1479480 : unicode_rfind(PyObject *self, PyObject *args)
   12195             : {
   12196             :     /* initialize variables to prevent gcc warning */
   12197     1479480 :     PyObject *substring = NULL;
   12198     1479480 :     Py_ssize_t start = 0;
   12199     1479480 :     Py_ssize_t end = 0;
   12200             :     Py_ssize_t result;
   12201             : 
   12202     1479480 :     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
   12203           5 :         return NULL;
   12204             : 
   12205     1479470 :     result = any_find_slice(self, substring, start, end, -1);
   12206             : 
   12207     1479470 :     if (result == -2)
   12208           0 :         return NULL;
   12209             : 
   12210     1479470 :     return PyLong_FromSsize_t(result);
   12211             : }
   12212             : 
   12213             : PyDoc_STRVAR(rindex__doc__,
   12214             :              "S.rindex(sub[, start[, end]]) -> int\n\
   12215             : \n\
   12216             : Return the highest index in S where substring sub is found,\n\
   12217             : such that sub is contained within S[start:end].  Optional\n\
   12218             : arguments start and end are interpreted as in slice notation.\n\
   12219             : \n\
   12220             : Raises ValueError when the substring is not found.");
   12221             : 
   12222             : static PyObject *
   12223         198 : unicode_rindex(PyObject *self, PyObject *args)
   12224             : {
   12225             :     /* initialize variables to prevent gcc warning */
   12226         198 :     PyObject *substring = NULL;
   12227         198 :     Py_ssize_t start = 0;
   12228         198 :     Py_ssize_t end = 0;
   12229             :     Py_ssize_t result;
   12230             : 
   12231         198 :     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
   12232           5 :         return NULL;
   12233             : 
   12234         193 :     result = any_find_slice(self, substring, start, end, -1);
   12235             : 
   12236         193 :     if (result == -2)
   12237           0 :         return NULL;
   12238             : 
   12239         193 :     if (result < 0) {
   12240          25 :         PyErr_SetString(PyExc_ValueError, "substring not found");
   12241          25 :         return NULL;
   12242             :     }
   12243             : 
   12244         168 :     return PyLong_FromSsize_t(result);
   12245             : }
   12246             : 
   12247             : /*[clinic input]
   12248             : str.rjust as unicode_rjust
   12249             : 
   12250             :     width: Py_ssize_t
   12251             :     fillchar: Py_UCS4 = ' '
   12252             :     /
   12253             : 
   12254             : Return a right-justified string of length width.
   12255             : 
   12256             : Padding is done using the specified fill character (default is a space).
   12257             : [clinic start generated code]*/
   12258             : 
   12259             : static PyObject *
   12260       48279 : unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   12261             : /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
   12262             : {
   12263       48279 :     if (PyUnicode_GET_LENGTH(self) >= width)
   12264        7226 :         return unicode_result_unchanged(self);
   12265             : 
   12266       41053 :     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
   12267             : }
   12268             : 
   12269             : PyObject *
   12270      198154 : PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12271             : {
   12272      198154 :     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   12273           0 :         return NULL;
   12274             : 
   12275      198154 :     return split(s, sep, maxsplit);
   12276             : }
   12277             : 
   12278             : /*[clinic input]
   12279             : str.split as unicode_split
   12280             : 
   12281             :     sep: object = None
   12282             :         The separator used to split the string.
   12283             : 
   12284             :         When set to None (the default value), will split on any whitespace
   12285             :         character (including \\n \\r \\t \\f and spaces) and will discard
   12286             :         empty strings from the result.
   12287             :     maxsplit: Py_ssize_t = -1
   12288             :         Maximum number of splits (starting from the left).
   12289             :         -1 (the default value) means no limit.
   12290             : 
   12291             : Return a list of the substrings in the string, using sep as the separator string.
   12292             : 
   12293             : Note, str.split() is mainly useful for data that has been intentionally
   12294             : delimited.  With natural text that includes punctuation, consider using
   12295             : the regular expression module.
   12296             : 
   12297             : [clinic start generated code]*/
   12298             : 
   12299             : static PyObject *
   12300     2172230 : unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
   12301             : /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
   12302             : {
   12303     2172230 :     if (sep == Py_None)
   12304      478186 :         return split(self, NULL, maxsplit);
   12305     1694050 :     if (PyUnicode_Check(sep))
   12306     1694040 :         return split(self, sep, maxsplit);
   12307             : 
   12308           5 :     PyErr_Format(PyExc_TypeError,
   12309             :                  "must be str or None, not %.100s",
   12310           5 :                  Py_TYPE(sep)->tp_name);
   12311           5 :     return NULL;
   12312             : }
   12313             : 
   12314             : PyObject *
   12315      164174 : PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
   12316             : {
   12317             :     PyObject* out;
   12318             :     int kind1, kind2;
   12319             :     const void *buf1, *buf2;
   12320             :     Py_ssize_t len1, len2;
   12321             : 
   12322      164174 :     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12323           2 :         return NULL;
   12324             : 
   12325      164172 :     kind1 = PyUnicode_KIND(str_obj);
   12326      164172 :     kind2 = PyUnicode_KIND(sep_obj);
   12327      164172 :     len1 = PyUnicode_GET_LENGTH(str_obj);
   12328      164172 :     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12329      164172 :     if (kind1 < kind2 || len1 < len2) {
   12330        3795 :         PyObject *empty = unicode_get_empty();  // Borrowed reference
   12331        3795 :         return PyTuple_Pack(3, str_obj, empty, empty);
   12332             :     }
   12333      160377 :     buf1 = PyUnicode_DATA(str_obj);
   12334      160377 :     buf2 = PyUnicode_DATA(sep_obj);
   12335      160377 :     if (kind2 != kind1) {
   12336          46 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
   12337          46 :         if (!buf2)
   12338           0 :             return NULL;
   12339             :     }
   12340             : 
   12341      160377 :     switch (kind1) {
   12342      160317 :     case PyUnicode_1BYTE_KIND:
   12343      160317 :         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12344      153654 :             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12345             :         else
   12346        6663 :             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12347      160317 :         break;
   12348          44 :     case PyUnicode_2BYTE_KIND:
   12349          44 :         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12350          44 :         break;
   12351          16 :     case PyUnicode_4BYTE_KIND:
   12352          16 :         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12353          16 :         break;
   12354           0 :     default:
   12355           0 :         Py_UNREACHABLE();
   12356             :     }
   12357             : 
   12358      160377 :     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
   12359      160377 :     if (kind2 != kind1)
   12360          46 :         PyMem_Free((void *)buf2);
   12361             : 
   12362      160377 :     return out;
   12363             : }
   12364             : 
   12365             : 
   12366             : PyObject *
   12367     1735760 : PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
   12368             : {
   12369             :     PyObject* out;
   12370             :     int kind1, kind2;
   12371             :     const void *buf1, *buf2;
   12372             :     Py_ssize_t len1, len2;
   12373             : 
   12374     1735760 :     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12375           2 :         return NULL;
   12376             : 
   12377     1735750 :     kind1 = PyUnicode_KIND(str_obj);
   12378     1735750 :     kind2 = PyUnicode_KIND(sep_obj);
   12379     1735750 :     len1 = PyUnicode_GET_LENGTH(str_obj);
   12380     1735750 :     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12381     1735750 :     if (kind1 < kind2 || len1 < len2) {
   12382         143 :         PyObject *empty = unicode_get_empty();  // Borrowed reference
   12383         143 :         return PyTuple_Pack(3, empty, empty, str_obj);
   12384             :     }
   12385     1735610 :     buf1 = PyUnicode_DATA(str_obj);
   12386     1735610 :     buf2 = PyUnicode_DATA(sep_obj);
   12387     1735610 :     if (kind2 != kind1) {
   12388          77 :         buf2 = unicode_askind(kind2, buf2, len2, kind1);
   12389          77 :         if (!buf2)
   12390           0 :             return NULL;
   12391             :     }
   12392             : 
   12393     1735610 :     switch (kind1) {
   12394     1735520 :     case PyUnicode_1BYTE_KIND:
   12395     1735520 :         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12396     1735030 :             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12397             :         else
   12398         487 :             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12399     1735520 :         break;
   12400          75 :     case PyUnicode_2BYTE_KIND:
   12401          75 :         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12402          75 :         break;
   12403          16 :     case PyUnicode_4BYTE_KIND:
   12404          16 :         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12405          16 :         break;
   12406           0 :     default:
   12407           0 :         Py_UNREACHABLE();
   12408             :     }
   12409             : 
   12410     1735610 :     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
   12411     1735610 :     if (kind2 != kind1)
   12412          77 :         PyMem_Free((void *)buf2);
   12413             : 
   12414     1735610 :     return out;
   12415             : }
   12416             : 
   12417             : /*[clinic input]
   12418             : str.partition as unicode_partition
   12419             : 
   12420             :     sep: object
   12421             :     /
   12422             : 
   12423             : Partition the string into three parts using the given separator.
   12424             : 
   12425             : This will search for the separator in the string.  If the separator is found,
   12426             : returns a 3-tuple containing the part before the separator, the separator
   12427             : itself, and the part after it.
   12428             : 
   12429             : If the separator is not found, returns a 3-tuple containing the original string
   12430             : and two empty strings.
   12431             : [clinic start generated code]*/
   12432             : 
   12433             : static PyObject *
   12434      164174 : unicode_partition(PyObject *self, PyObject *sep)
   12435             : /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
   12436             : {
   12437      164174 :     return PyUnicode_Partition(self, sep);
   12438             : }
   12439             : 
   12440             : /*[clinic input]
   12441             : str.rpartition as unicode_rpartition = str.partition
   12442             : 
   12443             : Partition the string into three parts using the given separator.
   12444             : 
   12445             : This will search for the separator in the string, starting at the end. If
   12446             : the separator is found, returns a 3-tuple containing the part before the
   12447             : separator, the separator itself, and the part after it.
   12448             : 
   12449             : If the separator is not found, returns a 3-tuple containing two empty strings
   12450             : and the original string.
   12451             : [clinic start generated code]*/
   12452             : 
   12453             : static PyObject *
   12454     1735760 : unicode_rpartition(PyObject *self, PyObject *sep)
   12455             : /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
   12456             : {
   12457     1735760 :     return PyUnicode_RPartition(self, sep);
   12458             : }
   12459             : 
   12460             : PyObject *
   12461           0 : PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12462             : {
   12463           0 :     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   12464           0 :         return NULL;
   12465             : 
   12466           0 :     return rsplit(s, sep, maxsplit);
   12467             : }
   12468             : 
   12469             : /*[clinic input]
   12470             : str.rsplit as unicode_rsplit = str.split
   12471             : 
   12472             : Return a list of the substrings in the string, using sep as the separator string.
   12473             : 
   12474             : Splitting starts at the end of the string and works to the front.
   12475             : [clinic start generated code]*/
   12476             : 
   12477             : static PyObject *
   12478        3824 : unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
   12479             : /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
   12480             : {
   12481        3824 :     if (sep == Py_None)
   12482          76 :         return rsplit(self, NULL, maxsplit);
   12483        3748 :     if (PyUnicode_Check(sep))
   12484        3748 :         return rsplit(self, sep, maxsplit);
   12485             : 
   12486           0 :     PyErr_Format(PyExc_TypeError,
   12487             :                  "must be str or None, not %.100s",
   12488           0 :                  Py_TYPE(sep)->tp_name);
   12489           0 :     return NULL;
   12490             : }
   12491             : 
   12492             : /*[clinic input]
   12493             : str.splitlines as unicode_splitlines
   12494             : 
   12495             :     keepends: bool(accept={int}) = False
   12496             : 
   12497             : Return a list of the lines in the string, breaking at line boundaries.
   12498             : 
   12499             : Line breaks are not included in the resulting list unless keepends is given and
   12500             : true.
   12501             : [clinic start generated code]*/
   12502             : 
   12503             : static PyObject *
   12504      287964 : unicode_splitlines_impl(PyObject *self, int keepends)
   12505             : /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
   12506             : {
   12507      287964 :     return PyUnicode_Splitlines(self, keepends);
   12508             : }
   12509             : 
   12510             : static
   12511       61346 : PyObject *unicode_str(PyObject *self)
   12512             : {
   12513       61346 :     return unicode_result_unchanged(self);
   12514             : }
   12515             : 
   12516             : /*[clinic input]
   12517             : str.swapcase as unicode_swapcase
   12518             : 
   12519             : Convert uppercase characters to lowercase and lowercase characters to uppercase.
   12520             : [clinic start generated code]*/
   12521             : 
   12522             : static PyObject *
   12523          29 : unicode_swapcase_impl(PyObject *self)
   12524             : /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
   12525             : {
   12526          29 :     return case_operation(self, do_swapcase);
   12527             : }
   12528             : 
   12529             : /*[clinic input]
   12530             : 
   12531             : @staticmethod
   12532             : str.maketrans as unicode_maketrans
   12533             : 
   12534             :   x: object
   12535             : 
   12536             :   y: unicode=NULL
   12537             : 
   12538             :   z: unicode=NULL
   12539             : 
   12540             :   /
   12541             : 
   12542             : Return a translation table usable for str.translate().
   12543             : 
   12544             : If there is only one argument, it must be a dictionary mapping Unicode
   12545             : ordinals (integers) or characters to Unicode ordinals, strings or None.
   12546             : Character keys will be then converted to ordinals.
   12547             : If there are two arguments, they must be strings of equal length, and
   12548             : in the resulting dictionary, each character in x will be mapped to the
   12549             : character at the same position in y. If there is a third argument, it
   12550             : must be a string, whose characters will be mapped to None in the result.
   12551             : [clinic start generated code]*/
   12552             : 
   12553             : static PyObject *
   12554         138 : unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
   12555             : /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
   12556             : {
   12557         138 :     PyObject *new = NULL, *key, *value;
   12558         138 :     Py_ssize_t i = 0;
   12559             :     int res;
   12560             : 
   12561         138 :     new = PyDict_New();
   12562         138 :     if (!new)
   12563           0 :         return NULL;
   12564         138 :     if (y != NULL) {
   12565             :         int x_kind, y_kind, z_kind;
   12566             :         const void *x_data, *y_data, *z_data;
   12567             : 
   12568             :         /* x must be a string too, of equal length */
   12569          27 :         if (!PyUnicode_Check(x)) {
   12570           1 :             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
   12571             :                             "be a string if there is a second argument");
   12572           1 :             goto err;
   12573             :         }
   12574          26 :         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
   12575           1 :             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
   12576             :                             "arguments must have equal length");
   12577           1 :             goto err;
   12578             :         }
   12579             :         /* create entries for translating chars in x to those in y */
   12580          25 :         x_kind = PyUnicode_KIND(x);
   12581          25 :         y_kind = PyUnicode_KIND(y);
   12582          25 :         x_data = PyUnicode_DATA(x);
   12583          25 :         y_data = PyUnicode_DATA(y);
   12584          57 :         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
   12585          32 :             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
   12586          32 :             if (!key)
   12587           0 :                 goto err;
   12588          32 :             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
   12589          32 :             if (!value) {
   12590           0 :                 Py_DECREF(key);
   12591           0 :                 goto err;
   12592             :             }
   12593          32 :             res = PyDict_SetItem(new, key, value);
   12594          32 :             Py_DECREF(key);
   12595          32 :             Py_DECREF(value);
   12596          32 :             if (res < 0)
   12597           0 :                 goto err;
   12598             :         }
   12599             :         /* create entries for deleting chars in z */
   12600          25 :         if (z != NULL) {
   12601           2 :             z_kind = PyUnicode_KIND(z);
   12602           2 :             z_data = PyUnicode_DATA(z);
   12603           6 :             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
   12604           4 :                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
   12605           4 :                 if (!key)
   12606           0 :                     goto err;
   12607           4 :                 res = PyDict_SetItem(new, key, Py_None);
   12608           4 :                 Py_DECREF(key);
   12609           4 :                 if (res < 0)
   12610           0 :                     goto err;
   12611             :             }
   12612             :         }
   12613             :     } else {
   12614             :         int kind;
   12615             :         const void *data;
   12616             : 
   12617             :         /* x must be a dict */
   12618         111 :         if (!PyDict_CheckExact(x)) {
   12619           0 :             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
   12620             :                             "to maketrans it must be a dict");
   12621           0 :             goto err;
   12622             :         }
   12623             :         /* copy entries into the new dict, converting string keys to int keys */
   12624         749 :         while (PyDict_Next(x, &i, &key, &value)) {
   12625         640 :             if (PyUnicode_Check(key)) {
   12626             :                 /* convert string keys to integer keys */
   12627             :                 PyObject *newkey;
   12628         639 :                 if (PyUnicode_GET_LENGTH(key) != 1) {
   12629           1 :                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
   12630             :                                     "table must be of length 1");
   12631           1 :                     goto err;
   12632             :                 }
   12633         638 :                 kind = PyUnicode_KIND(key);
   12634         638 :                 data = PyUnicode_DATA(key);
   12635         638 :                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
   12636         638 :                 if (!newkey)
   12637           0 :                     goto err;
   12638         638 :                 res = PyDict_SetItem(new, newkey, value);
   12639         638 :                 Py_DECREF(newkey);
   12640         638 :                 if (res < 0)
   12641           0 :                     goto err;
   12642           1 :             } else if (PyLong_Check(key)) {
   12643             :                 /* just keep integer keys */
   12644           0 :                 if (PyDict_SetItem(new, key, value) < 0)
   12645           0 :                     goto err;
   12646             :             } else {
   12647           1 :                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
   12648             :                                 "be strings or integers");
   12649           1 :                 goto err;
   12650             :             }
   12651             :         }
   12652             :     }
   12653         134 :     return new;
   12654           4 :   err:
   12655           4 :     Py_DECREF(new);
   12656           4 :     return NULL;
   12657             : }
   12658             : 
   12659             : /*[clinic input]
   12660             : str.translate as unicode_translate
   12661             : 
   12662             :     table: object
   12663             :         Translation table, which must be a mapping of Unicode ordinals to
   12664             :         Unicode ordinals, strings, or None.
   12665             :     /
   12666             : 
   12667             : Replace each character in the string using the given translation table.
   12668             : 
   12669             : The table must implement lookup/indexing via __getitem__, for instance a
   12670             : dictionary or list.  If this operation raises LookupError, the character is
   12671             : left untouched.  Characters mapped to None are deleted.
   12672             : [clinic start generated code]*/
   12673             : 
   12674             : static PyObject *
   12675       99461 : unicode_translate(PyObject *self, PyObject *table)
   12676             : /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
   12677             : {
   12678       99461 :     return _PyUnicode_TranslateCharmap(self, table, "ignore");
   12679             : }
   12680             : 
   12681             : /*[clinic input]
   12682             : str.upper as unicode_upper
   12683             : 
   12684             : Return a copy of the string converted to uppercase.
   12685             : [clinic start generated code]*/
   12686             : 
   12687             : static PyObject *
   12688     3413000 : unicode_upper_impl(PyObject *self)
   12689             : /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
   12690             : {
   12691     3413000 :     if (PyUnicode_IS_ASCII(self))
   12692       63944 :         return ascii_upper_or_lower(self, 0);
   12693     3349050 :     return case_operation(self, do_upper);
   12694             : }
   12695             : 
   12696             : /*[clinic input]
   12697             : str.zfill as unicode_zfill
   12698             : 
   12699             :     width: Py_ssize_t
   12700             :     /
   12701             : 
   12702             : Pad a numeric string with zeros on the left, to fill a field of the given width.
   12703             : 
   12704             : The string is never truncated.
   12705             : [clinic start generated code]*/
   12706             : 
   12707             : static PyObject *
   12708          64 : unicode_zfill_impl(PyObject *self, Py_ssize_t width)
   12709             : /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
   12710             : {
   12711             :     Py_ssize_t fill;
   12712             :     PyObject *u;
   12713             :     int kind;
   12714             :     const void *data;
   12715             :     Py_UCS4 chr;
   12716             : 
   12717          64 :     if (PyUnicode_GET_LENGTH(self) >= width)
   12718          21 :         return unicode_result_unchanged(self);
   12719             : 
   12720          43 :     fill = width - PyUnicode_GET_LENGTH(self);
   12721             : 
   12722          43 :     u = pad(self, fill, 0, '0');
   12723             : 
   12724          43 :     if (u == NULL)
   12725           0 :         return NULL;
   12726             : 
   12727          43 :     kind = PyUnicode_KIND(u);
   12728          43 :     data = PyUnicode_DATA(u);
   12729          43 :     chr = PyUnicode_READ(kind, data, fill);
   12730             : 
   12731          43 :     if (chr == '+' || chr == '-') {
   12732             :         /* move sign to beginning of string */
   12733           5 :         PyUnicode_WRITE(kind, data, 0, chr);
   12734           5 :         PyUnicode_WRITE(kind, data, fill, '0');
   12735             :     }
   12736             : 
   12737          43 :     assert(_PyUnicode_CheckConsistency(u, 1));
   12738          43 :     return u;
   12739             : }
   12740             : 
   12741             : PyDoc_STRVAR(startswith__doc__,
   12742             :              "S.startswith(prefix[, start[, end]]) -> bool\n\
   12743             : \n\
   12744             : Return True if S starts with the specified prefix, False otherwise.\n\
   12745             : With optional start, test S beginning at that position.\n\
   12746             : With optional end, stop comparing S at that position.\n\
   12747             : prefix can also be a tuple of strings to try.");
   12748             : 
   12749             : static PyObject *
   12750     8892020 : unicode_startswith(PyObject *self,
   12751             :                    PyObject *args)
   12752             : {
                           /* Implements str.startswith().  args is unpacked into the prefix
                              object plus the optional start/end bounds (defaults: 0 and
                              PY_SSIZE_T_MAX).  The prefix must be a str or a tuple of str,
                              otherwise TypeError is raised.  Returns True or False, or NULL
                              with an exception set. */
   12753             :     PyObject *subobj;
   12754             :     PyObject *substring;
   12755     8892020 :     Py_ssize_t start = 0;
   12756     8892020 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   12757             :     int result;
   12758             : 
   12759     8892020 :     if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end))
   12760           3 :         return NULL;
   12761     8892010 :     if (PyTuple_Check(subobj)) {
                               /* Tuple items are tried in order and the first match wins, so
                                  items located after a matching one are never type-checked. */
   12762             :         Py_ssize_t i;
   12763      397020 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   12764      284494 :             substring = PyTuple_GET_ITEM(subobj, i);
   12765      284494 :             if (!PyUnicode_Check(substring)) {
   12766           2 :                 PyErr_Format(PyExc_TypeError,
   12767             :                              "tuple for startswith must only contain str, "
   12768             :                              "not %.100s",
   12769           2 :                              Py_TYPE(substring)->tp_name);
   12770           2 :                 return NULL;
   12771             :             }
                                   /* The last argument is the direction given to tailmatch();
                                      unicode_endswith passes +1 instead, so -1 presumably selects
                                      matching at the start of the range. */
   12772      284492 :             result = tailmatch(self, substring, start, end, -1);
                                   /* -1 from tailmatch() is propagated as an error */
   12773      284492 :             if (result == -1)
   12774           0 :                 return NULL;
   12775      284492 :             if (result) {
   12776        1823 :                 Py_RETURN_TRUE;
   12777             :             }
   12778             :         }
   12779             :         /* nothing matched */
   12780      112526 :         Py_RETURN_FALSE;
   12781             :     }
   12782     8777660 :     if (!PyUnicode_Check(subobj)) {
   12783          32 :         PyErr_Format(PyExc_TypeError,
   12784             :                      "startswith first arg must be str or "
   12785          32 :                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   12786          32 :         return NULL;
   12787             :     }
                           /* Single str prefix: one tailmatch() call decides the result. */
   12788     8777630 :     result = tailmatch(self, subobj, start, end, -1);
   12789     8777630 :     if (result == -1)
   12790           0 :         return NULL;
   12791     8777630 :     return PyBool_FromLong(result);
   12792             : }
   12793             : 
   12794             : 
   12795             : PyDoc_STRVAR(endswith__doc__,
   12796             :              "S.endswith(suffix[, start[, end]]) -> bool\n\
   12797             : \n\
   12798             : Return True if S ends with the specified suffix, False otherwise.\n\
   12799             : With optional start, test S beginning at that position.\n\
   12800             : With optional end, stop comparing S at that position.\n\
   12801             : suffix can also be a tuple of strings to try.");
   12802             : 
   12803             : static PyObject *
   12804     1823900 : unicode_endswith(PyObject *self,
   12805             :                  PyObject *args)
   12806             : {
                           /* Implements str.endswith().  args is unpacked into the suffix
                              object plus the optional start/end bounds (defaults: 0 and
                              PY_SSIZE_T_MAX).  The suffix must be a str or a tuple of str,
                              otherwise TypeError is raised.  Returns True or False, or NULL
                              with an exception set. */
   12807             :     PyObject *subobj;
   12808             :     PyObject *substring;
   12809     1823900 :     Py_ssize_t start = 0;
   12810     1823900 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   12811             :     int result;
   12812             : 
   12813     1823900 :     if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end))
   12814           3 :         return NULL;
   12815     1823900 :     if (PyTuple_Check(subobj)) {
                               /* Tuple items are tried in order and the first match wins, so
                                  items located after a matching one are never type-checked. */
   12816             :         Py_ssize_t i;
   12817      345120 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   12818      282552 :             substring = PyTuple_GET_ITEM(subobj, i);
   12819      282552 :             if (!PyUnicode_Check(substring)) {
   12820           2 :                 PyErr_Format(PyExc_TypeError,
   12821             :                              "tuple for endswith must only contain str, "
   12822             :                              "not %.100s",
   12823           2 :                              Py_TYPE(substring)->tp_name);
   12824           2 :                 return NULL;
   12825             :             }
                                   /* The last argument is the direction given to tailmatch();
                                      unicode_startswith passes -1 instead, so +1 presumably
                                      selects matching at the end of the range. */
   12826      282550 :             result = tailmatch(self, substring, start, end, +1);
                                   /* -1 from tailmatch() is propagated as an error */
   12827      282550 :             if (result == -1)
   12828           0 :                 return NULL;
   12829      282550 :             if (result) {
   12830      215041 :                 Py_RETURN_TRUE;
   12831             :             }
   12832             :         }
                               /* nothing matched */
   12833       62568 :         Py_RETURN_FALSE;
   12834             :     }
   12835     1546290 :     if (!PyUnicode_Check(subobj)) {
   12836           3 :         PyErr_Format(PyExc_TypeError,
   12837             :                      "endswith first arg must be str or "
   12838           3 :                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   12839           3 :         return NULL;
   12840             :     }
                           /* Single str suffix: one tailmatch() call decides the result. */
   12841     1546280 :     result = tailmatch(self, subobj, start, end, +1);
   12842     1546280 :     if (result == -1)
   12843           0 :         return NULL;
   12844     1546280 :     return PyBool_FromLong(result);
   12845             : }
   12846             : 
   12847             : static inline void
   12848    15320100 : _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
   12849             : {
                           /* Refresh the fields the writer caches about writer->buffer:
                              maxchar and data always, kind and size depending on the readonly
                              flag.  writer->buffer is read unconditionally, so it has to be
                              set before this is called. */
   12850    15320100 :     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
   12851    15320100 :     writer->data = PyUnicode_DATA(writer->buffer);
   12852             : 
   12853    15320100 :     if (!writer->readonly) {
                               /* Writable buffer: mirror its real kind and length. */
   12854    11844300 :         writer->kind = PyUnicode_KIND(writer->buffer);
   12855    11844300 :         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
   12856             :     }
   12857             :     else {
   12858             :         /* use a value smaller than PyUnicode_1BYTE_KIND() so
   12859             :            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   12860     3475840 :         writer->kind = 0;
   12861     3475840 :         assert(writer->kind <= PyUnicode_1BYTE_KIND);
   12862             : 
   12863             :         /* Copy-on-write mode: set buffer size to 0 so
   12864             :          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
   12865             :          * next write. */
   12866     3475840 :         writer->size = 0;
   12867             :     }
   12868    15320100 : }
   12869             : 
   12870             : void
   12871    13902200 : _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
   12872             : {
                           /* Reset writer to its initial state: every field is zeroed (so the
                              buffer pointer is NULL and pos is 0), then min_char is set to
                              127.  No buffer is allocated here. */
   12873    13902200 :     memset(writer, 0, sizeof(*writer));
   12874             : 
   12875             :     /* ASCII is the bare minimum */
   12876    13902200 :     writer->min_char = 127;
   12877             : 
   12878             :     /* use a value smaller than PyUnicode_1BYTE_KIND() so
   12879             :        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   12880    13902200 :     writer->kind = 0;
   12881    13902200 :     assert(writer->kind <= PyUnicode_1BYTE_KIND);
   12882    13902200 : }
   12883             : 
   12884             : // Initialize _PyUnicodeWriter with initial buffer
   12885             : static inline void
   12886      498876 : _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
   12887             : {
                           /* All fields are zeroed first, so readonly is 0 and
                              _PyUnicodeWriter_Update() takes the kind and size from buffer.
                              The buffer pointer is stored as is: its reference count is not
                              changed in this function.  min_char stays 0 here, unlike in
                              _PyUnicodeWriter_Init(). */
   12888      498876 :     memset(writer, 0, sizeof(*writer));
   12889      498876 :     writer->buffer = buffer;
   12890      498876 :     _PyUnicodeWriter_Update(writer);
                           /* the initial buffer length becomes the minimum allocation size */
   12891      498876 :     writer->min_length = writer->size;
   12892      498876 : }
   12893             : 
   12894             : int
   12895    11345400 : _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
   12896             :                                  Py_ssize_t length, Py_UCS4 maxchar)
   12897             : {
                           /* Make room for length more characters at writer->pos, with code
                              points up to maxchar.  Three situations are handled:
                                1. no buffer yet: allocate one;
                                2. buffer too short: resize it in place, or allocate a new one
                                   and copy when it must also be widened or is read-only;
                                3. buffer long enough but too narrow: allocate a wider one of
                                   the same size and copy.
                              The cached writer fields are refreshed before returning.
                              Returns 0 on success, -1 on failure. */
   12898             :     Py_ssize_t newlen;
   12899             :     PyObject *newbuffer;
   12900             : 
   12901    11345400 :     assert(maxchar <= MAX_UNICODE);
   12902             : 
   12903             :     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
   12904    11345400 :     assert((maxchar > writer->maxchar && length >= 0)
   12905             :            || length > 0);
   12906             : 
                           /* reject requests for which pos + length would overflow */
   12907    11345400 :     if (length > PY_SSIZE_T_MAX - writer->pos) {
   12908           0 :         PyErr_NoMemory();
   12909           0 :         return -1;
   12910             :     }
   12911    11345400 :     newlen = writer->pos + length;
   12912             : 
                           /* never allocate narrower than the configured minimum character */
   12913    11345400 :     maxchar = Py_MAX(maxchar, writer->min_char);
   12914             : 
   12915    11345400 :     if (writer->buffer == NULL) {
                               /* Case 1: first allocation. */
   12916    10378400 :         assert(!writer->readonly);
                               /* the second condition skips over-allocation when the enlarged
                                  length would overflow Py_ssize_t */
   12917    10378400 :         if (writer->overallocate
   12918     8884810 :             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   12919             :             /* overallocate to limit the number of realloc() */
   12920     8884810 :             newlen += newlen / OVERALLOCATE_FACTOR;
   12921             :         }
   12922    10378400 :         if (newlen < writer->min_length)
   12923     9446910 :             newlen = writer->min_length;
   12924             : 
   12925    10378400 :         writer->buffer = PyUnicode_New(newlen, maxchar);
   12926    10378400 :         if (writer->buffer == NULL)
   12927           2 :             return -1;
   12928             :     }
   12929      967008 :     else if (newlen > writer->size) {
                               /* Case 2: the existing buffer is too short. */
   12930      397049 :         if (writer->overallocate
   12931      296800 :             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   12932             :             /* overallocate to limit the number of realloc() */
   12933      296800 :             newlen += newlen / OVERALLOCATE_FACTOR;
   12934             :         }
   12935      397049 :         if (newlen < writer->min_length)
   12936           0 :             newlen = writer->min_length;
   12937             : 
   12938      397049 :         if (maxchar > writer->maxchar || writer->readonly) {
   12939             :             /* resize + widen */
                                   /* A read-only buffer is never resized in place: a new string
                                      is allocated, the first pos characters are copied into it,
                                      and the reference to the old buffer is dropped. */
   12940         717 :             maxchar = Py_MAX(maxchar, writer->maxchar);
   12941         717 :             newbuffer = PyUnicode_New(newlen, maxchar);
   12942         717 :             if (newbuffer == NULL)
   12943           0 :                 return -1;
   12944         717 :             _PyUnicode_FastCopyCharacters(newbuffer, 0,
   12945             :                                           writer->buffer, 0, writer->pos);
   12946         717 :             Py_DECREF(writer->buffer);
   12947         717 :             writer->readonly = 0;
   12948             :         }
   12949             :         else {
                                   /* same width and writable: resize the existing string;
                                      writer->buffer is not reassigned if this fails */
   12950      396332 :             newbuffer = resize_compact(writer->buffer, newlen);
   12951      396332 :             if (newbuffer == NULL)
   12952           0 :                 return -1;
   12953             :         }
   12954      397049 :         writer->buffer = newbuffer;
   12955             :     }
   12956      569959 :     else if (maxchar > writer->maxchar) {
                               /* Case 3: enough room, but the buffer cannot store maxchar. */
   12957      569959 :         assert(!writer->readonly);
   12958      569959 :         newbuffer = PyUnicode_New(writer->size, maxchar);
   12959      569959 :         if (newbuffer == NULL)
   12960           0 :             return -1;
   12961      569959 :         _PyUnicode_FastCopyCharacters(newbuffer, 0,
   12962             :                                       writer->buffer, 0, writer->pos);
   12963      569959 :         Py_SETREF(writer->buffer, newbuffer);
   12964             :     }
                           /* re-read maxchar, data, kind and size from the current buffer */
   12965    11345400 :     _PyUnicodeWriter_Update(writer);
   12966    11345400 :     return 0;
   12967             : 
                       /* NOTE(review): no matching #define of OVERALLOCATE_FACTOR appears inside
                          this function; presumably it is defined earlier in the file - confirm
                          that this #undef is still wanted. */
   12968             : #undef OVERALLOCATE_FACTOR
   12969             : }
   12970             : 
   12971             : int
   12972        8080 : _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
   12973             :                                      int kind)
   12974             : {
                           /* Widen the writer so that it can store characters of the given
                              kind: the kind is translated to the largest code point it can
                              hold and the work is delegated to
                              _PyUnicodeWriter_PrepareInternal() with a length of 0.
                              Returns that call's result (0 on success, -1 on failure). */
   12975             :     Py_UCS4 maxchar;
   12976             : 
   12977             :     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
   12978        8080 :     assert(writer->kind < kind);
   12979             : 
   12980        8080 :     switch (kind)
   12981             :     {
   12982           0 :     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
   12983        8080 :     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
   12984           0 :     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
   12985           0 :     default:
   12986           0 :         Py_UNREACHABLE();
   12987             :     }
   12988             : 
   12989        8080 :     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
   12990             : }
   12991             : 
   12992             : static inline int
   12993     2588770 : _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
   12994             : {
                           /* Append the single character ch at writer->pos and advance pos
                              by one.  Returns 0 on success, or -1 if the buffer could not be
                              prepared. */
   12995     2588770 :     assert(ch <= MAX_UNICODE);
   12996     2588770 :     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
   12997           0 :         return -1;
   12998     2588770 :     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
   12999     2588770 :     writer->pos++;
   13000     2588770 :     return 0;
   13001             : }
   13002             : 
   13003             : int
   13004     1617060 : _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
   13005             : {
                           /* Non-inline entry point: forwards to
                              _PyUnicodeWriter_WriteCharInline() and returns its result. */
   13006     1617060 :     return _PyUnicodeWriter_WriteCharInline(writer, ch);
   13007             : }
   13008             : 
   13009             : int
   13010    18573300 : _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
   13011             : {
                           /* Append the whole string str to the writer and advance pos by
                              its length.  An empty str is a no-op.  Returns 0 on success,
                              -1 on failure. */
   13012             :     Py_UCS4 maxchar;
   13013             :     Py_ssize_t len;
   13014             : 
   13015    18573300 :     len = PyUnicode_GET_LENGTH(str);
   13016    18573300 :     if (len == 0)
   13017       73724 :         return 0;
   13018    18499600 :     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
                           /* the buffer needs attention when str is wider than it or does
                              not fit in the space left after pos */
   13019    18499600 :     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
   13020      927922 :         if (writer->buffer == NULL && !writer->overallocate) {
                                   /* Nothing written yet and no over-allocation requested:
                                      instead of copying, keep a new reference to str itself as
                                      a read-only buffer. */
   13021      123103 :             assert(_PyUnicode_CheckConsistency(str, 1));
   13022      123103 :             writer->readonly = 1;
   13023      123103 :             Py_INCREF(str);
   13024      123103 :             writer->buffer = str;
   13025      123103 :             _PyUnicodeWriter_Update(writer);
   13026      123103 :             writer->pos += len;
   13027      123103 :             return 0;
   13028             :         }
   13029      804819 :         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
   13030           0 :             return -1;
   13031             :     }
   13032    18376400 :     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13033             :                                   str, 0, len);
   13034    18376400 :     writer->pos += len;
   13035    18376400 :     return 0;
   13036             : }
   13037             : 
   13038             : int
   13039     2271820 : _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
   13040             :                                 Py_ssize_t start, Py_ssize_t end)
   13041             : {
                           /* Append the characters str[start:end] to the writer and advance
                              pos accordingly.  The caller must pass
                              0 <= start <= end <= length of str (checked by assertions only).
                              Returns 0 on success, -1 on failure. */
   13042             :     Py_UCS4 maxchar;
   13043             :     Py_ssize_t len;
   13044             : 
   13045     2271820 :     assert(0 <= start);
   13046     2271820 :     assert(end <= PyUnicode_GET_LENGTH(str));
   13047     2271820 :     assert(start <= end);
   13048             : 
                           /* given the assertions above, end == 0 implies an empty range */
   13049     2271820 :     if (end == 0)
   13050           0 :         return 0;
   13051             : 
                           /* the range covers the whole string: use the whole-string writer */
   13052     2271820 :     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
   13053         966 :         return _PyUnicodeWriter_WriteStr(writer, str);
   13054             : 
                           /* Scan the range for its real maximum character only when str as
                              a whole may be wider than the writer; otherwise the current
                              maxchar of the writer is already sufficient. */
   13055     2270850 :     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
   13056      797059 :         maxchar = _PyUnicode_FindMaxChar(str, start, end);
   13057             :     else
   13058     1473790 :         maxchar = writer->maxchar;
   13059     2270850 :     len = end - start;
   13060             : 
   13061     2270850 :     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
   13062           0 :         return -1;
   13063             : 
   13064     2270850 :     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13065             :                                   str, start, len);
   13066     2270850 :     writer->pos += len;
   13067     2270850 :     return 0;
   13068             : }
   13069             : 
   13070             : int
   13071    27761200 : _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
   13072             :                                   const char *ascii, Py_ssize_t len)
   13073             : {
                           /* Append len bytes of the ASCII string ascii to the writer.  A
                              len of -1 means that ascii is NUL-terminated and its length is
                              taken with strlen().  All bytes must be below 128; this is
                              checked by an assertion only.  Returns 0 on success, -1 on
                              failure. */
   13074    27761200 :     if (len == -1)
   13075        7883 :         len = strlen(ascii);
   13076             : 
   13077    27761200 :     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
   13078             : 
   13079    27761200 :     if (writer->buffer == NULL && !writer->overallocate) {
                               /* Nothing written yet and no over-allocation requested: build
                                  the string object directly and install it as a read-only
                                  buffer. */
   13080             :         PyObject *str;
   13081             : 
   13082     3352740 :         str = _PyUnicode_FromASCII(ascii, len);
   13083     3352740 :         if (str == NULL)
   13084           0 :             return -1;
   13085             : 
   13086     3352740 :         writer->readonly = 1;
   13087     3352740 :         writer->buffer = str;
   13088     3352740 :         _PyUnicodeWriter_Update(writer);
   13089     3352740 :         writer->pos += len;
   13090     3352740 :         return 0;
   13091             :     }
   13092             : 
   13093    24408500 :     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
   13094           2 :         return -1;
   13095             : 
                           /* copy the bytes, widening each one to the character size of the
                              buffer when that is larger than one byte */
   13096    24408500 :     switch (writer->kind)
   13097             :     {
   13098    24407500 :     case PyUnicode_1BYTE_KIND:
   13099             :     {
   13100    24407500 :         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
   13101    24407500 :         Py_UCS1 *data = writer->data;
   13102             : 
   13103    24407500 :         memcpy(data + writer->pos, str, len);
   13104    24407500 :         break;
   13105             :     }
   13106         818 :     case PyUnicode_2BYTE_KIND:
   13107             :     {
   13108        2454 :         _PyUnicode_CONVERT_BYTES(
   13109             :             Py_UCS1, Py_UCS2,
   13110             :             ascii, ascii + len,
   13111             :             (Py_UCS2 *)writer->data + writer->pos);
   13112         818 :         break;
   13113             :     }
   13114         208 :     case PyUnicode_4BYTE_KIND:
   13115             :     {
   13116         598 :         _PyUnicode_CONVERT_BYTES(
   13117             :             Py_UCS1, Py_UCS4,
   13118             :             ascii, ascii + len,
   13119             :             (Py_UCS4 *)writer->data + writer->pos);
   13120         208 :         break;
   13121             :     }
   13122           0 :     default:
   13123           0 :         Py_UNREACHABLE();
   13124             :     }
   13125             : 
   13126    24408500 :     writer->pos += len;
   13127    24408500 :     return 0;
   13128             : }
   13129             : 
   13130             : int
   13131           3 : _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
   13132             :                                    const char *str, Py_ssize_t len)
   13133             : {
                           /* Append len bytes of str to the writer, each byte being taken
                              as one character.  The largest byte value found decides how
                              wide the buffer has to be.  Unlike
                              _PyUnicodeWriter_WriteASCIIString(), no special handling of
                              len == -1 exists here, so len must be the real length.
                              Returns 0 on success, -1 on failure. */
   13134             :     Py_UCS4 maxchar;
   13135             : 
   13136           3 :     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
   13137           3 :     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
   13138           0 :         return -1;
   13139           3 :     unicode_write_cstr(writer->buffer, writer->pos, str, len);
   13140           3 :     writer->pos += len;
   13141           3 :     return 0;
   13142             : }
   13143             : 
   13144             : PyObject *
   13145    14357000 : _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
   13146             : {
                           /* Return the string built by the writer, or NULL on failure.  On
                              every path the writer lets go of its buffer: writer->buffer is
                              either cleared or set to NULL. */
   13147             :     PyObject *str;
   13148             : 
                           /* nothing was written: drop any buffer and return the empty string */
   13149    14357000 :     if (writer->pos == 0) {
   13150       58302 :         Py_CLEAR(writer->buffer);
   13151       58302 :         _Py_RETURN_UNICODE_EMPTY();
   13152             :     }
   13153             : 
                           /* take the buffer over from the writer */
   13154    14298700 :     str = writer->buffer;
   13155    14298700 :     writer->buffer = NULL;
   13156             : 
                           /* a read-only buffer is returned unchanged; its length is expected
                              to equal pos already */
   13157    14298700 :     if (writer->readonly) {
   13158     3475650 :         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
   13159     3475650 :         return str;
   13160             :     }
   13161             : 
                           /* trim the unused tail of the buffer, if there is one */
   13162    10823000 :     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
   13163             :         PyObject *str2;
   13164     9986210 :         str2 = resize_compact(str, writer->pos);
   13165     9986210 :         if (str2 == NULL) {
   13166           0 :             Py_DECREF(str);
   13167           0 :             return NULL;
   13168             :         }
   13169     9986210 :         str = str2;
   13170             :     }
   13171             : 
   13172    10823000 :     assert(_PyUnicode_CheckConsistency(str, 1));
   13173    10823000 :     return unicode_result(str);
   13174             : }
   13175             : 
   13176             : void
   13177       10273 : _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
   13178             : {
                           /* Release the buffer held by the writer, if any, and reset the
                              pointer to NULL.  Used in this file on error paths, where
                              _PyUnicodeWriter_Finish() is not called. */
   13179       10273 :     Py_CLEAR(writer->buffer);
   13180       10273 : }
   13181             : 
   13182             : #include "stringlib/unicode_format.h"
   13183             : 
   13184             : PyDoc_STRVAR(format__doc__,
   13185             :              "S.format(*args, **kwargs) -> str\n\
   13186             : \n\
   13187             : Return a formatted version of S, using substitutions from args and kwargs.\n\
   13188             : The substitutions are identified by braces ('{' and '}').");
   13189             : 
                       /* format__doc__ (above) and format_map__doc__ (below) are the docstrings
                          attached to the format and format_map entries of the unicode_methods
                          table. */
   13190             : PyDoc_STRVAR(format_map__doc__,
   13191             :              "S.format_map(mapping) -> str\n\
   13192             : \n\
   13193             : Return a formatted version of S, using substitutions from mapping.\n\
   13194             : The substitutions are identified by braces ('{' and '}').");
   13195             : 
   13196             : /*[clinic input]
   13197             : str.__format__ as unicode___format__
   13198             : 
   13199             :     format_spec: unicode
   13200             :     /
   13201             : 
   13202             : Return a formatted version of the string as described by format_spec.
   13203             : [clinic start generated code]*/
   13204             : 
   13205             : static PyObject *
   13206        7931 : unicode___format___impl(PyObject *self, PyObject *format_spec)
   13207             : /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
   13208             : {
                           /* Format self according to format_spec.  The complete spec (index
                              0 up to its length) is handed to
                              _PyUnicode_FormatAdvancedWriter(), which writes into a fresh
                              _PyUnicodeWriter.  Returns the finished string, or NULL when
                              formatting fails. */
   13209             :     _PyUnicodeWriter writer;
   13210             :     int ret;
   13211             : 
   13212        7931 :     _PyUnicodeWriter_Init(&writer);
   13213        7931 :     ret = _PyUnicode_FormatAdvancedWriter(&writer,
   13214             :                                           self, format_spec, 0,
   13215             :                                           PyUnicode_GET_LENGTH(format_spec));
   13216        7931 :     if (ret == -1) {
                               /* release whatever the writer had buffered before failing */
   13217           6 :         _PyUnicodeWriter_Dealloc(&writer);
   13218           6 :         return NULL;
   13219             :     }
   13220        7925 :     return _PyUnicodeWriter_Finish(&writer);
   13221             : }
   13222             : 
   13223             : /*[clinic input]
   13224             : str.__sizeof__ as unicode_sizeof
   13225             : 
   13226             : Return the size of the string in memory, in bytes.
   13227             : [clinic start generated code]*/
   13228             : 
   13229             : static PyObject *
   13230          12 : unicode_sizeof_impl(PyObject *self)
   13231             : /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
   13232             : {
                           /* Compute the size from the layout of the object: the size of the
                              header structure that applies, plus (length + 1) characters of
                              PyUnicode_KIND(self) bytes each (one byte each for compact
                              ASCII), plus the separate UTF-8 buffer when the object owns
                              one.  The + 1 presumably accounts for a terminating NUL
                              character - confirm.  Returns the total as an int object. */
   13233             :     Py_ssize_t size;
   13234             : 
   13235             :     /* If it's a compact object, account for base structure +
   13236             :        character data. */
   13237          12 :     if (PyUnicode_IS_COMPACT_ASCII(self)) {
   13238           2 :         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
   13239             :     }
   13240          10 :     else if (PyUnicode_IS_COMPACT(self)) {
   13241          10 :         size = sizeof(PyCompactUnicodeObject) +
   13242          10 :             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
   13243             :     }
   13244             :     else {
   13245             :         /* If it is a two-block object, account for base object, and
   13246             :            for character block if present. */
   13247           0 :         size = sizeof(PyUnicodeObject);
   13248           0 :         if (_PyUnicode_DATA_ANY(self))
   13249           0 :             size += (PyUnicode_GET_LENGTH(self) + 1) *
   13250           0 :                 PyUnicode_KIND(self);
   13251             :     }
                           /* a separately stored UTF-8 representation is counted as well */
   13252          12 :     if (_PyUnicode_HAS_UTF8_MEMORY(self))
   13253           1 :         size += PyUnicode_UTF8_LENGTH(self) + 1;
   13254             : 
   13255          12 :     return PyLong_FromSsize_t(size);
   13256             : }
   13257             : 
   13258             : static PyObject *
   13259         121 : unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
   13260             : {
                           /* Implements __getnewargs__: returns a one-element tuple holding
                              a copy of v made with _PyUnicode_Copy(), or NULL when the copy
                              fails. */
   13261         121 :     PyObject *copy = _PyUnicode_Copy(v);
   13262         121 :     if (!copy)
   13263           0 :         return NULL;
                           /* the N format unit hands the reference to copy over to the new
                              tuple, so no DECREF is needed here */
   13264         121 :     return Py_BuildValue("(N)", copy);
   13265             : }
   13266             : 
   13267             : static PyMethodDef unicode_methods[] = {
                           /* Method table of the str type.  The UNICODE_*_METHODDEF names
                              are macros defined outside this table (presumably generated by
                              Argument Clinic - confirm); the remaining entries are written
                              out by hand.  The table ends with a {NULL, NULL} sentinel. */
   13268             :     UNICODE_ENCODE_METHODDEF
   13269             :     UNICODE_REPLACE_METHODDEF
   13270             :     UNICODE_SPLIT_METHODDEF
   13271             :     UNICODE_RSPLIT_METHODDEF
   13272             :     UNICODE_JOIN_METHODDEF
   13273             :     UNICODE_CAPITALIZE_METHODDEF
   13274             :     UNICODE_CASEFOLD_METHODDEF
   13275             :     UNICODE_TITLE_METHODDEF
   13276             :     UNICODE_CENTER_METHODDEF
   13277             :     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   13278             :     UNICODE_EXPANDTABS_METHODDEF
   13279             :     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   13280             :     UNICODE_PARTITION_METHODDEF
   13281             :     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   13282             :     UNICODE_LJUST_METHODDEF
   13283             :     UNICODE_LOWER_METHODDEF
   13284             :     UNICODE_LSTRIP_METHODDEF
   13285             :     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   13286             :     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   13287             :     UNICODE_RJUST_METHODDEF
   13288             :     UNICODE_RSTRIP_METHODDEF
   13289             :     UNICODE_RPARTITION_METHODDEF
   13290             :     UNICODE_SPLITLINES_METHODDEF
   13291             :     UNICODE_STRIP_METHODDEF
   13292             :     UNICODE_SWAPCASE_METHODDEF
   13293             :     UNICODE_TRANSLATE_METHODDEF
   13294             :     UNICODE_UPPER_METHODDEF
   13295             :     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   13296             :     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   13297             :     UNICODE_REMOVEPREFIX_METHODDEF
   13298             :     UNICODE_REMOVESUFFIX_METHODDEF
   13299             :     UNICODE_ISASCII_METHODDEF
   13300             :     UNICODE_ISLOWER_METHODDEF
   13301             :     UNICODE_ISUPPER_METHODDEF
   13302             :     UNICODE_ISTITLE_METHODDEF
   13303             :     UNICODE_ISSPACE_METHODDEF
   13304             :     UNICODE_ISDECIMAL_METHODDEF
   13305             :     UNICODE_ISDIGIT_METHODDEF
   13306             :     UNICODE_ISNUMERIC_METHODDEF
   13307             :     UNICODE_ISALPHA_METHODDEF
   13308             :     UNICODE_ISALNUM_METHODDEF
   13309             :     UNICODE_ISIDENTIFIER_METHODDEF
   13310             :     UNICODE_ISPRINTABLE_METHODDEF
   13311             :     UNICODE_ZFILL_METHODDEF
   13312             :     {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
   13313             :     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
   13314             :     UNICODE___FORMAT___METHODDEF
   13315             :     UNICODE_MAKETRANS_METHODDEF
   13316             :     UNICODE_SIZEOF_METHODDEF
                           /* __getnewargs__ is registered without a docstring */
   13317             :     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
   13318             :     {NULL, NULL}
   13319             : };
   13320             : 
   13321             : static PyObject *
   13322     1017680 : unicode_mod(PyObject *v, PyObject *w)
   13323             : {
                           /* Handler installed in the nb_remainder slot of
                              unicode_as_number.  Returns NotImplemented when the left
                              operand v is not a str; otherwise returns the result of
                              PyUnicode_Format(v, w). */
   13324     1017680 :     if (!PyUnicode_Check(v))
   13325           0 :         Py_RETURN_NOTIMPLEMENTED;
   13326     1017680 :     return PyUnicode_Format(v, w);
   13327             : }
   13328             : 
   13329             : static PyNumberMethods unicode_as_number = {
                           /* Number slots of str: only nb_remainder is populated.  The
                              initializer list stops after it, so all later members are
                              zero-initialized by the C rules for aggregates. */
   13330             :     0,              /*nb_add*/
   13331             :     0,              /*nb_subtract*/
   13332             :     0,              /*nb_multiply*/
   13333             :     unicode_mod,            /*nb_remainder*/
   13334             : };
   13335             : 
   13336             : static PySequenceMethods unicode_as_sequence = {
                           /* Sequence slots of str.  Length, concatenation, repetition,
                              item access and containment are provided; the slice and
                              item-assignment slots are left at 0. */
   13337             :     (lenfunc) unicode_length,       /* sq_length */
   13338             :     PyUnicode_Concat,           /* sq_concat */
   13339             :     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   13340             :     (ssizeargfunc) unicode_getitem,     /* sq_item */
   13341             :     0,                  /* sq_slice */
   13342             :     0,                  /* sq_ass_item */
   13343             :     0,                  /* sq_ass_slice */
   13344             :     PyUnicode_Contains,         /* sq_contains */
   13345             : };
   13346             : 
   13347             : static PyObject*
   13348    80396700 : unicode_subscript(PyObject* self, PyObject* item)
   13349             : {
   13350    80396700 :     if (_PyIndex_Check(item)) {
   13351    55387200 :         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   13352    55387200 :         if (i == -1 && PyErr_Occurred())
   13353           2 :             return NULL;
   13354    55387200 :         if (i < 0)
   13355      996054 :             i += PyUnicode_GET_LENGTH(self);
   13356    55387200 :         return unicode_getitem(self, i);
   13357    25009500 :     } else if (PySlice_Check(item)) {
   13358             :         Py_ssize_t start, stop, step, slicelength, i;
   13359             :         size_t cur;
   13360             :         PyObject *result;
   13361             :         const void *src_data;
   13362             :         void *dest_data;
   13363             :         int src_kind, dest_kind;
   13364             :         Py_UCS4 ch, max_char, kind_limit;
   13365             : 
   13366    25009500 :         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
   13367           2 :             return NULL;
   13368             :         }
   13369    25009500 :         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
   13370             :                                             &start, &stop, step);
   13371             : 
   13372    25009500 :         if (slicelength <= 0) {
   13373      252922 :             _Py_RETURN_UNICODE_EMPTY();
   13374    39725600 :         } else if (start == 0 && step == 1 &&
   13375    14969000 :                    slicelength == PyUnicode_GET_LENGTH(self)) {
   13376     8429650 :             return unicode_result_unchanged(self);
   13377    16326900 :         } else if (step == 1) {
   13378    16325400 :             return PyUnicode_Substring(self,
   13379             :                                        start, start + slicelength);
   13380             :         }
   13381             :         /* General case */
   13382        1493 :         src_kind = PyUnicode_KIND(self);
   13383        1493 :         src_data = PyUnicode_DATA(self);
   13384        1493 :         if (!PyUnicode_IS_ASCII(self)) {
   13385           9 :             kind_limit = kind_maxchar_limit(src_kind);
   13386           9 :             max_char = 0;
   13387           9 :             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13388           9 :                 ch = PyUnicode_READ(src_kind, src_data, cur);
   13389           9 :                 if (ch > max_char) {
   13390           9 :                     max_char = ch;
   13391           9 :                     if (max_char >= kind_limit)
   13392           9 :                         break;
   13393             :                 }
   13394             :             }
   13395             :         }
   13396             :         else
   13397        1484 :             max_char = 127;
   13398        1493 :         result = PyUnicode_New(slicelength, max_char);
   13399        1493 :         if (result == NULL)
   13400           0 :             return NULL;
   13401        1493 :         dest_kind = PyUnicode_KIND(result);
   13402        1493 :         dest_data = PyUnicode_DATA(result);
   13403             : 
   13404       11303 :         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13405        9810 :             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
   13406        9810 :             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
   13407             :         }
   13408        1493 :         assert(_PyUnicode_CheckConsistency(result, 1));
   13409        1493 :         return result;
   13410             :     } else {
   13411          12 :         PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
   13412          12 :                      Py_TYPE(item)->tp_name);
   13413          12 :         return NULL;
   13414             :     }
   13415             : }
   13416             : 
   13417             : static PyMappingMethods unicode_as_mapping = {
   13418             :     (lenfunc)unicode_length,        /* mp_length */
   13419             :     (binaryfunc)unicode_subscript,  /* mp_subscript */
   13420             :     (objobjargproc)0,           /* mp_ass_subscript */
   13421             : };
   13422             : 
   13423             : 
   13424             : /* Helpers for PyUnicode_Format() */
   13425             : 
/* State shared by the helpers that implement one PyUnicode_Format() call. */
struct unicode_formatter_t {
    /* Source of argument values.  Handed out as-is when arglen < 0,
       otherwise indexed with PyTuple_GetItem().  A "%(name)" key replaces
       it with the value looked up in dict. */
    PyObject *args;
    /* Non-zero when args holds a reference this struct must release
       (set after a successful "%(name)" lookup). */
    int args_owned;
    /* arglen: number of available arguments; negative means args is a
       single value rather than a tuple.
       argidx: index of the next argument to hand out.  A "%(name)" lookup
       sets arglen = -1 and argidx = -2 so that exactly one fetch succeeds. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)" keys; NULL makes such keys a TypeError. */
    PyObject *dict;

    /* Kind of the format string data, as passed to PyUnicode_READ(). */
    int fmtkind;
    /* fmtcnt: format characters not yet consumed; decremented before every
       read and negative once the parser has run off the end.
       fmtpos: index of the next format character to read. */
    Py_ssize_t fmtcnt, fmtpos;
    /* Raw character data of the format string (read with fmtkind/fmtpos). */
    const void *fmtdata;
    /* The format string object; "%(name)" keys are sliced out of it. */
    PyObject *fmtstr;

    /* Accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
   13439             : 
/* Parsed description of a single "%..." conversion specifier. */
struct unicode_format_arg_t {
    /* Most recently read format character; once parsing has finished this
       is the conversion character (e.g. 's', 'd', 'x'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width.  Consumers compare it with -1 to detect "not given";
       NOTE(review): the initial value is assigned by the caller, outside
       this part of the file -- confirm. */
    Py_ssize_t width;
    /* Precision.  A negative value is treated as "not given"
       (formatfloat() then defaults to 6). */
    int prec;
    /* NOTE(review): not referenced by the helpers visible here; presumably
       used by the output stage -- confirm. */
    int sign;
};
   13447             : 
   13448             : static PyObject *
   13449     1462430 : unicode_format_getnextarg(struct unicode_formatter_t *ctx)
   13450             : {
   13451     1462430 :     Py_ssize_t argidx = ctx->argidx;
   13452             : 
   13453     1462430 :     if (argidx < ctx->arglen) {
   13454     1462410 :         ctx->argidx++;
   13455     1462410 :         if (ctx->arglen < 0)
   13456      817933 :             return ctx->args;
   13457             :         else
   13458      644479 :             return PyTuple_GetItem(ctx->args, argidx);
   13459             :     }
   13460          20 :     PyErr_SetString(PyExc_TypeError,
   13461             :                     "not enough arguments for format string");
   13462          20 :     return NULL;
   13463             : }
   13464             : 
   13465             : /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   13466             : 
   13467             : /* Format a float into the writer if the writer is not NULL, or into *p_output
   13468             :    otherwise.
   13469             : 
   13470             :    Return 0 on success, raise an exception and return -1 on error. */
   13471             : static int
   13472       44446 : formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
   13473             :             PyObject **p_output,
   13474             :             _PyUnicodeWriter *writer)
   13475             : {
   13476             :     char *p;
   13477             :     double x;
   13478             :     Py_ssize_t len;
   13479             :     int prec;
   13480       44446 :     int dtoa_flags = 0;
   13481             : 
   13482       44446 :     x = PyFloat_AsDouble(v);
   13483       44446 :     if (x == -1.0 && PyErr_Occurred())
   13484           2 :         return -1;
   13485             : 
   13486       44444 :     prec = arg->prec;
   13487       44444 :     if (prec < 0)
   13488        8976 :         prec = 6;
   13489             : 
   13490       44444 :     if (arg->flags & F_ALT)
   13491       10974 :         dtoa_flags |= Py_DTSF_ALT;
   13492       44444 :     if (arg->flags & F_NO_NEG_0)
   13493           0 :         dtoa_flags |= Py_DTSF_NO_NEG_0;
   13494       44444 :     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
   13495       44444 :     if (p == NULL)
   13496           0 :         return -1;
   13497       44444 :     len = strlen(p);
   13498       44444 :     if (writer) {
   13499        4700 :         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
   13500           0 :             PyMem_Free(p);
   13501           0 :             return -1;
   13502             :         }
   13503             :     }
   13504             :     else
   13505       39744 :         *p_output = _PyUnicode_FromASCII(p, len);
   13506       44444 :     PyMem_Free(p);
   13507       44444 :     return 0;
   13508             : }
   13509             : 
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the "alternate form" option, for Python ints.  In this file it is
 * called by mainformatlong() whenever its writer fast path does not apply.
 * Return value:  a new str object, or NULL (with an exception set) on error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is present only for o, x and X conversions with
 *         alt non-zero.  The case of hex digits (and of the "0X" prefix)
 *         follows the conversion type.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must satisfy PyLong_Check()
 * alt          non-zero to keep the base prefix (the F_ALT flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the result of o, x and X conversions can contain a '-' sign,
 * because Python ints are unbounded and are not formatted as two's
 * complement.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed in int below and numnondigits is at
       most 3 (sign plus a two-character base prefix), so reject any
       precision that could make that sum exceed INT_MAX. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the digits, including any base prefix, with
       PyNumber_ToBase(); numnondigits starts as the prefix length. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* All length arithmetic below is done in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt (F_ALT) was requested.  The two
       prefix characters are skipped by advancing buf; a leading '-' is
       rewritten over the second prefix character so that it directly
       precedes the digits again. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits
       (prec).  The padded text is assembled in a temporary bytes object:
       sign/prefix first, then the zero fill, then the digits. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on result/buf refer to the bytes object. */
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X).  The range
           'a'..'x' covers both the hex digits and the prefix letter. */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or no longer
       starts at the beginning of the str's own data (prefix stripped),
       build a fresh str from buf.  Otherwise the str was edited in place
       and only needs resizing if its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
   13654             : 
   13655             : /* Format an integer or a float as an integer.
   13656             :  * Return 1 if the number has been formatted into the writer,
   13657             :  *        0 if the number has been formatted into *p_output
   13658             :  *       -1 and raise an exception on error */
   13659             : static int
   13660      781600 : mainformatlong(PyObject *v,
   13661             :                struct unicode_format_arg_t *arg,
   13662             :                PyObject **p_output,
   13663             :                _PyUnicodeWriter *writer)
   13664             : {
   13665             :     PyObject *iobj, *res;
   13666      781600 :     char type = (char)arg->ch;
   13667             : 
   13668      781600 :     if (!PyNumber_Check(v))
   13669         112 :         goto wrongtype;
   13670             : 
   13671             :     /* make sure number is a type of integer for o, x, and X */
   13672      781488 :     if (!PyLong_Check(v)) {
   13673        6519 :         if (type == 'o' || type == 'x' || type == 'X') {
   13674           9 :             iobj = _PyNumber_Index(v);
   13675             :         }
   13676             :         else {
   13677        6510 :             iobj = PyNumber_Long(v);
   13678             :         }
   13679        6519 :         if (iobj == NULL ) {
   13680           8 :             if (PyErr_ExceptionMatches(PyExc_TypeError))
   13681           8 :                 goto wrongtype;
   13682           0 :             return -1;
   13683             :         }
   13684        6511 :         assert(PyLong_Check(iobj));
   13685             :     }
   13686             :     else {
   13687      774969 :         iobj = v;
   13688      774969 :         Py_INCREF(iobj);
   13689             :     }
   13690             : 
   13691      781480 :     if (PyLong_CheckExact(v)
   13692      774794 :         && arg->width == -1 && arg->prec == -1
   13693      415610 :         && !(arg->flags & (F_SIGN | F_BLANK))
   13694      402124 :         && type != 'X')
   13695             :     {
   13696             :         /* Fast path */
   13697      402037 :         int alternate = arg->flags & F_ALT;
   13698             :         int base;
   13699             : 
   13700      402037 :         switch(type)
   13701             :         {
   13702           0 :             default:
   13703           0 :                 Py_UNREACHABLE();
   13704      370375 :             case 'd':
   13705             :             case 'i':
   13706             :             case 'u':
   13707      370375 :                 base = 10;
   13708      370375 :                 break;
   13709         628 :             case 'o':
   13710         628 :                 base = 8;
   13711         628 :                 break;
   13712       31034 :             case 'x':
   13713             :             case 'X':
   13714       31034 :                 base = 16;
   13715       31034 :                 break;
   13716             :         }
   13717             : 
   13718      402037 :         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
   13719           0 :             Py_DECREF(iobj);
   13720           0 :             return -1;
   13721             :         }
   13722      402037 :         Py_DECREF(iobj);
   13723      402037 :         return 1;
   13724             :     }
   13725             : 
   13726      379443 :     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
   13727      379443 :     Py_DECREF(iobj);
   13728      379443 :     if (res == NULL)
   13729           0 :         return -1;
   13730      379443 :     *p_output = res;
   13731      379443 :     return 0;
   13732             : 
   13733         120 : wrongtype:
   13734         120 :     switch(type)
   13735             :     {
   13736           8 :         case 'o':
   13737             :         case 'x':
   13738             :         case 'X':
   13739           8 :             PyErr_Format(PyExc_TypeError,
   13740             :                     "%%%c format: an integer is required, "
   13741             :                     "not %.200s",
   13742           8 :                     type, Py_TYPE(v)->tp_name);
   13743           8 :             break;
   13744         112 :         default:
   13745         112 :             PyErr_Format(PyExc_TypeError,
   13746             :                     "%%%c format: a real number is required, "
   13747             :                     "not %.200s",
   13748         112 :                     type, Py_TYPE(v)->tp_name);
   13749         112 :             break;
   13750             :     }
   13751         120 :     return -1;
   13752             : }
   13753             : 
   13754             : static Py_UCS4
   13755        2255 : formatchar(PyObject *v)
   13756             : {
   13757             :     /* presume that the buffer is at least 3 characters long */
   13758        2255 :     if (PyUnicode_Check(v)) {
   13759        1521 :         if (PyUnicode_GET_LENGTH(v) == 1) {
   13760        1519 :             return PyUnicode_READ_CHAR(v, 0);
   13761             :         }
   13762           2 :         goto onError;
   13763             :     }
   13764             :     else {
   13765             :         int overflow;
   13766         734 :         long x = PyLong_AsLongAndOverflow(v, &overflow);
   13767         734 :         if (x == -1 && PyErr_Occurred()) {
   13768           5 :             if (PyErr_ExceptionMatches(PyExc_TypeError)) {
   13769           5 :                 goto onError;
   13770             :             }
   13771         729 :             return (Py_UCS4) -1;
   13772             :         }
   13773             : 
   13774         729 :         if (x < 0 || x > MAX_UNICODE) {
   13775             :             /* this includes an overflow in converting to C long */
   13776           7 :             PyErr_SetString(PyExc_OverflowError,
   13777             :                             "%c arg not in range(0x110000)");
   13778           7 :             return (Py_UCS4) -1;
   13779             :         }
   13780             : 
   13781         722 :         return (Py_UCS4) x;
   13782             :     }
   13783             : 
   13784           7 :   onError:
   13785           7 :     PyErr_SetString(PyExc_TypeError,
   13786             :                     "%c requires int or char");
   13787           7 :     return (Py_UCS4) -1;
   13788             : }
   13789             : 
   13790             : /* Parse options of an argument: flags, width, precision.
   13791             :    Handle also "%(name)" syntax.
   13792             : 
   13793             :    Return 0 if the argument has been formatted into arg->str.
   13794             :    Return 1 if the argument has been written into ctx->writer,
   13795             :    Raise an exception and return -1 on error. */
   13796             : static int
   13797     1437020 : unicode_format_arg_parse(struct unicode_formatter_t *ctx,
   13798             :                          struct unicode_format_arg_t *arg)
   13799             : {
   13800             : #define FORMAT_READ(ctx) \
   13801             :         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
   13802             : 
   13803             :     PyObject *v;
   13804             : 
   13805     1437020 :     if (arg->ch == '(') {
   13806             :         /* Get argument value from a dictionary. Example: "%(name)s". */
   13807             :         Py_ssize_t keystart;
   13808             :         Py_ssize_t keylen;
   13809             :         PyObject *key;
   13810       33227 :         int pcount = 1;
   13811             : 
   13812       33227 :         if (ctx->dict == NULL) {
   13813           5 :             PyErr_SetString(PyExc_TypeError,
   13814             :                             "format requires a mapping");
   13815           5 :             return -1;
   13816             :         }
   13817       33222 :         ++ctx->fmtpos;
   13818       33222 :         --ctx->fmtcnt;
   13819       33222 :         keystart = ctx->fmtpos;
   13820             :         /* Skip over balanced parentheses */
   13821      218998 :         while (pcount > 0 && --ctx->fmtcnt >= 0) {
   13822      185776 :             arg->ch = FORMAT_READ(ctx);
   13823      185776 :             if (arg->ch == ')')
   13824       33221 :                 --pcount;
   13825      152555 :             else if (arg->ch == '(')
   13826           2 :                 ++pcount;
   13827      185776 :             ctx->fmtpos++;
   13828             :         }
   13829       33222 :         keylen = ctx->fmtpos - keystart - 1;
   13830       33222 :         if (ctx->fmtcnt < 0 || pcount > 0) {
   13831           3 :             PyErr_SetString(PyExc_ValueError,
   13832             :                             "incomplete format key");
   13833           3 :             return -1;
   13834             :         }
   13835       33219 :         key = PyUnicode_Substring(ctx->fmtstr,
   13836             :                                   keystart, keystart + keylen);
   13837       33219 :         if (key == NULL)
   13838           0 :             return -1;
   13839       33219 :         if (ctx->args_owned) {
   13840       22200 :             ctx->args_owned = 0;
   13841       22200 :             Py_DECREF(ctx->args);
   13842             :         }
   13843       33219 :         ctx->args = PyObject_GetItem(ctx->dict, key);
   13844       33219 :         Py_DECREF(key);
   13845       33219 :         if (ctx->args == NULL)
   13846           3 :             return -1;
   13847       33216 :         ctx->args_owned = 1;
   13848       33216 :         ctx->arglen = -1;
   13849       33216 :         ctx->argidx = -2;
   13850             :     }
   13851             : 
   13852             :     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
   13853     1810120 :     while (--ctx->fmtcnt >= 0) {
   13854     1810110 :         arg->ch = FORMAT_READ(ctx);
   13855     1810110 :         ctx->fmtpos++;
   13856     1810110 :         switch (arg->ch) {
   13857       21790 :         case '-': arg->flags |= F_LJUST; continue;
   13858       33816 :         case '+': arg->flags |= F_SIGN; continue;
   13859       20887 :         case ' ': arg->flags |= F_BLANK; continue;
   13860       21835 :         case '#': arg->flags |= F_ALT; continue;
   13861      274779 :         case '0': arg->flags |= F_ZERO; continue;
   13862             :         }
   13863     1437010 :         break;
   13864             :     }
   13865             : 
   13866             :     /* Parse width. Example: "%10s" => width=10 */
   13867     1437010 :     if (arg->ch == '*') {
   13868       25394 :         v = unicode_format_getnextarg(ctx);
   13869       25394 :         if (v == NULL)
   13870           0 :             return -1;
   13871       25394 :         if (!PyLong_Check(v)) {
   13872           4 :             PyErr_SetString(PyExc_TypeError,
   13873             :                             "* wants int");
   13874           4 :             return -1;
   13875             :         }
   13876       25390 :         arg->width = PyLong_AsSsize_t(v);
   13877       25390 :         if (arg->width == -1 && PyErr_Occurred())
   13878           6 :             return -1;
   13879       25384 :         if (arg->width < 0) {
   13880           1 :             arg->flags |= F_LJUST;
   13881           1 :             arg->width = -arg->width;
   13882             :         }
   13883       25384 :         if (--ctx->fmtcnt >= 0) {
   13884       25384 :             arg->ch = FORMAT_READ(ctx);
   13885       25384 :             ctx->fmtpos++;
   13886             :         }
   13887             :     }
   13888     1411620 :     else if (arg->ch >= '0' && arg->ch <= '9') {
   13889      292842 :         arg->width = arg->ch - '0';
   13890      311182 :         while (--ctx->fmtcnt >= 0) {
   13891      311180 :             arg->ch = FORMAT_READ(ctx);
   13892      311180 :             ctx->fmtpos++;
   13893      311180 :             if (arg->ch < '0' || arg->ch > '9')
   13894             :                 break;
   13895             :             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
   13896             :                mixing signed and unsigned comparison. Since arg->ch is between
   13897             :                '0' and '9', casting to int is safe. */
   13898       18343 :             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
   13899           3 :                 PyErr_SetString(PyExc_ValueError,
   13900             :                                 "width too big");
   13901           3 :                 return -1;
   13902             :             }
   13903       18340 :             arg->width = arg->width*10 + (arg->ch - '0');
   13904             :         }
   13905             :     }
   13906             : 
   13907             :     /* Parse precision. Example: "%.3f" => prec=3 */
   13908     1437000 :     if (arg->ch == '.') {
   13909      116914 :         arg->prec = 0;
   13910      116914 :         if (--ctx->fmtcnt >= 0) {
   13911      116914 :             arg->ch = FORMAT_READ(ctx);
   13912      116914 :             ctx->fmtpos++;
   13913             :         }
   13914      116914 :         if (arg->ch == '*') {
   13915          60 :             v = unicode_format_getnextarg(ctx);
   13916          60 :             if (v == NULL)
   13917           0 :                 return -1;
   13918          60 :             if (!PyLong_Check(v)) {
   13919           2 :                 PyErr_SetString(PyExc_TypeError,
   13920             :                                 "* wants int");
   13921           2 :                 return -1;
   13922             :             }
   13923          58 :             arg->prec = _PyLong_AsInt(v);
   13924          58 :             if (arg->prec == -1 && PyErr_Occurred())
   13925           7 :                 return -1;
   13926          51 :             if (arg->prec < 0)
   13927           0 :                 arg->prec = 0;
   13928          51 :             if (--ctx->fmtcnt >= 0) {
   13929          51 :                 arg->ch = FORMAT_READ(ctx);
   13930          51 :                 ctx->fmtpos++;
   13931             :             }
   13932             :         }
   13933      116854 :         else if (arg->ch >= '0' && arg->ch <= '9') {
   13934      107914 :             arg->prec = arg->ch - '0';
   13935      124830 :             while (--ctx->fmtcnt >= 0) {
   13936      124830 :                 arg->ch = FORMAT_READ(ctx);
   13937      124830 :                 ctx->fmtpos++;
   13938      124830 :                 if (arg->ch < '0' || arg->ch > '9')
   13939             :                     break;
   13940       16920 :                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
   13941           4 :                     PyErr_SetString(PyExc_ValueError,
   13942             :                                     "precision too big");
   13943           4 :                     return -1;
   13944             :                 }
   13945       16916 :                 arg->prec = arg->prec*10 + (arg->ch - '0');
   13946             :             }
   13947             :         }
   13948             :     }
   13949             : 
   13950             :     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
   13951     1436990 :     if (ctx->fmtcnt >= 0) {
   13952     1436980 :         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
   13953           2 :             if (--ctx->fmtcnt >= 0) {
   13954           2 :                 arg->ch = FORMAT_READ(ctx);
   13955           2 :                 ctx->fmtpos++;
   13956             :             }
   13957             :         }
   13958             :     }
   13959     1436990 :     if (ctx->fmtcnt < 0) {
   13960           8 :         PyErr_SetString(PyExc_ValueError,
   13961             :                         "incomplete format");
   13962           8 :         return -1;
   13963             :     }
   13964     1436980 :     return 0;
   13965             : 
   13966             : #undef FORMAT_READ
   13967             : }
   13968             : 
   13969             : /* Format one argument. Supported conversion specifiers:
   13970             : 
   13971             :    - "s", "r", "a": any type
   13972             :    - "i", "d", "u": int or float
   13973             :    - "o", "x", "X": int
   13974             :    - "e", "E", "f", "F", "g", "G": float
   13975             :    - "c": int or str (1 character)
   13976             : 
   13977             :    When possible, the output is written directly into the Unicode writer
   13978             :    (ctx->writer). A string is created when padding is required.
   13979             : 
   13980             :    Return 0 if the argument has been formatted into *p_str,
   13981             :           1 if the argument has been written into ctx->writer,
   13982             :          -1 on error.
                     : 
                     :    On a 0 return, *p_str is a non-NULL str that the caller must release
                     :    once it has been output (see unicode_format_arg). */
   13983             : static int
   13984     1436980 : unicode_format_arg_format(struct unicode_formatter_t *ctx,
   13985             :                           struct unicode_format_arg_t *arg,
   13986             :                           PyObject **p_str)
   13987             : {
   13988             :     PyObject *v;
   13989     1436980 :     _PyUnicodeWriter *writer = &ctx->writer;
   13990             : 
   13991     1436980 :     if (ctx->fmtcnt == 0)  /* nothing follows this specifier in the format */
   13992      638952 :         ctx->writer.overallocate = 0;
   13993             : 
   13994     1436980 :     v = unicode_format_getnextarg(ctx);
   13995     1436980 :     if (v == NULL)
   13996          20 :         return -1;
   13997             : 
   13998             : 
   13999     1436960 :     switch (arg->ch) {
   14000      608652 :     case 's':
   14001             :     case 'r':
   14002             :     case 'a':
   14003      608652 :         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
   14004             :             /* Fast path: an exact int with no width/precision is written
                     :                to the writer directly, in base 10 */
   14005       17964 :             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
   14006           0 :                 return -1;
   14007       17964 :             return 1;
   14008             :         }
   14009             : 
   14010      590688 :         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
   14011      548235 :             *p_str = v;  /* "%s" of an exact str: reuse the object itself */
   14012      548235 :             Py_INCREF(*p_str);
   14013             :         }
   14014             :         else {
   14015       42453 :             if (arg->ch == 's')
   14016       13618 :                 *p_str = PyObject_Str(v);
   14017       28835 :             else if (arg->ch == 'r')
   14018       27281 :                 *p_str = PyObject_Repr(v);
   14019             :             else
   14020        1554 :                 *p_str = PyObject_ASCII(v);
   14021             :         }
   14022      590688 :         break;
   14023             : 
   14024      781600 :     case 'i':
   14025             :     case 'd':
   14026             :     case 'u':
   14027             :     case 'o':
   14028             :     case 'x':
   14029             :     case 'X':
   14030             :     {
   14031      781600 :         int ret = mainformatlong(v, arg, p_str, writer);
   14032      781600 :         if (ret != 0)  /* non-zero result is passed straight to the caller */
   14033      402157 :             return ret;
   14034      379443 :         arg->sign = 1;  /* numeric text: output step handles sign and '0' fill */
   14035      379443 :         break;
   14036             :     }
   14037             : 
   14038       44446 :     case 'e':
   14039             :     case 'E':
   14040             :     case 'f':
   14041             :     case 'F':
   14042             :     case 'g':
   14043             :     case 'G':
   14044       44446 :         if (arg->width == -1 && arg->prec == -1
   14045        5906 :             && !(arg->flags & (F_SIGN | F_BLANK)))
   14046             :         {
   14047             :             /* Fast path: no width, precision or sign flag; formatfloat()
                     :                is given the writer and no output string */
   14048        4702 :             if (formatfloat(v, arg, NULL, writer) == -1)
   14049           2 :                 return -1;
   14050        4700 :             return 1;
   14051             :         }
   14052             : 
   14053       39744 :         arg->sign = 1;  /* numeric text: output step handles sign and '0' fill */
   14054       39744 :         if (formatfloat(v, arg, p_str, NULL) == -1)
   14055           0 :             return -1;
   14056       39744 :         break;
   14057             : 
   14058        2255 :     case 'c':
   14059             :     {
   14060        2255 :         Py_UCS4 ch = formatchar(v);
   14061        2255 :         if (ch == (Py_UCS4) -1)  /* formatchar() signals failure with -1 */
   14062          14 :             return -1;
   14063        2241 :         if (arg->width == -1 && arg->prec == -1) {
   14064             :             /* Fast path */
   14065        2241 :             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
   14066           0 :                 return -1;
   14067        2241 :             return 1;
   14068             :         }
   14069           0 :         *p_str = PyUnicode_FromOrdinal(ch);
   14070           0 :         break;
   14071             :     }
   14072             : 
   14073           5 :     default:  /* Echo the offending character only when it is printable
                     :                  ASCII (space through '~'); anything else is shown as
                     :                  '?'.  The lower bound is 32: a bound of 31 would let
                     :                  the U+001F control character into the message. */
   14074           5 :         PyErr_Format(PyExc_ValueError,
   14075             :                      "unsupported format character '%c' (0x%x) "
   14076             :                      "at index %zd",
   14077           5 :                      (32<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
   14078           5 :                      (int)arg->ch,
   14079           5 :                      ctx->fmtpos - 1);
   14080           5 :         return -1;
   14081             :     }
   14082     1009880 :     if (*p_str == NULL)  /* a conversion above failed to produce a string */
   14083         484 :         return -1;
   14084     1009390 :     assert (PyUnicode_Check(*p_str));
   14085     1009390 :     return 0;
   14086             : }
   14087             : 
   14088             : static int
   14089     1009390 : unicode_format_arg_output(struct unicode_formatter_t *ctx,
   14090             :                           struct unicode_format_arg_t *arg,
   14091             :                           PyObject *str)
   14092             : {   /* Append str, the text produced for one conversion, to ctx->writer
                     :        while applying arg->width, arg->prec (for "s", "r", "a") and
                     :        the F_SIGN, F_BLANK, F_ZERO, F_LJUST and F_ALT flags.
                     :        arg->width and arg->sign are used as scratch and modified.
                     :        Return 0 on success, -1 if writing to or growing the writer
                     :        fails. */
   14093             :     Py_ssize_t len;       /* characters of str still to be copied */
   14094             :     int kind;
   14095             :     const void *pbuf;
   14096             :     Py_ssize_t pindex;    /* read position inside str */
   14097             :     Py_UCS4 signchar;     /* sign to emit; only written while arg->sign is set */
   14098             :     Py_ssize_t buflen;    /* space requested from the writer */
   14099             :     Py_UCS4 maxchar;
   14100             :     Py_ssize_t sublen;    /* length of a padding run */
   14101     1009390 :     _PyUnicodeWriter *writer = &ctx->writer;
   14102             :     Py_UCS4 fill;         /* left padding character: ' ' or '0' */
   14103             : 
   14104     1009390 :     fill = ' ';
   14105     1009390 :     if (arg->sign && arg->flags & F_ZERO)  /* '0' fill needs arg->sign (set for numeric conversions) */
   14106      272491 :         fill = '0';
   14107             : 
   14108     1009390 :     len = PyUnicode_GET_LENGTH(str);
   14109     1009390 :     if ((arg->width == -1 || arg->width <= len)
   14110      857516 :         && (arg->prec == -1 || arg->prec >= len)
   14111      822055 :         && !(arg->flags & (F_SIGN | F_BLANK)))
   14112             :     {
   14113             :         /* Fast path: no padding needed, precision does not shorten str
                     :            and no F_SIGN/F_BLANK flag, so str is appended unchanged */
   14114      802933 :         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
   14115           0 :             return -1;
   14116      802933 :         return 0;
   14117             :     }
   14118             : 
   14119             :     /* Truncate the string for "s", "r" and "a" formats
   14120             :        if the precision is set */
   14121      206458 :     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
   14122        5758 :         if (arg->prec >= 0 && len > arg->prec)
   14123        2115 :             len = arg->prec;
   14124             :     }
   14125             : 
   14126             :     /* Adjust sign and width */
   14127      206458 :     kind = PyUnicode_KIND(str);
   14128      206458 :     pbuf = PyUnicode_DATA(str);
   14129      206458 :     pindex = 0;
   14130      206458 :     signchar = '\0';
   14131      206458 :     if (arg->sign) {
   14132      200700 :         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
   14133      200700 :         if (ch == '-' || ch == '+') {
   14134       26135 :             signchar = ch;  /* str starts with a sign: detach it from the digits */
   14135       26135 :             len--;
   14136       26135 :             pindex++;
   14137             :         }
   14138      174565 :         else if (arg->flags & F_SIGN)
   14139       15834 :             signchar = '+';
   14140      158731 :         else if (arg->flags & F_BLANK)
   14141        4746 :             signchar = ' ';
   14142             :         else
   14143      153985 :             arg->sign = 0;  /* no sign character to emit */
   14144             :     }
   14145      206458 :     if (arg->width < len)  /* ensure arg->width >= len */
   14146       54066 :         arg->width = len;
   14147             : 
   14148             :     /* Prepare the writer: work out the widest character that will be
                     :        written (the fill character, if left padding occurs, and the
                     :        characters of str) and reserve buflen characters */
   14149      206458 :     maxchar = writer->maxchar;
   14150      206458 :     if (!(arg->flags & F_LJUST)) {
   14151      185887 :         if (arg->sign) {
   14152       30621 :             if ((arg->width-1) > len)  /* one column of the width goes to the sign */
   14153        5301 :                 maxchar = Py_MAX(maxchar, fill);
   14154             :         }
   14155             :         else {
   14156      155266 :             if (arg->width > len)
   14157      139529 :                 maxchar = Py_MAX(maxchar, fill);
   14158             :         }
   14159             :     }
   14160      206458 :     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
   14161      184566 :         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
   14162      184566 :         maxchar = Py_MAX(maxchar, strmaxchar);
   14163             :     }
   14164             : 
   14165      206458 :     buflen = arg->width;
   14166      206458 :     if (arg->sign && len == arg->width)
   14167       36360 :         buflen++;  /* width leaves no room for the sign: reserve one more */
   14168      206458 :     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
   14169           0 :         return -1;
   14170             : 
   14171             :     /* Write the sign if needed */
   14172      206458 :     if (arg->sign) {
   14173       46715 :         if (fill != ' ') {  /* '0' fill: the sign goes before the padding */
   14174       17096 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14175       17096 :             writer->pos += 1;
   14176             :         }
   14177       46715 :         if (arg->width > len)
   14178       10355 :             arg->width--;  /* the sign uses one column of the field */
   14179             :     }
   14180             : 
   14181             :     /* Write the numeric prefix for "x", "X" and "o" formats
   14182             :        if the alternate form is used.
   14183             :        For example, write "0x" for the "%#x" format. */
   14184      206458 :     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14185        2628 :         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14186        2628 :         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
   14187        2628 :         if (fill != ' ') {  /* '0' fill: the prefix goes before the padding */
   14188        1305 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14189        1305 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14190        1305 :             writer->pos += 2;
   14191        1305 :             pindex += 2;
   14192             :         }
   14193        2628 :         arg->width -= 2;  /* the prefix takes two columns of the field */
   14194        2628 :         if (arg->width < 0)
   14195           0 :             arg->width = 0;
   14196        2628 :         len -= 2;  /* the prefix is written separately, not by the copy below */
   14197             :     }
   14198             : 
   14199             :     /* Pad left with the fill character if needed */
   14200      206458 :     if (arg->width > len && !(arg->flags & F_LJUST)) {
   14201      144830 :         sublen = arg->width - len;
   14202      144830 :         unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
   14203      144830 :         writer->pos += sublen;
   14204      144830 :         arg->width = len;  /* padding done: nothing is left for the right side */
   14205             :     }
   14206             : 
   14207             :     /* If padding with spaces: write sign if needed and/or numeric prefix if
   14208             :        the alternate form is used */
   14209      206458 :     if (fill == ' ') {
   14210       71437 :         if (arg->sign) {
   14211       29619 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14212       29619 :             writer->pos += 1;
   14213             :         }
   14214       71437 :         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14215        1323 :             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14216        1323 :             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
   14217        1323 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14218        1323 :             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14219        1323 :             writer->pos += 2;
   14220        1323 :             pindex += 2;
   14221             :         }
   14222             :     }
   14223             : 
   14224             :     /* Write characters */
   14225      206458 :     if (len) {
   14226      203365 :         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   14227             :                                       str, pindex, len);
   14228      203365 :         writer->pos += len;
   14229             :     }
   14230             : 
   14231             :     /* Pad right if needed; this padding is always made of spaces,
                     :        whatever the fill character is */
   14232      206458 :     if (arg->width > len) {
   14233        7557 :         sublen = arg->width - len;
   14234        7557 :         unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
   14235        7557 :         writer->pos += sublen;
   14236             :     }
   14237      206458 :     return 0;
   14238             : }
   14239             : 
   14240             : /* Helper of PyUnicode_Format(): format one arg.
   14241             :    Return 0 on success, raise an exception and return -1 on error.
                     : 
                     :    On entry ctx->fmtpos indexes the character right after a '%'.
                     :    "%%" is written as a literal '%'.  Otherwise the work is done in
                     :    three steps: unicode_format_arg_parse() reads the specifier,
                     :    unicode_format_arg_format() converts the argument, and
                     :    unicode_format_arg_output() pads and writes the result when the
                     :    conversion did not write to ctx->writer itself. */
   14242             : static int
   14243     1441440 : unicode_format_arg(struct unicode_formatter_t *ctx)
   14244             : {
   14245             :     struct unicode_format_arg_t arg;
   14246             :     PyObject *str;
   14247             :     int ret;
   14248             : 
   14249     1441440 :     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
   14250     1441440 :     if (arg.ch == '%') {
   14251        4416 :         ctx->fmtpos++;
   14252        4416 :         ctx->fmtcnt--;
   14253        4416 :         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
   14254           0 :             return -1;
   14255        4416 :         return 0;
   14256             :     }
   14257     1437020 :     arg.flags = 0;
   14258     1437020 :     arg.width = -1;  /* -1: no width given */
   14259     1437020 :     arg.prec = -1;   /* -1: no precision given */
   14260     1437020 :     arg.sign = 0;
   14261     1437020 :     str = NULL;
   14262             : 
   14263     1437020 :     ret = unicode_format_arg_parse(ctx, &arg);
   14264     1437020 :     if (ret == -1)
   14265          45 :         return -1;
   14266             : 
   14267     1436980 :     ret = unicode_format_arg_format(ctx, &arg, &str);
   14268     1436980 :     if (ret == -1)
   14269         645 :         return -1;
   14270             : 
   14271     1436330 :     if (ret != 1) {  /* 0: the result is in str and still has to be written */
   14272     1009390 :         ret = unicode_format_arg_output(ctx, &arg, str);
   14273     1009390 :         Py_DECREF(str);
   14274     1009390 :         if (ret == -1)
   14275           0 :             return -1;
   14276             :     }
   14277             : 
   14278     1436330 :     if (ctx->dict && (ctx->argidx < ctx->arglen)) {  /* NOTE(review): PyUnicode_Format starts a mapping operand
                     :                                                          with arglen=-1 and argidx=-2, so this depends on how
                     :                                                          unicode_format_getnextarg updates them; confirm */
   14279           0 :         PyErr_SetString(PyExc_TypeError,
   14280             :                         "not all arguments converted during string formatting");
   14281           0 :         return -1;
   14282             :     }
   14283     1436330 :     return 0;
   14284             : }
   14285             : 
   14286             : PyObject *
   14287     1017680 : PyUnicode_Format(PyObject *format, PyObject *args)
   14288             : {   /* printf-style formatting: scan format, copying literal text to a
                     :        _PyUnicodeWriter and handing each '%' specifier to
                     :        unicode_format_arg().  args is a tuple of arguments, a
                     :        mapping, or a single argument.
                     :        Return the resulting string, or NULL on error. */
   14289             :     struct unicode_formatter_t ctx;
   14290             : 
   14291     1017680 :     if (format == NULL || args == NULL) {
   14292           0 :         PyErr_BadInternalCall();
   14293           0 :         return NULL;
   14294             :     }
   14295             : 
   14296     1017680 :     if (ensure_unicode(format) < 0)
   14297           0 :         return NULL;
   14298             : 
   14299     1017680 :     ctx.fmtstr = format;
   14300     1017680 :     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
   14301     1017680 :     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
   14302     1017680 :     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);  /* format characters left to read */
   14303     1017680 :     ctx.fmtpos = 0;                                 /* index of the next one */
   14304             : 
   14305     1017680 :     _PyUnicodeWriter_Init(&ctx.writer);
   14306     1017680 :     ctx.writer.min_length = ctx.fmtcnt + 100;  /* size hint: format length plus 100 */
   14307     1017680 :     ctx.writer.overallocate = 1;
   14308             : 
   14309     1017680 :     if (PyTuple_Check(args)) {
   14310      221143 :         ctx.arglen = PyTuple_Size(args);
   14311      221143 :         ctx.argidx = 0;
   14312             :     }
   14313             :     else {
   14314      796536 :         ctx.arglen = -1;  /* markers for a single, non-tuple argument; they are */
   14315      796536 :         ctx.argidx = -2;  /* read by code outside this function */
   14316             :     }
   14317     1017680 :     ctx.args_owned = 0;
   14318     1017680 :     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
   14319       13369 :         ctx.dict = args;  /* a mapping that is neither a tuple nor a str */
   14320             :     else
   14321     1004310 :         ctx.dict = NULL;
   14322     1017680 :     ctx.args = args;
   14323             : 
   14324     3613190 :     while (--ctx.fmtcnt >= 0) {
   14325     2596200 :         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14326             :             Py_ssize_t nonfmtpos;  /* start of the current run of literal text */
   14327             : 
   14328     1154770 :             nonfmtpos = ctx.fmtpos++;
   14329    15565700 :             while (ctx.fmtcnt >= 0 &&
   14330     7595720 :                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14331     6815230 :                 ctx.fmtpos++;
   14332     6815230 :                 ctx.fmtcnt--;
   14333             :             }
   14334     1154770 :             if (ctx.fmtcnt < 0) {  /* the scan ran off the end of the format */
   14335      374270 :                 ctx.fmtpos--;      /* step back onto the end index */
   14336      374270 :                 ctx.writer.overallocate = 0;  /* this is the last write */
   14337             :             }
   14338             : 
   14339     1154770 :             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
   14340             :                                                 nonfmtpos, ctx.fmtpos) < 0)
   14341           0 :                 goto onError;
   14342             :         }
   14343             :         else {
   14344     1441440 :             ctx.fmtpos++;  /* skip the '%'; unicode_format_arg() starts after it */
   14345     1441440 :             if (unicode_format_arg(&ctx) == -1)
   14346         690 :                 goto onError;
   14347             :         }
   14348             :     }
   14349             : 
   14350     1016990 :     if (ctx.argidx < ctx.arglen && !ctx.dict) {  /* leftover arguments are an error unless args is a mapping */
   14351          19 :         PyErr_SetString(PyExc_TypeError,
   14352             :                         "not all arguments converted during string formatting");
   14353          19 :         goto onError;
   14354             :     }
   14355             : 
   14356     1016970 :     if (ctx.args_owned) {
   14357       11013 :         Py_DECREF(ctx.args);
   14358             :     }
   14359     1016970 :     return _PyUnicodeWriter_Finish(&ctx.writer);
   14360             : 
   14361         709 :   onError:  /* discard the partial output and release ctx.args if owned */
   14362         709 :     _PyUnicodeWriter_Dealloc(&ctx.writer);
   14363         709 :     if (ctx.args_owned) {
   14364           3 :         Py_DECREF(ctx.args);
   14365             :     }
   14366         709 :     return NULL;
   14367             : }
   14368             : 
   14369             : static PyObject *
   14370             : unicode_subtype_new(PyTypeObject *type, PyObject *unicode);  /* forward declaration, used by unicode_new_impl() */
   14371             : 
   14372             : /*[clinic input]
   14373             : @classmethod
   14374             : str.__new__ as unicode_new
   14375             : 
   14376             :     object as x: object = NULL
   14377             :     encoding: str = NULL
   14378             :     errors: str = NULL
   14379             : 
   14380             : [clinic start generated code]*/
   14381             : 
   14382             : static PyObject *
   14383     1204760 : unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
   14384             :                  const char *errors)
   14385             : /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
   14386             : {   /* Build the object for str.__new__:
                     :          - no x: the empty string;
                     :          - x without encoding and errors: PyObject_Str(x);
                     :          - otherwise: PyUnicode_FromEncodedObject(x, encoding, errors).
                     :        When type is not exactly PyUnicode_Type, the result is then
                     :        converted with unicode_subtype_new().
                     :        Return the new object, or NULL on failure. */
   14387             :     PyObject *unicode;
   14388     1204760 :     if (x == NULL) {
   14389       15122 :         unicode = unicode_new_empty();
   14390             :     }
   14391     1189640 :     else if (encoding == NULL && errors == NULL) {
   14392      730328 :         unicode = PyObject_Str(x);
   14393             :     }
   14394             :     else {
   14395      459315 :         unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
   14396             :     }
   14397             : 
   14398     1204760 :     if (unicode != NULL && type != &PyUnicode_Type) {
   14399      128750 :         Py_SETREF(unicode, unicode_subtype_new(type, unicode));  /* swap in the subtype instance (NULL on failure), drop the temporary */
   14400             :     }
   14401     1204760 :     return unicode;
   14402             : }
   14403             : 
   14404             : static PyObject *
   14405      128750 : unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
   14406             : {   /* Create an instance of type (asserted to be a subtype of str)
                     :        holding a copy of the characters of unicode.  The instance is
                     :        marked non-compact and its characters live in a buffer
                     :        obtained from PyObject_Malloc().  unicode itself is not
                     :        released here.
                     :        Return the new object, or NULL on failure. */
   14407             :     PyObject *self;
   14408             :     Py_ssize_t length, char_size;
   14409             :     int share_utf8;  /* non-zero: the utf8 pointer is set to the data buffer itself */
   14410             :     int kind;
   14411             :     void *data;
   14412             : 
   14413      128750 :     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   14414      128750 :     assert(_PyUnicode_CHECK(unicode));
   14415             : 
   14416      128750 :     self = type->tp_alloc(type, 0);
   14417      128750 :     if (self == NULL) {
   14418           0 :         return NULL;
   14419             :     }
   14420      128750 :     kind = PyUnicode_KIND(unicode);
   14421      128750 :     length = PyUnicode_GET_LENGTH(unicode);
   14422             : 
   14423      128750 :     _PyUnicode_LENGTH(self) = length;
   14424             : #ifdef Py_DEBUG
   14425      128750 :     _PyUnicode_HASH(self) = -1;  /* debug builds: the real hash is copied at the end, after the consistency check */
   14426             : #else
   14427             :     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   14428             : #endif
   14429      128750 :     _PyUnicode_STATE(self).interned = 0;
   14430      128750 :     _PyUnicode_STATE(self).kind = kind;
   14431      128750 :     _PyUnicode_STATE(self).compact = 0;
   14432      128750 :     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
   14433      128750 :     _PyUnicode_UTF8_LENGTH(self) = 0;
   14434      128750 :     _PyUnicode_UTF8(self) = NULL;
   14435      128750 :     _PyUnicode_DATA_ANY(self) = NULL;  /* no buffer attached yet when onError is reached */
   14436             : 
   14437      128750 :     share_utf8 = 0;
   14438      128750 :     if (kind == PyUnicode_1BYTE_KIND) {
   14439      128497 :         char_size = 1;
   14440      128497 :         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
   14441      127703 :             share_utf8 = 1;
   14442             :     }
   14443         253 :     else if (kind == PyUnicode_2BYTE_KIND) {
   14444         232 :         char_size = 2;
   14445             :     }
   14446             :     else {
   14447          21 :         assert(kind == PyUnicode_4BYTE_KIND);
   14448          21 :         char_size = 4;
   14449             :     }
   14450             : 
   14451             :     /* Ensure we won't overflow the length. */
   14452      128750 :     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
   14453           0 :         PyErr_NoMemory();
   14454           0 :         goto onError;
   14455             :     }
   14456      128750 :     data = PyObject_Malloc((length + 1) * char_size);  /* length characters plus one extra element */
   14457      128750 :     if (data == NULL) {
   14458           0 :         PyErr_NoMemory();
   14459           0 :         goto onError;
   14460             :     }
   14461             : 
   14462      128750 :     _PyUnicode_DATA_ANY(self) = data;
   14463      128750 :     if (share_utf8) {
   14464      127703 :         _PyUnicode_UTF8_LENGTH(self) = length;
   14465      127703 :         _PyUnicode_UTF8(self) = data;
   14466             :     }
   14467             : 
   14468      128750 :     memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));  /* NOTE(review): sized with kind while the allocation uses
                     :                                                                      char_size; assumes each kind constant equals
                     :                                                                      its character size in bytes; confirm */
   14469      128750 :     assert(_PyUnicode_CheckConsistency(self, 1));
   14470             : #ifdef Py_DEBUG
   14471      128750 :     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   14472             : #endif
   14473      128750 :     return self;
   14474             : 
   14475           0 : onError:
   14476           0 :     Py_DECREF(self);
   14477           0 :     return NULL;
   14478             : }
   14479             : 
   14480             : void
   14481     3966520 : _PyUnicode_ExactDealloc(PyObject *op)
   14482             : {   /* Deallocate op with unicode_dealloc().  op must be an exact str;
                     :        this is only asserted, not checked at run time. */
   14483     3966520 :     assert(PyUnicode_CheckExact(op));
   14484     3966520 :     unicode_dealloc(op);
   14485     3966520 : }
   14486             : 
   14487             : PyDoc_STRVAR(unicode_doc,  /* docstring of the str type; used as tp_doc of PyUnicode_Type */
   14488             : "str(object='') -> str\n\
   14489             : str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
   14490             : \n\
   14491             : Create a new string object from the given object. If encoding or\n\
   14492             : errors is specified, then the object must expose a data buffer\n\
   14493             : that will be decoded using the given encoding and error handler.\n\
   14494             : Otherwise, returns the result of object.__str__() (if defined)\n\
   14495             : or repr(object).\n\
   14496             : encoding defaults to sys.getdefaultencoding().\n\
   14497             : errors defaults to 'strict'.");
   14498             : 
   14499             : static PyObject *unicode_iter(PyObject *seq);  /* forward declaration, needed for the tp_iter slot below */
   14500             : 
   14501             : PyTypeObject PyUnicode_Type = {  /* Type object of str.  Slots given as 0 are not set
                     :                                       by this initializer. */
   14502             :     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   14503             :     "str",                        /* tp_name */
   14504             :     sizeof(PyUnicodeObject),      /* tp_basicsize */
   14505             :     0,                            /* tp_itemsize */
   14506             :     /* Slots */
   14507             :     (destructor)unicode_dealloc,  /* tp_dealloc */
   14508             :     0,                            /* tp_vectorcall_offset */
   14509             :     0,                            /* tp_getattr */
   14510             :     0,                            /* tp_setattr */
   14511             :     0,                            /* tp_as_async */
   14512             :     unicode_repr,                 /* tp_repr */
   14513             :     &unicode_as_number,           /* tp_as_number */
   14514             :     &unicode_as_sequence,         /* tp_as_sequence */
   14515             :     &unicode_as_mapping,          /* tp_as_mapping */
   14516             :     (hashfunc) unicode_hash,      /* tp_hash*/
   14517             :     0,                            /* tp_call*/
   14518             :     (reprfunc) unicode_str,       /* tp_str */
   14519             :     PyObject_GenericGetAttr,      /* tp_getattro */
   14520             :     0,                            /* tp_setattro */
   14521             :     0,                            /* tp_as_buffer */
   14522             :     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
   14523             :         Py_TPFLAGS_UNICODE_SUBCLASS |
   14524             :         _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
   14525             :     unicode_doc,                  /* tp_doc */
   14526             :     0,                            /* tp_traverse */
   14527             :     0,                            /* tp_clear */
   14528             :     PyUnicode_RichCompare,        /* tp_richcompare */
   14529             :     0,                            /* tp_weaklistoffset */
   14530             :     unicode_iter,                 /* tp_iter */
   14531             :     0,                            /* tp_iternext */
   14532             :     unicode_methods,              /* tp_methods */
   14533             :     0,                            /* tp_members */
   14534             :     0,                            /* tp_getset */
   14535             :     0,                            /* tp_base */
   14536             :     0,                            /* tp_dict */
   14537             :     0,                            /* tp_descr_get */
   14538             :     0,                            /* tp_descr_set */
   14539             :     0,                            /* tp_dictoffset */
   14540             :     0,                            /* tp_init */
   14541             :     0,                            /* tp_alloc */
   14542             :     unicode_new,                  /* tp_new */
   14543             :     PyObject_Del,                 /* tp_free */
   14544             : };
   14545             : 
   14546             : /* Initialize the Unicode implementation
                     : 
                     :    Only the main interpreter does any work here: it builds
                     :    bloom_linebreak, a bloom mask over the code points listed in
                     :    linebreak[] below. */
   14547             : 
   14548             : void
   14549        3134 : _PyUnicode_InitState(PyInterpreterState *interp)
   14550             : {
   14551        3134 :     if (!_Py_IsMainInterpreter(interp)) {
   14552         171 :         return;
   14553             :     }
   14554             : 
   14555             :     /* initialize the linebreak bloom filter */
   14556        2963 :     const Py_UCS2 linebreak[] = {
   14557             :         0x000A, /* LINE FEED */
   14558             :         0x000D, /* CARRIAGE RETURN */
   14559             :         0x001C, /* FILE SEPARATOR */
   14560             :         0x001D, /* GROUP SEPARATOR */
   14561             :         0x001E, /* RECORD SEPARATOR */
   14562             :         0x0085, /* NEXT LINE */
   14563             :         0x2028, /* LINE SEPARATOR */
   14564             :         0x2029, /* PARAGRAPH SEPARATOR */
   14565             :     };
   14566        2963 :     bloom_linebreak = make_bloom_mask(
   14567             :         PyUnicode_2BYTE_KIND, linebreak,
   14568             :         Py_ARRAY_LENGTH(linebreak));
   14569             : }
   14570             : 
   14571             : 
   14572             : PyStatus
   14573        3134 : _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
   14574             : {   /* Creates nothing in the code visible here.  For the main
                     :        interpreter, debug builds (Py_DEBUG) assert that
                     :        _Py_STR(empty) and LATIN1(0) .. LATIN1(255) pass
                     :        _PyUnicode_CheckConsistency().
                     :        Always returns _PyStatus_OK(). */
   14575        3134 :     if (!_Py_IsMainInterpreter(interp)) {
   14576         171 :         return _PyStatus_OK();
   14577             :     }
   14578             : 
   14579             : #ifdef Py_DEBUG
   14580        2963 :     assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
   14581             : 
   14582      761491 :     for (int i = 0; i < 256; i++) {
   14583      758528 :         assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
   14584             :     }
   14585             : #endif
   14586             : 
   14587        2963 :     return _PyStatus_OK();
   14588             : }
   14589             : 
   14590             : 
   14591             : PyStatus
   14592        3134 : _PyUnicode_InitTypes(PyInterpreterState *interp)
   14593             : {
   14594        3134 :     if (!_Py_IsMainInterpreter(interp)) {
   14595         171 :         return _PyStatus_OK();
   14596             :     }
   14597             : 
   14598        2963 :     if (PyType_Ready(&EncodingMapType) < 0) {
   14599           0 :         goto error;
   14600             :     }
   14601        2963 :     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
   14602           0 :         goto error;
   14603             :     }
   14604        2963 :     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
   14605           0 :         goto error;
   14606             :     }
   14607        2963 :     return _PyStatus_OK();
   14608             : 
   14609           0 : error:
   14610           0 :     return _PyStatus_ERR("Can't initialize unicode types");
   14611             : }
   14612             : 
   14613             : 
   14614             : void
   14615   290141000 : PyUnicode_InternInPlace(PyObject **p)
   14616             : {
   14617   290141000 :     PyObject *s = *p;
   14618             : #ifdef Py_DEBUG
   14619   290141000 :     assert(s != NULL);
   14620   290141000 :     assert(_PyUnicode_CHECK(s));
   14621             : #else
   14622             :     if (s == NULL || !PyUnicode_Check(s)) {
   14623             :         return;
   14624             :     }
   14625             : #endif
   14626             : 
   14627             :     /* If it's a subclass, we don't really know what putting
   14628             :        it in the interned dict might do. */
   14629   290141000 :     if (!PyUnicode_CheckExact(s)) {
   14630           4 :         return;
   14631             :     }
   14632             : 
   14633   290141000 :     if (PyUnicode_CHECK_INTERNED(s)) {
   14634   190087000 :         return;
   14635             :     }
   14636             : 
   14637   100054000 :     if (interned == NULL) {
   14638        2963 :         interned = PyDict_New();
   14639        2963 :         if (interned == NULL) {
   14640           0 :             PyErr_Clear(); /* Don't leave an exception */
   14641           0 :             return;
   14642             :         }
   14643             :     }
   14644             : 
   14645   100054000 :     PyObject *t = PyDict_SetDefault(interned, s, s);
   14646   100054000 :     if (t == NULL) {
   14647           0 :         PyErr_Clear();
   14648           0 :         return;
   14649             :     }
   14650             : 
   14651   100054000 :     if (t != s) {
   14652    75099600 :         Py_INCREF(t);
   14653    75099600 :         Py_SETREF(*p, t);
   14654    75099600 :         return;
   14655             :     }
   14656             : 
   14657             :     /* The two references in interned dict (key and value) are not counted by
   14658             :        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
   14659             :        this. */
   14660    24954000 :     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
   14661    24954000 :     _PyUnicode_STATE(s).interned = 1;
   14662             : }
   14663             : 
   14664             : // Function kept for the stable ABI.
   14665             : PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
   14666             : void
   14667           0 : PyUnicode_InternImmortal(PyObject **p)
   14668             : {
   14669           0 :     PyUnicode_InternInPlace(p);
   14670             :     // Leak a reference on purpose
   14671           0 :     Py_INCREF(*p);
   14672           0 : }
   14673             : 
   14674             : PyObject *
   14675    10919000 : PyUnicode_InternFromString(const char *cp)
   14676             : {
   14677    10919000 :     PyObject *s = PyUnicode_FromString(cp);
   14678    10919000 :     if (s == NULL)
   14679           0 :         return NULL;
   14680    10919000 :     PyUnicode_InternInPlace(&s);
   14681    10919000 :     return s;
   14682             : }
   14683             : 
   14684             : 
   14685             : void
   14686        3120 : _PyUnicode_ClearInterned(PyInterpreterState *interp)
   14687             : {
   14688        3120 :     if (!_Py_IsMainInterpreter(interp)) {
   14689             :         // interned dict is shared by all interpreters
   14690         169 :         return;
   14691             :     }
   14692             : 
   14693        2951 :     if (interned == NULL) {
   14694           0 :         return;
   14695             :     }
   14696        2951 :     assert(PyDict_CheckExact(interned));
   14697             : 
   14698             :     /* Interned unicode strings are not forcibly deallocated; rather, we give
   14699             :        them their stolen references back, and then clear and DECREF the
   14700             :        interned dict. */
   14701             : 
   14702             : #ifdef INTERNED_STATS
   14703             :     fprintf(stderr, "releasing %zd interned strings\n",
   14704             :             PyDict_GET_SIZE(interned));
   14705             : 
   14706             :     Py_ssize_t total_length = 0;
   14707             : #endif
   14708        2951 :     Py_ssize_t pos = 0;
   14709             :     PyObject *s, *ignored_value;
   14710     5218870 :     while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
   14711     5215920 :         assert(PyUnicode_CHECK_INTERNED(s));
   14712             :         // Restore the two references (key and value) ignored
   14713             :         // by PyUnicode_InternInPlace().
   14714     5215920 :         Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
   14715             : #ifdef INTERNED_STATS
   14716             :         total_length += PyUnicode_GET_LENGTH(s);
   14717             : #endif
   14718             : 
   14719     5215920 :         _PyUnicode_STATE(s).interned = 0;
   14720             :     }
   14721             : #ifdef INTERNED_STATS
   14722             :     fprintf(stderr,
   14723             :             "total length of all interned strings: %zd characters\n",
   14724             :             total_length);
   14725             : #endif
   14726             : 
   14727        2951 :     PyDict_Clear(interned);
   14728        2951 :     Py_CLEAR(interned);
   14729             : }
   14730             : 
   14731             : 
   14732             : /********************* Unicode Iterator **************************/
   14733             : 
   14734             : typedef struct {
   14735             :     PyObject_HEAD
   14736             :     Py_ssize_t it_index;
   14737             :     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
   14738             : } unicodeiterobject;
   14739             : 
   14740             : static void
   14741     1023510 : unicodeiter_dealloc(unicodeiterobject *it)
   14742             : {
   14743     1023510 :     _PyObject_GC_UNTRACK(it);
   14744     1023510 :     Py_XDECREF(it->it_seq);
   14745     1023510 :     PyObject_GC_Del(it);
   14746     1023510 : }
   14747             : 
   14748             : static int
   14749         354 : unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
   14750             : {
   14751         354 :     Py_VISIT(it->it_seq);
   14752         354 :     return 0;
   14753             : }
   14754             : 
   14755             : static PyObject *
   14756      369665 : unicodeiter_next(unicodeiterobject *it)
   14757             : {
   14758             :     PyObject *seq;
   14759             : 
   14760      369665 :     assert(it != NULL);
   14761      369665 :     seq = it->it_seq;
   14762      369665 :     if (seq == NULL)
   14763          43 :         return NULL;
   14764      369622 :     assert(_PyUnicode_CHECK(seq));
   14765             : 
   14766      369622 :     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
   14767      364692 :         int kind = PyUnicode_KIND(seq);
   14768      364692 :         const void *data = PyUnicode_DATA(seq);
   14769      364692 :         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
   14770      364692 :         it->it_index++;
   14771      364692 :         return unicode_char(chr);
   14772             :     }
   14773             : 
   14774        4930 :     it->it_seq = NULL;
   14775        4930 :     Py_DECREF(seq);
   14776        4930 :     return NULL;
   14777             : }
   14778             : 
   14779             : static PyObject *
   14780     4980520 : unicode_ascii_iter_next(unicodeiterobject *it)
   14781             : {
   14782     4980520 :     assert(it != NULL);
   14783     4980520 :     PyObject *seq = it->it_seq;
   14784     4980520 :     if (seq == NULL) {
   14785         227 :         return NULL;
   14786             :     }
   14787     4980290 :     assert(_PyUnicode_CHECK(seq));
   14788     4980290 :     assert(PyUnicode_IS_COMPACT_ASCII(seq));
   14789     4980290 :     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
   14790     3968780 :         const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
   14791     3968780 :         Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
   14792             :                                               data, it->it_index);
   14793     3968780 :         it->it_index++;
   14794     3968780 :         PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
   14795     3968780 :         return Py_NewRef(item);
   14796             :     }
   14797     1011510 :     it->it_seq = NULL;
   14798     1011510 :     Py_DECREF(seq);
   14799     1011510 :     return NULL;
   14800             : }
   14801             : 
   14802             : static PyObject *
   14803       12060 : unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
   14804             : {
   14805       12060 :     Py_ssize_t len = 0;
   14806       12060 :     if (it->it_seq)
   14807       12059 :         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
   14808       12060 :     return PyLong_FromSsize_t(len);
   14809             : }
   14810             : 
   14811             : PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
   14812             : 
   14813             : static PyObject *
   14814         438 : unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
   14815             : {
   14816         438 :     if (it->it_seq != NULL) {
   14817         432 :         return Py_BuildValue("N(O)n", _PyEval_GetBuiltin(&_Py_ID(iter)),
   14818             :                              it->it_seq, it->it_index);
   14819             :     } else {
   14820           6 :         PyObject *u = unicode_new_empty();
   14821           6 :         if (u == NULL)
   14822           0 :             return NULL;
   14823           6 :         return Py_BuildValue("N(N)", _PyEval_GetBuiltin(&_Py_ID(iter)), u);
   14824             :     }
   14825             : }
   14826             : 
   14827             : PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
   14828             : 
   14829             : static PyObject *
   14830         564 : unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
   14831             : {
   14832         564 :     Py_ssize_t index = PyLong_AsSsize_t(state);
   14833         564 :     if (index == -1 && PyErr_Occurred())
   14834           0 :         return NULL;
   14835         564 :     if (it->it_seq != NULL) {
   14836         564 :         if (index < 0)
   14837           0 :             index = 0;
   14838         564 :         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
   14839           0 :             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
   14840         564 :         it->it_index = index;
   14841             :     }
   14842         564 :     Py_RETURN_NONE;
   14843             : }
   14844             : 
   14845             : PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
   14846             : 
   14847             : static PyMethodDef unicodeiter_methods[] = {
   14848             :     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
   14849             :      length_hint_doc},
   14850             :     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
   14851             :      reduce_doc},
   14852             :     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
   14853             :      setstate_doc},
   14854             :     {NULL,      NULL}       /* sentinel */
   14855             : };
   14856             : 
   14857             : PyTypeObject PyUnicodeIter_Type = {
   14858             :     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   14859             :     "str_iterator",         /* tp_name */
   14860             :     sizeof(unicodeiterobject),      /* tp_basicsize */
   14861             :     0,                  /* tp_itemsize */
   14862             :     /* methods */
   14863             :     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
   14864             :     0,                  /* tp_vectorcall_offset */
   14865             :     0,                  /* tp_getattr */
   14866             :     0,                  /* tp_setattr */
   14867             :     0,                  /* tp_as_async */
   14868             :     0,                  /* tp_repr */
   14869             :     0,                  /* tp_as_number */
   14870             :     0,                  /* tp_as_sequence */
   14871             :     0,                  /* tp_as_mapping */
   14872             :     0,                  /* tp_hash */
   14873             :     0,                  /* tp_call */
   14874             :     0,                  /* tp_str */
   14875             :     PyObject_GenericGetAttr,        /* tp_getattro */
   14876             :     0,                  /* tp_setattro */
   14877             :     0,                  /* tp_as_buffer */
   14878             :     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
   14879             :     0,                  /* tp_doc */
   14880             :     (traverseproc)unicodeiter_traverse, /* tp_traverse */
   14881             :     0,                  /* tp_clear */
   14882             :     0,                  /* tp_richcompare */
   14883             :     0,                  /* tp_weaklistoffset */
   14884             :     PyObject_SelfIter,          /* tp_iter */
   14885             :     (iternextfunc)unicodeiter_next,     /* tp_iternext */
   14886             :     unicodeiter_methods,            /* tp_methods */
   14887             :     0,
   14888             : };
   14889             : 
   14890             : PyTypeObject _PyUnicodeASCIIIter_Type = {
   14891             :     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   14892             :     .tp_name = "str_ascii_iterator",
   14893             :     .tp_basicsize = sizeof(unicodeiterobject),
   14894             :     .tp_dealloc = (destructor)unicodeiter_dealloc,
   14895             :     .tp_getattro = PyObject_GenericGetAttr,
   14896             :     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
   14897             :     .tp_traverse = (traverseproc)unicodeiter_traverse,
   14898             :     .tp_iter = PyObject_SelfIter,
   14899             :     .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
   14900             :     .tp_methods = unicodeiter_methods,
   14901             : };
   14902             : 
   14903             : static PyObject *
   14904     1023510 : unicode_iter(PyObject *seq)
   14905             : {
   14906             :     unicodeiterobject *it;
   14907             : 
   14908     1023510 :     if (!PyUnicode_Check(seq)) {
   14909           0 :         PyErr_BadInternalCall();
   14910           0 :         return NULL;
   14911             :     }
   14912     1023510 :     if (PyUnicode_IS_COMPACT_ASCII(seq)) {
   14913     1018540 :         it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
   14914             :     }
   14915             :     else {
   14916        4976 :         it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
   14917             :     }
   14918     1023510 :     if (it == NULL)
   14919           0 :         return NULL;
   14920     1023510 :     it->it_index = 0;
   14921     1023510 :     Py_INCREF(seq);
   14922     1023510 :     it->it_seq = seq;
   14923     1023510 :     _PyObject_GC_TRACK(it);
   14924     1023510 :     return (PyObject *)it;
   14925             : }
   14926             : 
   14927             : static int
   14928       12520 : encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
   14929             : {
   14930             :     int res;
   14931       12520 :     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
   14932       12520 :     if (res == -2) {
   14933           0 :         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
   14934           0 :         return -1;
   14935             :     }
   14936       12520 :     if (res < 0) {
   14937           0 :         PyErr_NoMemory();
   14938           0 :         return -1;
   14939             :     }
   14940       12520 :     return 0;
   14941             : }
   14942             : 
   14943             : 
   14944             : static int
   14945        6260 : config_get_codec_name(wchar_t **config_encoding)
   14946             : {
   14947             :     char *encoding;
   14948        6260 :     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
   14949           0 :         return -1;
   14950             :     }
   14951             : 
   14952        6260 :     PyObject *name_obj = NULL;
   14953        6260 :     PyObject *codec = _PyCodec_Lookup(encoding);
   14954        6260 :     PyMem_RawFree(encoding);
   14955             : 
   14956        6260 :     if (!codec)
   14957           0 :         goto error;
   14958             : 
   14959        6260 :     name_obj = PyObject_GetAttrString(codec, "name");
   14960        6260 :     Py_CLEAR(codec);
   14961        6260 :     if (!name_obj) {
   14962           0 :         goto error;
   14963             :     }
   14964             : 
   14965        6260 :     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
   14966        6260 :     Py_DECREF(name_obj);
   14967        6260 :     if (wname == NULL) {
   14968           0 :         goto error;
   14969             :     }
   14970             : 
   14971        6260 :     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
   14972        6260 :     if (raw_wname == NULL) {
   14973           0 :         PyMem_Free(wname);
   14974           0 :         PyErr_NoMemory();
   14975           0 :         goto error;
   14976             :     }
   14977             : 
   14978        6260 :     PyMem_RawFree(*config_encoding);
   14979        6260 :     *config_encoding = raw_wname;
   14980             : 
   14981        6260 :     PyMem_Free(wname);
   14982        6260 :     return 0;
   14983             : 
   14984           0 : error:
   14985           0 :     Py_XDECREF(codec);
   14986           0 :     Py_XDECREF(name_obj);
   14987           0 :     return -1;
   14988             : }
   14989             : 
   14990             : 
   14991             : static PyStatus
   14992        3130 : init_stdio_encoding(PyInterpreterState *interp)
   14993             : {
   14994             :     /* Update the stdio encoding to the normalized Python codec name. */
   14995        3130 :     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
   14996        3130 :     if (config_get_codec_name(&config->stdio_encoding) < 0) {
   14997           0 :         return _PyStatus_ERR("failed to get the Python codec name "
   14998             :                              "of the stdio encoding");
   14999             :     }
   15000        3130 :     return _PyStatus_OK();
   15001             : }
   15002             : 
   15003             : 
   15004             : static int
   15005        3130 : init_fs_codec(PyInterpreterState *interp)
   15006             : {
   15007        3130 :     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
   15008             : 
   15009             :     _Py_error_handler error_handler;
   15010        3130 :     error_handler = get_error_handler_wide(config->filesystem_errors);
   15011        3130 :     if (error_handler == _Py_ERROR_UNKNOWN) {
   15012           0 :         PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
   15013           0 :         return -1;
   15014             :     }
   15015             : 
   15016             :     char *encoding, *errors;
   15017        3130 :     if (encode_wstr_utf8(config->filesystem_encoding,
   15018             :                          &encoding,
   15019             :                          "filesystem_encoding") < 0) {
   15020           0 :         return -1;
   15021             :     }
   15022             : 
   15023        3130 :     if (encode_wstr_utf8(config->filesystem_errors,
   15024             :                          &errors,
   15025             :                          "filesystem_errors") < 0) {
   15026           0 :         PyMem_RawFree(encoding);
   15027           0 :         return -1;
   15028             :     }
   15029             : 
   15030        3130 :     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
   15031        3130 :     PyMem_RawFree(fs_codec->encoding);
   15032        3130 :     fs_codec->encoding = encoding;
   15033             :     /* encoding has been normalized by init_fs_encoding() */
   15034        3130 :     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
   15035        3130 :     PyMem_RawFree(fs_codec->errors);
   15036        3130 :     fs_codec->errors = errors;
   15037        3130 :     fs_codec->error_handler = error_handler;
   15038             : 
   15039             : #ifdef _Py_FORCE_UTF8_FS_ENCODING
   15040             :     assert(fs_codec->utf8 == 1);
   15041             : #endif
   15042             : 
   15043             :     /* At this point, PyUnicode_EncodeFSDefault() and
   15044             :        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
   15045             :        the C implementation of the filesystem encoding. */
   15046             : 
   15047             :     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
   15048             :        global configuration variables. */
   15049        3130 :     if (_Py_SetFileSystemEncoding(fs_codec->encoding,
   15050        3130 :                                   fs_codec->errors) < 0) {
   15051           0 :         PyErr_NoMemory();
   15052           0 :         return -1;
   15053             :     }
   15054        3130 :     return 0;
   15055             : }
   15056             : 
   15057             : 
   15058             : static PyStatus
   15059        3130 : init_fs_encoding(PyThreadState *tstate)
   15060             : {
   15061        3130 :     PyInterpreterState *interp = tstate->interp;
   15062             : 
   15063             :     /* Update the filesystem encoding to the normalized Python codec name.
   15064             :        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
   15065             :        (Python codec name). */
   15066        3130 :     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
   15067        3130 :     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
   15068           0 :         _Py_DumpPathConfig(tstate);
   15069           0 :         return _PyStatus_ERR("failed to get the Python codec "
   15070             :                              "of the filesystem encoding");
   15071             :     }
   15072             : 
   15073        3130 :     if (init_fs_codec(interp) < 0) {
   15074           0 :         return _PyStatus_ERR("cannot initialize filesystem codec");
   15075             :     }
   15076        3130 :     return _PyStatus_OK();
   15077             : }
   15078             : 
   15079             : 
   15080             : PyStatus
   15081        3130 : _PyUnicode_InitEncodings(PyThreadState *tstate)
   15082             : {
   15083        3130 :     PyStatus status = init_fs_encoding(tstate);
   15084        3130 :     if (_PyStatus_EXCEPTION(status)) {
   15085           0 :         return status;
   15086             :     }
   15087             : 
   15088        3130 :     return init_stdio_encoding(tstate->interp);
   15089             : }
   15090             : 
   15091             : 
   15092             : static void
   15093        3120 : _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
   15094             : {
   15095        3120 :     PyMem_RawFree(fs_codec->encoding);
   15096        3120 :     fs_codec->encoding = NULL;
   15097        3120 :     fs_codec->utf8 = 0;
   15098        3120 :     PyMem_RawFree(fs_codec->errors);
   15099        3120 :     fs_codec->errors = NULL;
   15100        3120 :     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
   15101        3120 : }
   15102             : 
   15103             : 
   15104             : #ifdef MS_WINDOWS
   15105             : int
   15106             : _PyUnicode_EnableLegacyWindowsFSEncoding(void)
   15107             : {
   15108             :     PyInterpreterState *interp = _PyInterpreterState_GET();
   15109             :     PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
   15110             : 
   15111             :     /* Set the filesystem encoding to mbcs/replace (PEP 529) */
   15112             :     wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
   15113             :     wchar_t *errors = _PyMem_RawWcsdup(L"replace");
   15114             :     if (encoding == NULL || errors == NULL) {
   15115             :         PyMem_RawFree(encoding);
   15116             :         PyMem_RawFree(errors);
   15117             :         PyErr_NoMemory();
   15118             :         return -1;
   15119             :     }
   15120             : 
   15121             :     PyMem_RawFree(config->filesystem_encoding);
   15122             :     config->filesystem_encoding = encoding;
   15123             :     PyMem_RawFree(config->filesystem_errors);
   15124             :     config->filesystem_errors = errors;
   15125             : 
   15126             :     return init_fs_codec(interp);
   15127             : }
   15128             : #endif
   15129             : 
   15130             : 
   15131             : #ifdef Py_DEBUG
   15132             : static inline int
   15133   249125000 : unicode_is_finalizing(void)
   15134             : {
   15135   249125000 :     return (interned == NULL);
   15136             : }
   15137             : #endif
   15138             : 
   15139             : 
   15140             : void
   15141        3120 : _PyUnicode_FiniTypes(PyInterpreterState *interp)
   15142             : {
   15143        3120 :     if (!_Py_IsMainInterpreter(interp)) {
   15144         169 :         return;
   15145             :     }
   15146             : 
   15147        2951 :     _PyStaticType_Dealloc(&EncodingMapType);
   15148        2951 :     _PyStaticType_Dealloc(&PyFieldNameIter_Type);
   15149        2951 :     _PyStaticType_Dealloc(&PyFormatterIter_Type);
   15150             : }
   15151             : 
   15152             : 
   15153     8361050 : static void unicode_static_dealloc(PyObject *op)
   15154             : {
   15155     8361050 :     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
   15156             : 
   15157     8361050 :     assert(ascii->state.compact);
   15158             : 
   15159     8361050 :     if (!ascii->state.ascii) {
   15160      416940 :         PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op;
   15161      416940 :         if (compact->utf8) {
   15162         278 :             PyObject_Free(compact->utf8);
   15163         278 :             compact->utf8 = NULL;
   15164         278 :             compact->utf8_length = 0;
   15165             :         }
   15166             :     }
   15167     8361050 : }
   15168             : 
   15169             : 
   15170             : void
   15171        3120 : _PyUnicode_Fini(PyInterpreterState *interp)
   15172             : {
   15173        3120 :     struct _Py_unicode_state *state = &interp->unicode;
   15174             : 
   15175        3120 :     if (_Py_IsMainInterpreter(interp)) {
   15176             :         // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
   15177        2951 :         assert(interned == NULL);
   15178             :         // bpo-47182: force a unicodedata CAPI capsule re-import on
   15179             :         // subsequent initialization of main interpreter.
   15180        2951 :         ucnhash_capi = NULL;
   15181             :     }
   15182             : 
   15183        3120 :     _PyUnicode_FiniEncodings(&state->fs_codec);
   15184             : 
   15185        3120 :     unicode_clear_identifiers(state);
   15186             : 
   15187             :     // Clear the single character singletons
   15188      402480 :     for (int i = 0; i < 128; i++) {
   15189      399360 :         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
   15190             :     }
   15191      402480 :     for (int i = 0; i < 128; i++) {
   15192      399360 :         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
   15193             :     }
   15194        3120 : }
   15195             : 
   15196             : 
   15197             : void
   15198     7562330 : _PyStaticUnicode_Dealloc(PyObject *op)
   15199             : {
   15200     7562330 :     unicode_static_dealloc(op);
   15201     7562330 : }
   15202             : 
   15203             : 
   15204             : /* A _string module, to export formatter_parser and formatter_field_name_split
   15205             :    to the string.Formatter class implemented in Python. */
   15206             : 
   15207             : static PyMethodDef _string_methods[] = {
   15208             :     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
   15209             :      METH_O, PyDoc_STR("split the argument as a field name")},
   15210             :     {"formatter_parser", (PyCFunction) formatter_parser,
   15211             :      METH_O, PyDoc_STR("parse the argument as a format string")},
   15212             :     {NULL, NULL}
   15213             : };
   15214             : 
   15215             : static struct PyModuleDef _string_module = {
   15216             :     PyModuleDef_HEAD_INIT,
   15217             :     .m_name = "_string",
   15218             :     .m_doc = PyDoc_STR("string helper module"),
   15219             :     .m_size = 0,
   15220             :     .m_methods = _string_methods,
   15221             : };
   15222             : 
   15223             : PyMODINIT_FUNC
   15224         714 : PyInit__string(void)
   15225             : {
   15226         714 :     return PyModuleDef_Init(&_string_module);
   15227             : }
   15228             : 
   15229             : 
   15230             : #ifdef __cplusplus
   15231             : }
   15232             : #endif

Generated by: LCOV version 1.14