Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Objects/unicodeobject.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#define PY_SSIZE_T_CLEAN
42
#include "Python.h"
43
#include "pycore_abstract.h"      // _PyIndex_Check()
44
#include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
46
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
47
#include "pycore_format.h"        // F_LJUST
48
#include "pycore_initconfig.h"    // _PyStatus_OK()
49
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
50
#include "pycore_long.h"          // _PyLong_FormatWriter()
51
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
53
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
54
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
55
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
56
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
57
#include "stringlib/eq.h"         // unicode_eq()
58
59
#ifdef MS_WINDOWS
60
#include <windows.h>
61
#endif
62
63
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
65
#endif
66
67
/* Uncomment to display statistics on interned strings at exit
68
   in _PyUnicode_ClearInterned(). */
69
/* #define INTERNED_STATS 1 */
70
71
72
/*[clinic input]
73
class str "PyObject *" "&PyUnicode_Type"
74
[clinic start generated code]*/
75
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76
77
/*[python input]
78
class Py_UCS4_converter(CConverter):
79
    type = 'Py_UCS4'
80
    converter = 'convert_uc'
81
82
    def converter_init(self):
83
        if self.default is not unspecified:
84
            self.c_default = ascii(self.default)
85
            if len(self.c_default) > 4 or self.c_default[0] != "'":
86
                self.c_default = hex(ord(self.default))
87
88
[python start generated code]*/
89
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90
91
/* --- Globals ------------------------------------------------------------
92
93
NOTE: In the interpreter's initialization phase, some globals are currently
94
      initialized dynamically as needed. In the process Unicode objects may
95
      be created before the Unicode type is ready.
96
97
*/
98
99
100
#ifdef __cplusplus
101
extern "C" {
102
#endif
103
104
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105
// The value must be the same in fileutils.c.
106
#define MAX_UNICODE 0x10ffff
107
108
#ifdef Py_DEBUG
109
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
110
#else
111
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
112
#endif
113
114
#define _PyUnicode_UTF8(op)                             \
115
    (
_PyCompactUnicodeObject_CAST14.9M
(op)->utf8)
116
#define PyUnicode_UTF8(op)                              \
117
    (assert(_PyUnicode_CHECK(op)),                      \
118
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
119
         
((char*)(32.5M
_PyASCIIObject_CAST32.5M
(op) + 1)) : \
120
         
_PyUnicode_UTF8347k
(op))
121
#define _PyUnicode_UTF8_LENGTH(op)                      \
122
    (
_PyCompactUnicodeObject_CAST289k
(op)->utf8_length)
123
#define PyUnicode_UTF8_LENGTH(op)                       \
124
    (assert(_PyUnicode_CHECK(op)),                      \
125
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
126
         
_PyASCIIObject_CAST16.0M
(op)->length16.0M
: \
127
         
_PyUnicode_UTF8_LENGTH58.1k
(op))
128
129
#define _PyUnicode_LENGTH(op)                           \
130
    (
_PyASCIIObject_CAST84.1M
(op)->length)
131
#define _PyUnicode_STATE(op)                            \
132
    (_PyASCIIObject_CAST(op)->state)
133
#define _PyUnicode_HASH(op)                             \
134
    (_PyASCIIObject_CAST(op)->hash)
135
#define _PyUnicode_KIND(op)                             \
136
    (assert(_PyUnicode_CHECK(op)),                      \
137
     _PyASCIIObject_CAST(op)->state.kind)
138
#define _PyUnicode_GET_LENGTH(op)                       \
139
    (assert(_PyUnicode_CHECK(op)),                      \
140
     _PyASCIIObject_CAST(op)->length)
141
#define _PyUnicode_DATA_ANY(op)                         \
142
    (_PyUnicodeObject_CAST(op)->data.any)
143
144
#define _PyUnicode_SHARE_UTF8(op)                       \
145
    (assert(_PyUnicode_CHECK(op)),                      \
146
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
147
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
148
149
/* true if the Unicode object has an allocated UTF-8 memory block
150
   (not shared with other data) */
151
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
152
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
153
      && 
_PyUnicode_UTF814.2M
(op) \
154
      && 
_PyUnicode_UTF8116k
(op) != 116k
PyUnicode_DATA116k
(op)))
155
156
/* Generic helper macro to convert characters of different types.
157
   from_type and to_type have to be valid type names, begin and end
158
   are pointers to the source characters which should be of type
159
   "from_type *".  to is a pointer of type "to_type *" and points to the
160
   buffer where the result characters are written to. */
161
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
162
    do {                                                \
163
        to_type *_to = (to_type *)(to);                 \
164
        const from_type *_iter = (const from_type *)(begin);\
165
        const from_type *_end = (const from_type *)(end);\
166
        Py_ssize_t n = (_end) - (_iter);                \
167
        const from_type *_unrolled_end =                \
168
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
169
        while (_iter < (_unrolled_end)) {               \
170
            _to[0] = (to_type) _iter[0];                \
171
            _to[1] = (to_type) _iter[1];                \
172
            _to[2] = (to_type) _iter[2];                \
173
            _to[3] = (to_type) _iter[3];                \
174
            _iter += 4; _to += 4;                       \
175
        }                                               \
176
        while (_iter < (_end))                          \
177
            *_to++ = (to_type) *_iter++;                \
178
    } while (0)
179
180
/* Map a character code below 256 to the matching entry of the
   _Py_SINGLETON(strings) tables: ascii[] for values below 128, latin1[]
   (indexed from 128) otherwise.  The result is cast to PyObject*.
   The argument is parenthesized so that an expression argument is treated
   as a unit; note that it is still evaluated more than once. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
184
185
#ifdef MS_WINDOWS
186
   /* On Windows, overallocating by 50% is the best factor */
187
#  define OVERALLOCATE_FACTOR 2
188
#else
189
   /* On Linux, overallocating by 25% is the best factor */
190
#  define OVERALLOCATE_FACTOR 4
191
#endif
192
193
/* This dictionary holds all interned unicode strings.  Note that references
194
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
195
   When the interned string reaches a refcnt of 0 the string deallocation
196
   function will delete the reference from this dictionary.
197
198
   Another way to look at this is to say that the actual reference
199
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
200
*/
201
static PyObject *interned = NULL;
202
203
/* Forward declaration */
204
static inline int
205
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
206
static inline void
207
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
208
static PyObject *
209
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
210
                    const char *errors);
211
static PyObject *
212
unicode_decode_utf8(const char *s, Py_ssize_t size,
213
                    _Py_error_handler error_handler, const char *errors,
214
                    Py_ssize_t *consumed);
215
#ifdef Py_DEBUG
216
static inline int unicode_is_finalizing(void);
217
static int unicode_is_singleton(PyObject *unicode);
218
#endif
219
220
221
// Return a borrowed reference to the empty string singleton.
222
static inline PyObject* unicode_get_empty(void)
223
{
224
    _Py_DECLARE_STR(empty, "");
225
    return &_Py_STR(empty);
226
}
227
228
229
// Return a strong reference to the empty string singleton.
230
static inline PyObject* unicode_new_empty(void)
{
    /* Take a new reference on the object returned by unicode_get_empty()
       and hand it to the caller. */
    return Py_NewRef(unicode_get_empty());
}
236
237
/* Statement macro: return the result of unicode_new_empty() from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_new_empty(); } while (0)
241
242
static inline void
243
unicode_fill(int kind, void *data, Py_UCS4 value,
244
             Py_ssize_t start, Py_ssize_t length)
245
{
246
    assert(0 <= start);
247
    switch (kind) {
248
    case PyUnicode_1BYTE_KIND: {
  Branch (248:5): [True: 252k, False: 489]
249
        assert(value <= 0xff);
250
        Py_UCS1 ch = (unsigned char)value;
251
        Py_UCS1 *to = (Py_UCS1 *)data + start;
252
        memset(to, ch, length);
253
        break;
254
    }
255
    case PyUnicode_2BYTE_KIND: {
  Branch (255:5): [True: 461, False: 252k]
256
        assert(value <= 0xffff);
257
        Py_UCS2 ch = (Py_UCS2)value;
258
        Py_UCS2 *to = (Py_UCS2 *)data + start;
259
        const Py_UCS2 *end = to + length;
260
        for (; to < end; 
++to10.2k
)
*to = ch10.2k
;
  Branch (260:16): [True: 10.2k, False: 461]
261
        break;
262
    }
263
    case PyUnicode_4BYTE_KIND: {
  Branch (263:5): [True: 28, False: 253k]
264
        assert(value <= MAX_UNICODE);
265
        Py_UCS4 ch = value;
266
        Py_UCS4 * to = (Py_UCS4 *)data + start;
267
        const Py_UCS4 *end = to + length;
268
        for (; to < end; 
++to85
)
*to = ch85
;
  Branch (268:16): [True: 85, False: 28]
269
        break;
270
    }
271
    default: Py_UNREACHABLE();
  Branch (271:5): [True: 0, False: 253k]
272
    }
273
}
274
275
276
/* Fast detection of the most frequent whitespace characters */
277
/* 128-entry lookup table indexed by ASCII code: an entry is 1 for the
   characters listed below and 0 (implicitly) for every other code. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
306
307
/* forward */
308
static PyObject* get_latin1_char(unsigned char ch);
309
static int unicode_modifiable(PyObject *unicode);
310
311
312
static PyObject *
313
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
314
static PyObject *
315
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
316
static PyObject *
317
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
318
319
static PyObject *
320
unicode_encode_call_errorhandler(const char *errors,
321
       PyObject **errorHandler,const char *encoding, const char *reason,
322
       PyObject *unicode, PyObject **exceptionObject,
323
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
324
325
static void
326
raise_encode_exception(PyObject **exceptionObject,
327
                       const char *encoding,
328
                       PyObject *unicode,
329
                       Py_ssize_t startpos, Py_ssize_t endpos,
330
                       const char *reason);
331
332
/* Same for linebreaks */
333
/* 128-entry lookup table indexed by ASCII code: an entry is 1 for the
   characters listed below and 0 (implicitly) for every other code. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
359
360
static int convert_uc(PyObject *obj, void *addr);
361
362
struct encoding_map;
363
#include "clinic/unicodeobject.c.h"
364
365
_Py_error_handler
366
_Py_GetErrorHandler(const char *errors)
367
{
368
    if (errors == NULL || 
strcmp(errors, "strict") == 0181k
) {
  Branch (368:9): [True: 22.2k, False: 181k]
  Branch (368:27): [True: 485, False: 181k]
369
        return _Py_ERROR_STRICT;
370
    }
371
    if (strcmp(errors, "surrogateescape") == 0) {
  Branch (371:9): [True: 175k, False: 5.88k]
372
        return _Py_ERROR_SURROGATEESCAPE;
373
    }
374
    if (strcmp(errors, "replace") == 0) {
  Branch (374:9): [True: 820, False: 5.06k]
375
        return _Py_ERROR_REPLACE;
376
    }
377
    if (strcmp(errors, "ignore") == 0) {
  Branch (377:9): [True: 613, False: 4.45k]
378
        return _Py_ERROR_IGNORE;
379
    }
380
    if (strcmp(errors, "backslashreplace") == 0) {
  Branch (380:9): [True: 3.73k, False: 716]
381
        return _Py_ERROR_BACKSLASHREPLACE;
382
    }
383
    if (strcmp(errors, "surrogatepass") == 0) {
  Branch (383:9): [True: 372, False: 344]
384
        return _Py_ERROR_SURROGATEPASS;
385
    }
386
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
  Branch (386:9): [True: 190, False: 154]
387
        return _Py_ERROR_XMLCHARREFREPLACE;
388
    }
389
    return _Py_ERROR_OTHER;
390
}
391
392
393
static _Py_error_handler
394
get_error_handler_wide(const wchar_t *errors)
395
{
396
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
  Branch (396:9): [True: 0, False: 99.1k]
  Branch (396:27): [True: 0, False: 99.1k]
397
        return _Py_ERROR_STRICT;
398
    }
399
    if (wcscmp(errors, L"surrogateescape") == 0) {
  Branch (399:9): [True: 99.1k, False: 0]
400
        return _Py_ERROR_SURROGATEESCAPE;
401
    }
402
    if (wcscmp(errors, L"replace") == 0) {
  Branch (402:9): [True: 0, False: 0]
403
        return _Py_ERROR_REPLACE;
404
    }
405
    if (wcscmp(errors, L"ignore") == 0) {
  Branch (405:9): [True: 0, False: 0]
406
        return _Py_ERROR_IGNORE;
407
    }
408
    if (wcscmp(errors, L"backslashreplace") == 0) {
  Branch (408:9): [True: 0, False: 0]
409
        return _Py_ERROR_BACKSLASHREPLACE;
410
    }
411
    if (wcscmp(errors, L"surrogatepass") == 0) {
  Branch (411:9): [True: 0, False: 0]
412
        return _Py_ERROR_SURROGATEPASS;
413
    }
414
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
  Branch (414:9): [True: 0, False: 0]
415
        return _Py_ERROR_XMLCHARREFREPLACE;
416
    }
417
    return _Py_ERROR_OTHER;
418
}
419
420
421
static inline int
422
unicode_check_encoding_errors(const char *encoding, const char *errors)
423
{
424
    if (encoding == NULL && 
errors == NULL92.7k
) {
  Branch (424:9): [True: 92.7k, False: 8.75M]
  Branch (424:29): [True: 92.7k, False: 4]
425
        return 0;
426
    }
427
428
    PyInterpreterState *interp = _PyInterpreterState_GET();
429
#ifndef Py_DEBUG
430
    /* In release mode, only check in development mode (-X dev) */
431
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
  Branch (431:9): [True: 8.75M, False: 40]
432
        return 0;
433
    }
434
#else
435
    /* Always check in debug mode */
436
#endif
437
438
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
439
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
440
    if (!interp->unicode.fs_codec.encoding) {
  Branch (440:9): [True: 0, False: 40]
441
        return 0;
442
    }
443
444
    /* Disable checks during Python finalization. For example, it allows to
445
       call _PyObject_Dump() during finalization for debugging purpose. */
446
    if (interp->finalizing) {
  Branch (446:9): [True: 0, False: 40]
447
        return 0;
448
    }
449
450
    if (encoding != NULL
  Branch (450:9): [True: 40, False: 0]
451
        // Fast path for the most common built-in encodings. Even if the codec
452
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
453
        // create a temporary Unicode string (the key in the cache).
454
        && strcmp(encoding, "utf-8") != 0
  Branch (454:12): [True: 15, False: 25]
455
        && 
strcmp(encoding, "utf8") != 015
  Branch (455:12): [True: 15, False: 0]
456
        && 
strcmp(encoding, "ascii") != 015
)
  Branch (456:12): [True: 5, False: 10]
457
    {
458
        PyObject *handler = _PyCodec_Lookup(encoding);
459
        if (handler == NULL) {
  Branch (459:13): [True: 0, False: 5]
460
            return -1;
461
        }
462
        Py_DECREF(handler);
463
    }
464
465
    if (errors != NULL
  Branch (465:9): [True: 25, False: 15]
466
        // Fast path for the most common built-in error handlers.
467
        && 
strcmp(errors, "strict") != 025
  Branch (467:12): [True: 25, False: 0]
468
        && 
strcmp(errors, "ignore") != 025
  Branch (468:12): [True: 25, False: 0]
469
        && 
strcmp(errors, "replace") != 025
  Branch (469:12): [True: 25, False: 0]
470
        && 
strcmp(errors, "surrogateescape") != 025
  Branch (470:12): [True: 0, False: 25]
471
        && 
strcmp(errors, "surrogatepass") != 00
)
  Branch (471:12): [True: 0, False: 0]
472
    {
473
        PyObject *handler = PyCodec_LookupError(errors);
474
        if (handler == NULL) {
  Branch (474:13): [True: 0, False: 0]
475
            return -1;
476
        }
477
        Py_DECREF(handler);
478
    }
479
    return 0;
480
}
481
482
483
int
484
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
485
{
486
#define CHECK(expr) \
487
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
488
489
    assert(op != NULL);
490
    CHECK(PyUnicode_Check(op));
491
492
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
493
    int kind = ascii->state.kind;
494
495
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
  Branch (495:9): [True: 0, False: 0]
  Branch (495:36): [True: 0, False: 0]
496
        CHECK(kind == PyUnicode_1BYTE_KIND);
497
    }
498
    else {
499
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
500
        void *data;
501
502
        if (ascii->state.compact == 1) {
  Branch (502:13): [True: 0, False: 0]
503
            data = compact + 1;
504
            CHECK(kind == PyUnicode_1BYTE_KIND
505
                                 || kind == PyUnicode_2BYTE_KIND
506
                                 || kind == PyUnicode_4BYTE_KIND);
507
            CHECK(ascii->state.ascii == 0);
508
            CHECK(compact->utf8 != data);
509
        }
510
        else {
511
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
512
513
            data = unicode->data.any;
514
            CHECK(kind == PyUnicode_1BYTE_KIND
515
                     || kind == PyUnicode_2BYTE_KIND
516
                     || kind == PyUnicode_4BYTE_KIND);
517
            CHECK(ascii->state.compact == 0);
518
            CHECK(data != NULL);
519
            if (ascii->state.ascii) {
  Branch (519:17): [True: 0, False: 0]
520
                CHECK(compact->utf8 == data);
521
                CHECK(compact->utf8_length == ascii->length);
522
            }
523
            else {
524
                CHECK(compact->utf8 != data);
525
            }
526
        }
527
528
        if (compact->utf8 == NULL)
  Branch (528:13): [True: 0, False: 0]
529
            CHECK(compact->utf8_length == 0);
530
    }
531
532
    /* check that the best kind is used: O(n) operation */
533
    if (check_content) {
  Branch (533:9): [True: 0, False: 0]
534
        Py_ssize_t i;
535
        Py_UCS4 maxchar = 0;
536
        const void *data;
537
        Py_UCS4 ch;
538
539
        data = PyUnicode_DATA(ascii);
540
        for (i=0; i < ascii->length; i++)
  Branch (540:19): [True: 0, False: 0]
541
        {
542
            ch = PyUnicode_READ(kind, data, i);
543
            if (ch > maxchar)
  Branch (543:17): [True: 0, False: 0]
544
                maxchar = ch;
545
        }
546
        if (kind == PyUnicode_1BYTE_KIND) {
  Branch (546:13): [True: 0, False: 0]
547
            if (ascii->state.ascii == 0) {
  Branch (547:17): [True: 0, False: 0]
548
                CHECK(maxchar >= 128);
549
                CHECK(maxchar <= 255);
550
            }
551
            else
552
                CHECK(maxchar < 128);
553
        }
554
        else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (554:18): [True: 0, False: 0]
555
            CHECK(maxchar >= 0x100);
556
            CHECK(maxchar <= 0xFFFF);
557
        }
558
        else {
559
            CHECK(maxchar >= 0x10000);
560
            CHECK(maxchar <= MAX_UNICODE);
561
        }
562
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
563
    }
564
    return 1;
565
566
#undef CHECK
567
}
568
569
/* Normalize a freshly built string before returning it: a zero-length
   string is replaced by the object from unicode_get_empty(), and a
   one-character 1-byte-kind string by the matching LATIN1() object.  When a
   replacement happens, the reference to `unicode` is released and a new
   reference to the replacement is returned; otherwise `unicode` itself is
   returned. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        PyObject *empty = unicode_get_empty();
        if (unicode == empty) {
            return empty;
        }
        Py_DECREF(unicode);
        Py_INCREF(empty);
        return empty;
    }

    if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        PyObject *latin1_char = LATIN1(ch);
        if (unicode == latin1_char) {
            return latin1_char;
        }
        Py_INCREF(latin1_char);
        Py_DECREF(unicode);
        return latin1_char;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
601
602
/* Return a new reference to `unicode` when it is an exact str instance;
   for a subtype instance, return the result of _PyUnicode_Copy() instead,
   i.e. a genuine unicode string with the same value. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    Py_INCREF(unicode);
    return unicode;
}
613
614
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
615
   ASCII, Latin1, UTF-8, etc. */
616
static char*
617
backslashreplace(_PyBytesWriter *writer, char *str,
618
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
619
{
620
    Py_ssize_t size, i;
621
    Py_UCS4 ch;
622
    int kind;
623
    const void *data;
624
625
    kind = PyUnicode_KIND(unicode);
626
    data = PyUnicode_DATA(unicode);
627
628
    size = 0;
629
    /* determine replacement size */
630
    for (i = collstart; i < collend; 
++i23.5k
) {
  Branch (630:25): [True: 23.5k, False: 8.19k]
631
        Py_ssize_t incr;
632
633
        ch = PyUnicode_READ(kind, data, i);
634
        if (ch < 0x100)
  Branch (634:13): [True: 4.58k, False: 19.0k]
635
            incr = 2+2;
636
        else if (ch < 0x10000)
  Branch (636:18): [True: 18.7k, False: 293]
637
            incr = 2+4;
638
        else {
639
            assert(ch <= MAX_UNICODE);
640
            incr = 2+8;
641
        }
642
        if (size > PY_SSIZE_T_MAX - incr) {
  Branch (642:13): [True: 0, False: 23.5k]
643
            PyErr_SetString(PyExc_OverflowError,
644
                            "encoded result is too long for a Python string");
645
            return NULL;
646
        }
647
        size += incr;
648
    }
649
650
    str = _PyBytesWriter_Prepare(writer, str, size);
651
    if (str == NULL)
  Branch (651:9): [True: 0, False: 8.19k]
652
        return NULL;
653
654
    /* generate replacement */
655
    
for (i = collstart; 8.19k
i < collend;
++i23.5k
) {
  Branch (655:25): [True: 23.5k, False: 8.19k]
656
        ch = PyUnicode_READ(kind, data, i);
657
        *str++ = '\\';
658
        if (ch >= 0x00010000) {
  Branch (658:13): [True: 293, False: 23.3k]
659
            *str++ = 'U';
660
            *str++ = Py_hexdigits[(ch>>28)&0xf];
661
            *str++ = Py_hexdigits[(ch>>24)&0xf];
662
            *str++ = Py_hexdigits[(ch>>20)&0xf];
663
            *str++ = Py_hexdigits[(ch>>16)&0xf];
664
            *str++ = Py_hexdigits[(ch>>12)&0xf];
665
            *str++ = Py_hexdigits[(ch>>8)&0xf];
666
        }
667
        else if (ch >= 0x100) {
  Branch (667:18): [True: 18.7k, False: 4.58k]
668
            *str++ = 'u';
669
            *str++ = Py_hexdigits[(ch>>12)&0xf];
670
            *str++ = Py_hexdigits[(ch>>8)&0xf];
671
        }
672
        else
673
            *str++ = 'x';
674
        *str++ = Py_hexdigits[(ch>>4)&0xf];
675
        *str++ = Py_hexdigits[ch&0xf];
676
    }
677
    return str;
678
}
679
680
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
681
   ASCII, Latin1, UTF-8, etc. */
682
static char*
683
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
684
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
685
{
686
    Py_ssize_t size, i;
687
    Py_UCS4 ch;
688
    int kind;
689
    const void *data;
690
691
    kind = PyUnicode_KIND(unicode);
692
    data = PyUnicode_DATA(unicode);
693
694
    size = 0;
695
    /* determine replacement size */
696
    for (i = collstart; i < collend; 
++i4.31k
) {
  Branch (696:25): [True: 4.31k, False: 1.24k]
697
        Py_ssize_t incr;
698
699
        ch = PyUnicode_READ(kind, data, i);
700
        if (ch < 10)
  Branch (700:13): [True: 0, False: 4.31k]
701
            incr = 2+1+1;
702
        else if (ch < 100)
  Branch (702:18): [True: 0, False: 4.31k]
703
            incr = 2+2+1;
704
        else if (ch < 1000)
  Branch (704:18): [True: 1.06k, False: 3.24k]
705
            incr = 2+3+1;
706
        else if (ch < 10000)
  Branch (706:18): [True: 193, False: 3.05k]
707
            incr = 2+4+1;
708
        else if (ch < 100000)
  Branch (708:18): [True: 3.05k, False: 3]
709
            incr = 2+5+1;
710
        else if (ch < 1000000)
  Branch (710:18): [True: 2, False: 1]
711
            incr = 2+6+1;
712
        else {
713
            assert(ch <= MAX_UNICODE);
714
            incr = 2+7+1;
715
        }
716
        if (size > PY_SSIZE_T_MAX - incr) {
  Branch (716:13): [True: 0, False: 4.31k]
717
            PyErr_SetString(PyExc_OverflowError,
718
                            "encoded result is too long for a Python string");
719
            return NULL;
720
        }
721
        size += incr;
722
    }
723
724
    str = _PyBytesWriter_Prepare(writer, str, size);
725
    if (str == NULL)
  Branch (725:9): [True: 0, False: 1.24k]
726
        return NULL;
727
728
    /* generate replacement */
729
    
for (i = collstart; 1.24k
i < collend;
++i4.31k
) {
  Branch (729:25): [True: 4.31k, False: 1.24k]
730
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
731
        if (size < 0) {
  Branch (731:13): [True: 0, False: 4.31k]
732
            return NULL;
733
        }
734
        str += size;
735
    }
736
    return str;
737
}
738
739
/* --- Bloom Filters ----------------------------------------------------- */
740
741
/* stuff to implement simple "bloom filters" for Unicode characters.
742
   to keep things simple, we use a single bitmask, using the low-order bits
743
   (ch & (BLOOM_WIDTH - 1)) of each Unicode character as the bit index. */
744
745
/* the linebreak mask is set up by _PyUnicode_Init() below */
746
747
#if LONG_BIT >= 128
748
#define BLOOM_WIDTH 128
749
#elif LONG_BIT >= 64
750
#define BLOOM_WIDTH 64
751
#elif LONG_BIT >= 32
752
#define BLOOM_WIDTH 32
753
#else
754
#error "LONG_BIT is smaller than 32"
755
#endif
756
757
#define BLOOM_MASK unsigned long
758
759
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
760
761
/* Non-zero when the bit selected by the low-order bits of `ch`
   (ch & (BLOOM_WIDTH - 1)) is set in `mask`.  Both arguments are
   parenthesized so that expression arguments are treated as a unit. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
762
763
#define BLOOM_LINEBREAK(ch)                                             \
764
    ((ch) < 128U ? 
ascii_linebreak[(ch)]11.1M
: \
765
     
(951k
BLOOM951k
(bloom_linebreak, (ch)) &&
Py_UNICODE_ISLINEBREAK118k
(ch)))
766
767
static inline BLOOM_MASK
768
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
769
{
770
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
771
    do {                                               \
772
        TYPE *data = (TYPE *)PTR;                      \
773
        TYPE *end = data + LEN;                        \
774
        Py_UCS4 ch;                                    \
775
        for (; data != end; 
data++1.55M
) { \
776
            ch = *data;                                \
777
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
778
        }                                              \
779
        break;                                         \
780
    } while (
00
)
781
782
    /* calculate simple bloom-style bitmask for a given unicode string */
783
784
    BLOOM_MASK mask;
785
786
    mask = 0;
787
    switch (kind) {
788
    case PyUnicode_1BYTE_KIND:
  Branch (788:5): [True: 1.22M, False: 107]
789
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
790
        break;
791
    case PyUnicode_2BYTE_KIND:
  Branch (791:5): [True: 107, False: 1.22M]
792
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
793
        break;
794
    case PyUnicode_4BYTE_KIND:
  Branch (794:5): [True: 0, False: 1.23M]
795
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
796
        break;
797
    default:
  Branch (797:5): [True: 0, False: 1.23M]
798
        Py_UNREACHABLE();
799
    }
800
    return mask;
801
802
#undef BLOOM_UPDATE
803
}
804
805
static int
806
ensure_unicode(PyObject *obj)
807
{
808
    if (!PyUnicode_Check(obj)) {
  Branch (808:9): [True: 15, False: 19.1M]
809
        PyErr_Format(PyExc_TypeError,
810
                     "must be str, not %.100s",
811
                     Py_TYPE(obj)->tp_name);
812
        return -1;
813
    }
814
    return 0;
815
}
816
817
/* Compilation of templated routines */
818
819
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
820
821
#include "stringlib/asciilib.h"
822
#include "stringlib/fastsearch.h"
823
#include "stringlib/partition.h"
824
#include "stringlib/split.h"
825
#include "stringlib/count.h"
826
#include "stringlib/find.h"
827
#include "stringlib/find_max_char.h"
828
#include "stringlib/undef.h"
829
830
#include "stringlib/ucs1lib.h"
831
#include "stringlib/fastsearch.h"
832
#include "stringlib/partition.h"
833
#include "stringlib/split.h"
834
#include "stringlib/count.h"
835
#include "stringlib/find.h"
836
#include "stringlib/replace.h"
837
#include "stringlib/find_max_char.h"
838
#include "stringlib/undef.h"
839
840
#include "stringlib/ucs2lib.h"
841
#include "stringlib/fastsearch.h"
842
#include "stringlib/partition.h"
843
#include "stringlib/split.h"
844
#include "stringlib/count.h"
845
#include "stringlib/find.h"
846
#include "stringlib/replace.h"
847
#include "stringlib/find_max_char.h"
848
#include "stringlib/undef.h"
849
850
#include "stringlib/ucs4lib.h"
851
#include "stringlib/fastsearch.h"
852
#include "stringlib/partition.h"
853
#include "stringlib/split.h"
854
#include "stringlib/count.h"
855
#include "stringlib/find.h"
856
#include "stringlib/replace.h"
857
#include "stringlib/find_max_char.h"
858
#include "stringlib/undef.h"
859
860
#undef STRINGLIB_GET_EMPTY
861
862
/* --- Unicode Object ----------------------------------------------------- */
863
864
static inline Py_ssize_t
865
findchar(const void *s, int kind,
866
         Py_ssize_t size, Py_UCS4 ch,
867
         int direction)
868
{
869
    switch (kind) {
870
    case PyUnicode_1BYTE_KIND:
  Branch (870:5): [True: 6.00M, False: 1.12M]
871
        if ((Py_UCS1) ch != ch)
  Branch (871:13): [True: 2, False: 6.00M]
872
            return -1;
873
        if (direction > 0)
  Branch (873:13): [True: 5.30M, False: 705k]
874
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
875
        else
876
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
877
    case PyUnicode_2BYTE_KIND:
  Branch (877:5): [True: 73.4k, False: 7.05M]
878
        if ((Py_UCS2) ch != ch)
  Branch (878:13): [True: 0, False: 73.4k]
879
            return -1;
880
        if (direction > 0)
  Branch (880:13): [True: 73.1k, False: 332]
881
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
882
        else
883
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
884
    case PyUnicode_4BYTE_KIND:
  Branch (884:5): [True: 1.04M, False: 6.07M]
885
        if (direction > 0)
  Branch (885:13): [True: 1.04M, False: 9]
886
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
887
        else
888
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
889
    default:
  Branch (889:5): [True: 0, False: 7.12M]
890
        Py_UNREACHABLE();
891
    }
892
}
893
894
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the code units from old_length up to the current length are
   overwritten, every byte being set to 0xff.  Nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int unit_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length >= new_length) {
        return;
    }
    memset(bytes + old_length * unit_size, 0xff,
           (new_length - old_length) * unit_size);
}
#endif
912
913
static PyObject*
914
resize_compact(PyObject *unicode, Py_ssize_t length)
915
{
916
    Py_ssize_t char_size;
917
    Py_ssize_t struct_size;
918
    Py_ssize_t new_size;
919
    PyObject *new_unicode;
920
#ifdef Py_DEBUG
921
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
922
#endif
923
924
    assert(unicode_modifiable(unicode));
925
    assert(PyUnicode_IS_COMPACT(unicode));
926
927
    char_size = PyUnicode_KIND(unicode);
928
    if (PyUnicode_IS_ASCII(unicode))
929
        struct_size = sizeof(PyASCIIObject);
930
    else
931
        struct_size = sizeof(PyCompactUnicodeObject);
932
933
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
  Branch (933:9): [True: 0, False: 13.1M]
934
        PyErr_NoMemory();
935
        return NULL;
936
    }
937
    new_size = (struct_size + (length + 1) * char_size);
938
939
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
940
        PyObject_Free(_PyUnicode_UTF8(unicode));
941
        _PyUnicode_UTF8(unicode) = NULL;
942
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
943
    }
944
#ifdef Py_REF_DEBUG
945
    _Py_RefTotal--;
946
#endif
947
#ifdef Py_TRACE_REFS
948
    _Py_ForgetReference(unicode);
949
#endif
950
951
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
952
    if (new_unicode == NULL) {
  Branch (952:9): [True: 0, False: 13.1M]
953
        _Py_NewReference(unicode);
954
        PyErr_NoMemory();
955
        return NULL;
956
    }
957
    unicode = new_unicode;
958
    _Py_NewReference(unicode);
959
960
    _PyUnicode_LENGTH(unicode) = length;
961
#ifdef Py_DEBUG
962
    unicode_fill_invalid(unicode, old_length);
963
#endif
964
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
965
                    length, 0);
966
    assert(_PyUnicode_CheckConsistency(unicode, 0));
967
    return unicode;
968
}
969
970
static int
971
resize_inplace(PyObject *unicode, Py_ssize_t length)
972
{
973
    assert(!PyUnicode_IS_COMPACT(unicode));
974
    assert(Py_REFCNT(unicode) == 1);
975
976
    Py_ssize_t new_size;
977
    Py_ssize_t char_size;
978
    int share_utf8;
979
    void *data;
980
#ifdef Py_DEBUG
981
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
982
#endif
983
984
    data = _PyUnicode_DATA_ANY(unicode);
985
    char_size = PyUnicode_KIND(unicode);
986
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
987
988
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
  Branch (988:9): [True: 0, False: 0]
989
        PyErr_NoMemory();
990
        return -1;
991
    }
992
    new_size = (length + 1) * char_size;
993
994
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
  Branch (994:9): [True: 0, False: 0]
995
    {
996
        PyObject_Free(_PyUnicode_UTF8(unicode));
997
        _PyUnicode_UTF8(unicode) = NULL;
998
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
999
    }
1000
1001
    data = (PyObject *)PyObject_Realloc(data, new_size);
1002
    if (data == NULL) {
  Branch (1002:9): [True: 0, False: 0]
1003
        PyErr_NoMemory();
1004
        return -1;
1005
    }
1006
    _PyUnicode_DATA_ANY(unicode) = data;
1007
    if (share_utf8) {
  Branch (1007:9): [True: 0, False: 0]
1008
        _PyUnicode_UTF8(unicode) = data;
1009
        _PyUnicode_UTF8_LENGTH(unicode) = length;
1010
    }
1011
    _PyUnicode_LENGTH(unicode) = length;
1012
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1013
#ifdef Py_DEBUG
1014
    unicode_fill_invalid(unicode, old_length);
1015
#endif
1016
1017
    /* check for integer overflow */
1018
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
  Branch (1018:9): [True: 0, False: 0]
1019
        PyErr_NoMemory();
1020
        return -1;
1021
    }
1022
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1023
    return 0;
1024
}
1025
1026
static PyObject*
1027
resize_copy(PyObject *unicode, Py_ssize_t length)
1028
{
1029
    Py_ssize_t copy_length;
1030
    PyObject *copy;
1031
1032
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1033
    if (copy == NULL)
  Branch (1033:9): [True: 0, False: 0]
1034
        return NULL;
1035
1036
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1037
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1038
    return copy;
1039
}
1040
1041
static const char*
1042
unicode_kind_name(PyObject *unicode)
1043
{
1044
    /* don't check consistency: unicode_kind_name() is called from
1045
       _PyUnicode_Dump() */
1046
    if (!PyUnicode_IS_COMPACT(unicode))
  Branch (1046:9): [True: 0, False: 12]
1047
    {
1048
        switch (PyUnicode_KIND(unicode))
1049
        {
1050
        case PyUnicode_1BYTE_KIND:
  Branch (1050:9): [True: 0, False: 0]
1051
            if (PyUnicode_IS_ASCII(unicode))
1052
                return "legacy ascii";
1053
            else
1054
                return "legacy latin1";
1055
        case PyUnicode_2BYTE_KIND:
  Branch (1055:9): [True: 0, False: 0]
1056
            return "legacy UCS2";
1057
        case PyUnicode_4BYTE_KIND:
  Branch (1057:9): [True: 0, False: 0]
1058
            return "legacy UCS4";
1059
        default:
  Branch (1059:9): [True: 0, False: 0]
1060
            return "<legacy invalid kind>";
1061
        }
1062
    }
1063
    switch (PyUnicode_KIND(unicode)) {
1064
    case PyUnicode_1BYTE_KIND:
  Branch (1064:5): [True: 6, False: 6]
1065
        if (PyUnicode_IS_ASCII(unicode))
1066
            return "ascii";
1067
        else
1068
            return "latin1";
1069
    case PyUnicode_2BYTE_KIND:
  Branch (1069:5): [True: 3, False: 9]
1070
        return "UCS2";
1071
    case PyUnicode_4BYTE_KIND:
  Branch (1071:5): [True: 3, False: 9]
1072
        return "UCS4";
1073
    default:
  Branch (1073:5): [True: 0, False: 12]
1074
        return "<invalid compact kind>";
1075
    }
1076
}
1077
1078
#ifdef Py_DEBUG
1079
/* Functions wrapping macros for use in debugger */
1080
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1084
1085
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1089
const void *_PyUnicode_data(void *unicode_raw) {
1090
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1091
    printf("obj %p\n", (void*)unicode);
1092
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1093
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1094
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1095
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1096
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1097
    return PyUnicode_DATA(unicode);
1098
}
1099
1100
void
1101
_PyUnicode_Dump(PyObject *op)
1102
{
1103
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1104
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1105
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1106
    const void *data;
1107
1108
    if (ascii->state.compact)
1109
    {
1110
        if (ascii->state.ascii)
1111
            data = (ascii + 1);
1112
        else
1113
            data = (compact + 1);
1114
    }
1115
    else
1116
        data = unicode->data.any;
1117
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1118
1119
    if (!ascii->state.ascii) {
1120
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1121
    }
1122
    printf(", data=%p\n", data);
1123
}
1124
#endif
1125
1126
1127
PyObject *
1128
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1129
{
1130
    /* Optimization for empty strings */
1131
    if (size == 0) {
  Branch (1131:9): [True: 621k, False: 70.4M]
1132
        return unicode_new_empty();
1133
    }
1134
1135
    PyObject *obj;
1136
    PyCompactUnicodeObject *unicode;
1137
    void *data;
1138
    int kind;
1139
    int is_ascii;
1140
    Py_ssize_t char_size;
1141
    Py_ssize_t struct_size;
1142
1143
    is_ascii = 0;
1144
    struct_size = sizeof(PyCompactUnicodeObject);
1145
    if (maxchar < 128) {
  Branch (1145:9): [True: 56.7M, False: 13.6M]
1146
        kind = PyUnicode_1BYTE_KIND;
1147
        char_size = 1;
1148
        is_ascii = 1;
1149
        struct_size = sizeof(PyASCIIObject);
1150
    }
1151
    else if (maxchar < 256) {
  Branch (1151:14): [True: 230k, False: 13.4M]
1152
        kind = PyUnicode_1BYTE_KIND;
1153
        char_size = 1;
1154
    }
1155
    else if (maxchar < 65536) {
  Branch (1155:14): [True: 1.92M, False: 11.5M]
1156
        kind = PyUnicode_2BYTE_KIND;
1157
        char_size = 2;
1158
    }
1159
    else {
1160
        if (maxchar > MAX_UNICODE) {
  Branch (1160:13): [True: 0, False: 11.5M]
1161
            PyErr_SetString(PyExc_SystemError,
1162
                            "invalid maximum character passed to PyUnicode_New");
1163
            return NULL;
1164
        }
1165
        kind = PyUnicode_4BYTE_KIND;
1166
        char_size = 4;
1167
    }
1168
1169
    /* Ensure we won't overflow the size. */
1170
    if (size < 0) {
  Branch (1170:9): [True: 0, False: 70.4M]
1171
        PyErr_SetString(PyExc_SystemError,
1172
                        "Negative size passed to PyUnicode_New");
1173
        return NULL;
1174
    }
1175
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
  Branch (1175:9): [True: 8, False: 70.4M]
1176
        return PyErr_NoMemory();
1177
1178
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1179
     * PyObject_New() so we are able to allocate space for the object and
1180
     * it's data buffer.
1181
     */
1182
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1183
    if (obj == NULL) {
  Branch (1183:9): [True: 0, False: 70.4M]
1184
        return PyErr_NoMemory();
1185
    }
1186
    _PyObject_Init(obj, &PyUnicode_Type);
1187
1188
    unicode = (PyCompactUnicodeObject *)obj;
1189
    if (is_ascii)
  Branch (1189:9): [True: 56.7M, False: 13.6M]
1190
        data = ((PyASCIIObject*)obj) + 1;
1191
    else
1192
        data = unicode + 1;
1193
    _PyUnicode_LENGTH(unicode) = size;
1194
    _PyUnicode_HASH(unicode) = -1;
1195
    _PyUnicode_STATE(unicode).interned = 0;
1196
    _PyUnicode_STATE(unicode).kind = kind;
1197
    _PyUnicode_STATE(unicode).compact = 1;
1198
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1199
    if (is_ascii) {
  Branch (1199:9): [True: 56.7M, False: 13.6M]
1200
        ((char*)data)[size] = 0;
1201
    }
1202
    else if (kind == PyUnicode_1BYTE_KIND) {
  Branch (1202:14): [True: 230k, False: 13.4M]
1203
        ((char*)data)[size] = 0;
1204
        unicode->utf8 = NULL;
1205
        unicode->utf8_length = 0;
1206
    }
1207
    else {
1208
        unicode->utf8 = NULL;
1209
        unicode->utf8_length = 0;
1210
        if (kind == PyUnicode_2BYTE_KIND)
  Branch (1210:13): [True: 1.92M, False: 11.5M]
1211
            ((Py_UCS2*)data)[size] = 0;
1212
        else /* kind == PyUnicode_4BYTE_KIND */
1213
            ((Py_UCS4*)data)[size] = 0;
1214
    }
1215
#ifdef Py_DEBUG
1216
    unicode_fill_invalid((PyObject*)unicode, 0);
1217
#endif
1218
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1219
    return obj;
1220
}
1221
1222
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate immediately followed by a low surrogate becomes one code
   point; every other unit is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    const wchar_t *src = begin;
    Py_UCS4 *dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1261
1262
static int
1263
unicode_check_modifiable(PyObject *unicode)
1264
{
1265
    if (!unicode_modifiable(unicode)) {
  Branch (1265:9): [True: 0, False: 8.76k]
1266
        PyErr_SetString(PyExc_SystemError,
1267
                        "Cannot modify a string currently used");
1268
        return -1;
1269
    }
1270
    return 0;
1271
}
1272
1273
static int
1274
_copy_characters(PyObject *to, Py_ssize_t to_start,
1275
                 PyObject *from, Py_ssize_t from_start,
1276
                 Py_ssize_t how_many, int check_maxchar)
1277
{
1278
    int from_kind, to_kind;
1279
    const void *from_data;
1280
    void *to_data;
1281
1282
    assert(0 <= how_many);
1283
    assert(0 <= from_start);
1284
    assert(0 <= to_start);
1285
    assert(PyUnicode_Check(from));
1286
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1287
1288
    assert(PyUnicode_Check(to));
1289
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1290
1291
    if (how_many == 0)
  Branch (1291:9): [True: 175k, False: 53.7M]
1292
        return 0;
1293
1294
    from_kind = PyUnicode_KIND(from);
1295
    from_data = PyUnicode_DATA(from);
1296
    to_kind = PyUnicode_KIND(to);
1297
    to_data = PyUnicode_DATA(to);
1298
1299
#ifdef Py_DEBUG
1300
    if (!check_maxchar
1301
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1302
    {
1303
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1304
        Py_UCS4 ch;
1305
        Py_ssize_t i;
1306
        for (i=0; i < how_many; i++) {
1307
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1308
            assert(ch <= to_maxchar);
1309
        }
1310
    }
1311
#endif
1312
1313
    if (from_kind == to_kind) {
  Branch (1313:9): [True: 53.3M, False: 368k]
1314
        if (check_maxchar
  Branch (1314:13): [True: 1.33k, False: 53.3M]
1315
            && 
!1.33k
PyUnicode_IS_ASCII1.33k
(from) &&
PyUnicode_IS_ASCII31
(to))
  Branch (1315:16): [True: 31, False: 1.30k]
1316
        {
1317
            /* Writing Latin-1 characters into an ASCII string requires to
1318
               check that all written characters are pure ASCII */
1319
            Py_UCS4 max_char;
1320
            max_char = ucs1lib_find_max_char(from_data,
1321
                                             (const Py_UCS1*)from_data + how_many);
1322
            if (max_char >= 128)
  Branch (1322:17): [True: 1, False: 0]
1323
                return -1;
1324
        }
1325
        memcpy((char*)to_data + to_kind * to_start,
1326
                  (const char*)from_data + from_kind * from_start,
1327
                  to_kind * how_many);
1328
    }
1329
    else if (from_kind == PyUnicode_1BYTE_KIND
  Branch (1329:14): [True: 303k, False: 65.6k]
1330
             && 
to_kind == PyUnicode_2BYTE_KIND303k
)
  Branch (1330:17): [True: 300k, False: 2.42k]
1331
    {
1332
        _PyUnicode_CONVERT_BYTES(
1333
            Py_UCS1, Py_UCS2,
1334
            PyUnicode_1BYTE_DATA(from) + from_start,
1335
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1336
            PyUnicode_2BYTE_DATA(to) + to_start
1337
            );
1338
    }
1339
    else if (from_kind == PyUnicode_1BYTE_KIND
  Branch (1339:14): [True: 2.42k, False: 65.6k]
1340
             && 
to_kind == PyUnicode_4BYTE_KIND2.42k
)
  Branch (1340:17): [True: 2.42k, False: 0]
1341
    {
1342
        _PyUnicode_CONVERT_BYTES(
1343
            Py_UCS1, Py_UCS4,
1344
            PyUnicode_1BYTE_DATA(from) + from_start,
1345
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1346
            PyUnicode_4BYTE_DATA(to) + to_start
1347
            );
1348
    }
1349
    else if (from_kind == PyUnicode_2BYTE_KIND
  Branch (1349:14): [True: 65.6k, False: 11]
1350
             && 
to_kind == PyUnicode_4BYTE_KIND65.6k
)
  Branch (1350:17): [True: 65.4k, False: 130]
1351
    {
1352
        _PyUnicode_CONVERT_BYTES(
1353
            Py_UCS2, Py_UCS4,
1354
            PyUnicode_2BYTE_DATA(from) + from_start,
1355
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1356
            PyUnicode_4BYTE_DATA(to) + to_start
1357
            );
1358
    }
1359
    else {
1360
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1361
1362
        if (!check_maxchar) {
  Branch (1362:13): [True: 136, False: 5]
1363
            if (from_kind == PyUnicode_2BYTE_KIND
  Branch (1363:17): [True: 128, False: 8]
1364
                && 
to_kind == PyUnicode_1BYTE_KIND128
)
  Branch (1364:20): [True: 128, False: 0]
1365
            {
1366
                _PyUnicode_CONVERT_BYTES(
1367
                    Py_UCS2, Py_UCS1,
1368
                    PyUnicode_2BYTE_DATA(from) + from_start,
1369
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1370
                    PyUnicode_1BYTE_DATA(to) + to_start
1371
                    );
1372
            }
1373
            else if (from_kind == PyUnicode_4BYTE_KIND
  Branch (1373:22): [True: 8, False: 0]
1374
                     && to_kind == PyUnicode_1BYTE_KIND)
  Branch (1374:25): [True: 2, False: 6]
1375
            {
1376
                _PyUnicode_CONVERT_BYTES(
1377
                    Py_UCS4, Py_UCS1,
1378
                    PyUnicode_4BYTE_DATA(from) + from_start,
1379
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1380
                    PyUnicode_1BYTE_DATA(to) + to_start
1381
                    );
1382
            }
1383
            else if (from_kind == PyUnicode_4BYTE_KIND
  Branch (1383:22): [True: 6, False: 0]
1384
                     && to_kind == PyUnicode_2BYTE_KIND)
  Branch (1384:25): [True: 6, False: 0]
1385
            {
1386
                _PyUnicode_CONVERT_BYTES(
1387
                    Py_UCS4, Py_UCS2,
1388
                    PyUnicode_4BYTE_DATA(from) + from_start,
1389
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1390
                    PyUnicode_2BYTE_DATA(to) + to_start
1391
                    );
1392
            }
1393
            else {
1394
                Py_UNREACHABLE();
1395
            }
1396
        }
1397
        else {
1398
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1399
            Py_UCS4 ch;
1400
            Py_ssize_t i;
1401
1402
            for (i=0; i < how_many; 
i++0
) {
  Branch (1402:23): [True: 5, False: 0]
1403
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1404
                if (ch > to_maxchar)
  Branch (1404:21): [True: 5, False: 0]
1405
                    return -1;
1406
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1407
            }
1408
        }
1409
    }
1410
    return 0;
1411
}
1412
1413
void
1414
_PyUnicode_FastCopyCharacters(
1415
    PyObject *to, Py_ssize_t to_start,
1416
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1417
{
1418
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1419
}
1420
1421
Py_ssize_t
1422
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1423
                         PyObject *from, Py_ssize_t from_start,
1424
                         Py_ssize_t how_many)
1425
{
1426
    int err;
1427
1428
    if (!PyUnicode_Check(from) || 
!1.34k
PyUnicode_Check1.34k
(to)) {
  Branch (1428:9): [True: 1, False: 1.34k]
  Branch (1428:35): [True: 0, False: 1.34k]
1429
        PyErr_BadInternalCall();
1430
        return -1;
1431
    }
1432
1433
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
  Branch (1433:9): [True: 2, False: 1.34k]
1434
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1435
        return -1;
1436
    }
1437
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
  Branch (1437:9): [True: 2, False: 1.34k]
1438
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1439
        return -1;
1440
    }
1441
    if (how_many < 0) {
  Branch (1441:9): [True: 1, False: 1.34k]
1442
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1443
        return -1;
1444
    }
1445
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1446
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
  Branch (1446:9): [True: 1, False: 1.34k]
1447
        PyErr_Format(PyExc_SystemError,
1448
                     "Cannot write %zi characters at %zi "
1449
                     "in a string of %zi characters",
1450
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1451
        return -1;
1452
    }
1453
1454
    if (how_many == 0)
  Branch (1454:9): [True: 0, False: 1.34k]
1455
        return 0;
1456
1457
    if (unicode_check_modifiable(to))
  Branch (1457:9): [True: 0, False: 1.34k]
1458
        return -1;
1459
1460
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1461
    if (err) {
  Branch (1461:9): [True: 6, False: 1.33k]
1462
        PyErr_Format(PyExc_SystemError,
1463
                     "Cannot copy %s characters "
1464
                     "into a string of %s characters",
1465
                     unicode_kind_name(from),
1466
                     unicode_kind_name(to));
1467
        return -1;
1468
    }
1469
    return how_many;
1470
}
1471
1472
/* Find the maximum code point and count the number of surrogate pairs so a
1473
   correct string length can be computed before converting a string to UCS4.
1474
   This function counts single surrogates as a character and not as a pair.
1475
1476
   Return 0 on success, or -1 on error. */
1477
static int
1478
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1479
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1480
{
1481
    const wchar_t *iter;
1482
    Py_UCS4 ch;
1483
1484
    assert(num_surrogates != NULL && maxchar != NULL);
1485
    *num_surrogates = 0;
1486
    *maxchar = 0;
1487
1488
    for (iter = begin; iter < end; ) {
  Branch (1488:24): [True: 6.91M, False: 322k]
1489
#if SIZEOF_WCHAR_T == 2
1490
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1491
            && (iter+1) < end
1492
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1493
        {
1494
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1495
            ++(*num_surrogates);
1496
            iter += 2;
1497
        }
1498
        else
1499
#endif
1500
        {
1501
            ch = *iter;
1502
            iter++;
1503
        }
1504
        if (ch > *maxchar) {
  Branch (1504:13): [True: 1.28M, False: 5.62M]
1505
            *maxchar = ch;
1506
            if (*maxchar > MAX_UNICODE) {
  Branch (1506:17): [True: 3, False: 1.28M]
1507
                PyErr_Format(PyExc_ValueError,
1508
                             "character U+%x is not in range [U+0000; U+%x]",
1509
                             ch, MAX_UNICODE);
1510
                return -1;
1511
            }
1512
        }
1513
    }
1514
    return 0;
1515
}
1516
1517
static void
1518
unicode_dealloc(PyObject *unicode)
1519
{
1520
#ifdef Py_DEBUG
1521
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1522
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1523
    }
1524
#endif
1525
1526
    if (PyUnicode_CHECK_INTERNED(unicode)) {
1527
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
1528
           references (key and value) which were ignored by
1529
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1530
           to prevent calling unicode_dealloc() again. Adjust refcnt after
1531
           PyDict_DelItem(). */
1532
        assert(Py_REFCNT(unicode) == 0);
1533
        Py_SET_REFCNT(unicode, 3);
1534
        if (PyDict_DelItem(interned, unicode) != 0) {
  Branch (1534:13): [True: 0, False: 354k]
1535
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1536
                                      NULL);
1537
        }
1538
        assert(Py_REFCNT(unicode) == 1);
1539
        Py_SET_REFCNT(unicode, 0);
1540
    }
1541
1542
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1543
        PyObject_Free(_PyUnicode_UTF8(unicode));
1544
    }
1545
    if (!PyUnicode_IS_COMPACT(unicode) && 
_PyUnicode_DATA_ANY114k
(unicode)) {
  Branch (1545:9): [True: 114k, False: 70.3M]
1546
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1547
    }
1548
1549
    Py_TYPE(unicode)->tp_free(unicode);
1550
}
1551
1552
#ifdef Py_DEBUG
/* Return 1 if unicode is the empty-string object or the LATIN1() object for
   its single character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    if (ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256) {
        return 0;
    }
    return (LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1570
1571
static int
1572
unicode_modifiable(PyObject *unicode)
1573
{
1574
    assert(_PyUnicode_CHECK(unicode));
1575
    if (Py_REFCNT(unicode) != 1)
  Branch (1575:9): [True: 926k, False: 1.74M]
1576
        return 0;
1577
    if (_PyUnicode_HASH(unicode) != -1)
  Branch (1577:9): [True: 184, False: 1.74M]
1578
        return 0;
1579
    if (PyUnicode_CHECK_INTERNED(unicode))
1580
        return 0;
1581
    if (!PyUnicode_CheckExact(unicode))
  Branch (1581:9): [True: 0, False: 1.74M]
1582
        return 0;
1583
#ifdef Py_DEBUG
1584
    /* singleton refcount is greater than 1 */
1585
    assert(!unicode_is_singleton(unicode));
1586
#endif
1587
    return 1;
1588
}
1589
1590
static int
1591
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1592
{
1593
    PyObject *unicode;
1594
    Py_ssize_t old_length;
1595
1596
    assert(p_unicode != NULL);
1597
    unicode = *p_unicode;
1598
1599
    assert(unicode != NULL);
1600
    assert(PyUnicode_Check(unicode));
1601
    assert(0 <= length);
1602
1603
    old_length = PyUnicode_GET_LENGTH(unicode);
1604
    if (old_length == length)
  Branch (1604:9): [True: 0, False: 867k]
1605
        return 0;
1606
1607
    if (length == 0) {
  Branch (1607:9): [True: 0, False: 867k]
1608
        PyObject *empty = unicode_new_empty();
1609
        Py_SETREF(*p_unicode, empty);
1610
        return 0;
1611
    }
1612
1613
    if (!unicode_modifiable(unicode)) {
  Branch (1613:9): [True: 0, False: 867k]
1614
        PyObject *copy = resize_copy(unicode, length);
1615
        if (copy == NULL)
  Branch (1615:13): [True: 0, False: 0]
1616
            return -1;
1617
        Py_SETREF(*p_unicode, copy);
1618
        return 0;
1619
    }
1620
1621
    if (PyUnicode_IS_COMPACT(unicode)) {
1622
        PyObject *new_unicode = resize_compact(unicode, length);
1623
        if (new_unicode == NULL)
  Branch (1623:13): [True: 0, False: 867k]
1624
            return -1;
1625
        *p_unicode = new_unicode;
1626
        return 0;
1627
    }
1628
    return resize_inplace(unicode, length);
1629
}
1630
1631
int
1632
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1633
{
1634
    PyObject *unicode;
1635
    if (p_unicode == NULL) {
  Branch (1635:9): [True: 0, False: 0]
1636
        PyErr_BadInternalCall();
1637
        return -1;
1638
    }
1639
    unicode = *p_unicode;
1640
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
  Branch (1640:9): [True: 0, False: 0]
  Branch (1640:28): [True: 0, False: 0]
  Branch (1640:57): [True: 0, False: 0]
1641
    {
1642
        PyErr_BadInternalCall();
1643
        return -1;
1644
    }
1645
    return unicode_resize(p_unicode, length);
1646
}
1647
1648
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1649
1650
   WARNING: The function doesn't copy the terminating null character and
1651
   doesn't check the maximum character (may write a latin1 character in an
1652
   ASCII string). */
1653
static void
1654
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1655
                   const char *str, Py_ssize_t len)
1656
{
1657
    int kind = PyUnicode_KIND(unicode);
1658
    const void *data = PyUnicode_DATA(unicode);
1659
    const char *end = str + len;
1660
1661
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1662
    switch (kind) {
1663
    case PyUnicode_1BYTE_KIND: {
  Branch (1663:5): [True: 3, False: 0]
1664
#ifdef Py_DEBUG
1665
        if (PyUnicode_IS_ASCII(unicode)) {
1666
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1667
                (const Py_UCS1*)str,
1668
                (const Py_UCS1*)str + len);
1669
            assert(maxchar < 128);
1670
        }
1671
#endif
1672
        memcpy((char *) data + index, str, len);
1673
        break;
1674
    }
1675
    case PyUnicode_2BYTE_KIND: {
  Branch (1675:5): [True: 0, False: 3]
1676
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1677
        Py_UCS2 *ucs2 = start;
1678
1679
        for (; str < end; ++ucs2, ++str)
  Branch (1679:16): [True: 0, False: 0]
1680
            *ucs2 = (Py_UCS2)*str;
1681
1682
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683
        break;
1684
    }
1685
    case PyUnicode_4BYTE_KIND: {
  Branch (1685:5): [True: 0, False: 3]
1686
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1687
        Py_UCS4 *ucs4 = start;
1688
1689
        for (; str < end; ++ucs4, ++str)
  Branch (1689:16): [True: 0, False: 0]
1690
            *ucs4 = (Py_UCS4)*str;
1691
1692
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1693
        break;
1694
    }
1695
    default:
  Branch (1695:5): [True: 0, False: 3]
1696
        Py_UNREACHABLE();
1697
    }
1698
}
1699
1700
static PyObject*
1701
get_latin1_char(Py_UCS1 ch)
1702
{
1703
    return Py_NewRef(LATIN1(ch));
1704
}
1705
1706
static PyObject*
1707
unicode_char(Py_UCS4 ch)
1708
{
1709
    PyObject *unicode;
1710
1711
    assert(ch <= MAX_UNICODE);
1712
1713
    if (ch < 256) {
  Branch (1713:9): [True: 12.0M, False: 7.15M]
1714
        return get_latin1_char(ch);
1715
    }
1716
1717
    unicode = PyUnicode_New(1, ch);
1718
    if (unicode == NULL)
  Branch (1718:9): [True: 0, False: 7.15M]
1719
        return NULL;
1720
1721
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1722
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
  Branch (1722:9): [True: 854k, False: 6.29M]
1723
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1724
    } else {
1725
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1726
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1727
    }
1728
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1729
    return unicode;
1730
}
1731
1732
PyObject *
1733
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1734
{
1735
    PyObject *unicode;
1736
    Py_UCS4 maxchar = 0;
1737
    Py_ssize_t num_surrogates;
1738
1739
    if (u == NULL && 
size != 00
) {
  Branch (1739:9): [True: 0, False: 339k]
  Branch (1739:22): [True: 0, False: 0]
1740
        PyErr_BadInternalCall();
1741
        return NULL;
1742
    }
1743
1744
    if (size == -1) {
  Branch (1744:9): [True: 43.2k, False: 295k]
1745
        size = wcslen(u);
1746
    }
1747
1748
    /* If the Unicode data is known at construction time, we can apply
1749
       some optimizations which share commonly used objects. */
1750
1751
    /* Optimization for empty strings */
1752
    if (size == 0)
  Branch (1752:9): [True: 4.29k, False: 334k]
1753
        _Py_RETURN_UNICODE_EMPTY();
1754
1755
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1756
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1757
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1758
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1759
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1760
        if (!converted) {
1761
            return NULL;
1762
        }
1763
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1764
        PyMem_Free(converted);
1765
        return unicode;
1766
    }
1767
#endif
1768
1769
    /* Single character Unicode objects in the Latin-1 range are
1770
       shared when using this constructor */
1771
    if (size == 1 && 
(Py_UCS4)*u < 25612.7k
)
  Branch (1771:9): [True: 12.7k, False: 322k]
  Branch (1771:22): [True: 12.6k, False: 53]
1772
        return get_latin1_char((unsigned char)*u);
1773
1774
    /* If not empty and not single character, copy the Unicode data
1775
       into the new object */
1776
    if (find_maxchar_surrogates(u, u + size,
  Branch (1776:9): [True: 3, False: 322k]
1777
                                &maxchar, &num_surrogates) == -1)
1778
        return NULL;
1779
1780
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1781
    if (!unicode)
  Branch (1781:9): [True: 0, False: 322k]
1782
        return NULL;
1783
1784
    switch (PyUnicode_KIND(unicode)) {
1785
    case PyUnicode_1BYTE_KIND:
  Branch (1785:5): [True: 322k, False: 197]
1786
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1787
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1788
        break;
1789
    case PyUnicode_2BYTE_KIND:
  Branch (1789:5): [True: 167, False: 322k]
1790
#if Py_UNICODE_SIZE == 2
1791
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1792
#else
1793
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1794
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1795
#endif
1796
        break;
1797
    case PyUnicode_4BYTE_KIND:
  Branch (1797:5): [True: 30, False: 322k]
1798
#if SIZEOF_WCHAR_T == 2
1799
        /* This is the only case which has to process surrogates, thus
1800
           a simple copy loop is not enough and we need a function. */
1801
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1802
#else
1803
        assert(num_surrogates == 0);
1804
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1805
#endif
1806
        break;
1807
    default:
  Branch (1807:5): [True: 0, False: 322k]
1808
        Py_UNREACHABLE();
1809
    }
1810
1811
    return unicode_result(unicode);
1812
}
1813
1814
PyObject *
1815
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1816
{
1817
    if (size < 0) {
  Branch (1817:9): [True: 0, False: 2.72M]
1818
        PyErr_SetString(PyExc_SystemError,
1819
                        "Negative size passed to PyUnicode_FromStringAndSize");
1820
        return NULL;
1821
    }
1822
    if (u != NULL) {
  Branch (1822:9): [True: 2.65M, False: 71.7k]
1823
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1824
    }
1825
    if (size > 0) {
  Branch (1825:9): [True: 0, False: 71.7k]
1826
        PyErr_SetString(PyExc_SystemError,
1827
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
1828
        return NULL;
1829
    }
1830
    return unicode_new_empty();
1831
}
1832
1833
PyObject *
1834
PyUnicode_FromString(const char *u)
1835
{
1836
    size_t size = strlen(u);
1837
    if (size > PY_SSIZE_T_MAX) {
  Branch (1837:9): [True: 0, False: 13.6M]
1838
        PyErr_SetString(PyExc_OverflowError, "input too long");
1839
        return NULL;
1840
    }
1841
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1842
}
1843
1844
1845
PyObject *
1846
_PyUnicode_FromId(_Py_Identifier *id)
1847
{
1848
    PyInterpreterState *interp = _PyInterpreterState_GET();
1849
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
1850
1851
    Py_ssize_t index = _Py_atomic_size_get(&id->index);
1852
    if (index < 0) {
  Branch (1852:9): [True: 342, False: 102k]
1853
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
1854
1855
        PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
1856
        // Check again to detect concurrent access. Another thread can have
1857
        // initialized the index while this thread waited for the lock.
1858
        index = _Py_atomic_size_get(&id->index);
1859
        if (index < 0) {
  Branch (1859:13): [True: 342, False: 0]
1860
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
1861
            index = rt_ids->next_index;
1862
            rt_ids->next_index++;
1863
            _Py_atomic_size_set(&id->index, index);
1864
        }
1865
        PyThread_release_lock(rt_ids->lock);
1866
    }
1867
    assert(index >= 0);
1868
1869
    PyObject *obj;
1870
    if (index < ids->size) {
  Branch (1870:9): [True: 103k, False: 44]
1871
        obj = ids->array[index];
1872
        if (obj) {
  Branch (1872:13): [True: 102k, False: 300]
1873
            // Return a borrowed reference
1874
            return obj;
1875
        }
1876
    }
1877
1878
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
1879
                                       NULL, NULL);
1880
    if (!obj) {
  Branch (1880:9): [True: 0, False: 344]
1881
        return NULL;
1882
    }
1883
    PyUnicode_InternInPlace(&obj);
1884
1885
    if (index >= ids->size) {
  Branch (1885:9): [True: 44, False: 300]
1886
        // Overallocate to reduce the number of realloc
1887
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
1888
        Py_ssize_t item_size = sizeof(ids->array[0]);
1889
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
1890
        if (new_array == NULL) {
  Branch (1890:13): [True: 0, False: 44]
1891
            PyErr_NoMemory();
1892
            return NULL;
1893
        }
1894
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
1895
        ids->array = new_array;
1896
        ids->size = new_size;
1897
    }
1898
1899
    // The array stores a strong reference
1900
    ids->array[index] = obj;
1901
1902
    // Return a borrowed reference
1903
    return obj;
1904
}
1905
1906
1907
static void
1908
unicode_clear_identifiers(struct _Py_unicode_state *state)
1909
{
1910
    struct _Py_unicode_ids *ids = &state->ids;
1911
    for (Py_ssize_t i=0; i < ids->size; 
i++768
) {
  Branch (1911:26): [True: 768, False: 272]
1912
        Py_XDECREF(ids->array[i]);
1913
    }
1914
    ids->size = 0;
1915
    PyMem_Free(ids->array);
1916
    ids->array = NULL;
1917
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
1918
    // after Py_Finalize().
1919
}
1920
1921
1922
/* Internal function, doesn't check maximum character */
1923
1924
PyObject*
1925
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1926
{
1927
    const unsigned char *s = (const unsigned char *)buffer;
1928
    PyObject *unicode;
1929
    if (size == 1) {
  Branch (1929:9): [True: 2.59M, False: 8.48M]
1930
#ifdef Py_DEBUG
1931
        assert((unsigned char)s[0] < 128);
1932
#endif
1933
        return get_latin1_char(s[0]);
1934
    }
1935
    unicode = PyUnicode_New(size, 127);
1936
    if (!unicode)
  Branch (1936:9): [True: 0, False: 8.48M]
1937
        return NULL;
1938
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1939
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1940
    return unicode;
1941
}
1942
1943
static Py_UCS4
1944
kind_maxchar_limit(int kind)
1945
{
1946
    switch (kind) {
1947
    case PyUnicode_1BYTE_KIND:
  Branch (1947:5): [True: 0, False: 9]
1948
        return 0x80;
1949
    case PyUnicode_2BYTE_KIND:
  Branch (1949:5): [True: 9, False: 0]
1950
        return 0x100;
1951
    case PyUnicode_4BYTE_KIND:
  Branch (1951:5): [True: 0, False: 9]
1952
        return 0x10000;
1953
    default:
  Branch (1953:5): [True: 0, False: 9]
1954
        Py_UNREACHABLE();
1955
    }
1956
}
1957
1958
static PyObject*
1959
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1960
{
1961
    PyObject *res;
1962
    unsigned char max_char;
1963
1964
    if (size == 0) {
  Branch (1964:9): [True: 4.89k, False: 6.13M]
1965
        _Py_RETURN_UNICODE_EMPTY();
1966
    }
1967
    assert(size > 0);
1968
    if (size == 1) {
  Branch (1968:9): [True: 4.91M, False: 1.21M]
1969
        return get_latin1_char(u[0]);
1970
    }
1971
1972
    max_char = ucs1lib_find_max_char(u, u + size);
1973
    res = PyUnicode_New(size, max_char);
1974
    if (!res)
  Branch (1974:9): [True: 0, False: 1.21M]
1975
        return NULL;
1976
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1977
    assert(_PyUnicode_CheckConsistency(res, 1));
1978
    return res;
1979
}
1980
1981
static PyObject*
1982
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1983
{
1984
    PyObject *res;
1985
    Py_UCS2 max_char;
1986
1987
    if (size == 0)
  Branch (1987:9): [True: 4.15k, False: 229k]
1988
        _Py_RETURN_UNICODE_EMPTY();
1989
    assert(size > 0);
1990
    if (size == 1)
  Branch (1990:9): [True: 22.4k, False: 207k]
1991
        return unicode_char(u[0]);
1992
1993
    max_char = ucs2lib_find_max_char(u, u + size);
1994
    res = PyUnicode_New(size, max_char);
1995
    if (!res)
  Branch (1995:9): [True: 0, False: 207k]
1996
        return NULL;
1997
    if (max_char >= 256)
  Branch (1997:9): [True: 97.4k, False: 109k]
1998
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1999
    else {
2000
        _PyUnicode_CONVERT_BYTES(
2001
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2002
    }
2003
    assert(_PyUnicode_CheckConsistency(res, 1));
2004
    return res;
2005
}
2006
2007
static PyObject*
2008
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2009
{
2010
    PyObject *res;
2011
    Py_UCS4 max_char;
2012
2013
    if (size == 0)
  Branch (2013:9): [True: 4.68k, False: 114k]
2014
        _Py_RETURN_UNICODE_EMPTY();
2015
    assert(size > 0);
2016
    if (size == 1)
  Branch (2016:9): [True: 26.7k, False: 87.4k]
2017
        return unicode_char(u[0]);
2018
2019
    max_char = ucs4lib_find_max_char(u, u + size);
2020
    res = PyUnicode_New(size, max_char);
2021
    if (!res)
  Branch (2021:9): [True: 0, False: 87.4k]
2022
        return NULL;
2023
    if (max_char < 256)
  Branch (2023:9): [True: 84.9k, False: 2.46k]
2024
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025
                                 PyUnicode_1BYTE_DATA(res));
2026
    else if (max_char < 0x10000)
  Branch (2026:14): [True: 1.73k, False: 735]
2027
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028
                                 PyUnicode_2BYTE_DATA(res));
2029
    else
2030
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2031
    assert(_PyUnicode_CheckConsistency(res, 1));
2032
    return res;
2033
}
2034
2035
PyObject*
2036
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037
{
2038
    if (size < 0) {
  Branch (2038:9): [True: 0, False: 1.46M]
2039
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2040
        return NULL;
2041
    }
2042
    switch (kind) {
2043
    case PyUnicode_1BYTE_KIND:
  Branch (2043:5): [True: 1.21M, False: 248k]
2044
        return _PyUnicode_FromUCS1(buffer, size);
2045
    case PyUnicode_2BYTE_KIND:
  Branch (2045:5): [True: 131k, False: 1.33M]
2046
        return _PyUnicode_FromUCS2(buffer, size);
2047
    case PyUnicode_4BYTE_KIND:
  Branch (2047:5): [True: 117k, False: 1.34M]
2048
        return _PyUnicode_FromUCS4(buffer, size);
2049
    default:
  Branch (2049:5): [True: 0, False: 1.46M]
2050
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2051
        return NULL;
2052
    }
2053
}
2054
2055
Py_UCS4
2056
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057
{
2058
    int kind;
2059
    const void *startptr, *endptr;
2060
2061
    assert(0 <= start);
2062
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2063
    assert(start <= end);
2064
2065
    if (start == 0 && 
end == 3.68M
PyUnicode_GET_LENGTH3.68M
(unicode))
  Branch (2065:9): [True: 3.68M, False: 34.1k]
  Branch (2065:23): [True: 181k, False: 3.50M]
2066
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2067
2068
    if (start == end)
  Branch (2068:9): [True: 3.07k, False: 3.53M]
2069
        return 127;
2070
2071
    if (PyUnicode_IS_ASCII(unicode))
2072
        return 127;
2073
2074
    kind = PyUnicode_KIND(unicode);
2075
    startptr = PyUnicode_DATA(unicode);
2076
    endptr = (char *)startptr + end * kind;
2077
    startptr = (char *)startptr + start * kind;
2078
    switch(kind) {
2079
    case PyUnicode_1BYTE_KIND:
  Branch (2079:5): [True: 33, False: 394]
2080
        return ucs1lib_find_max_char(startptr, endptr);
2081
    case PyUnicode_2BYTE_KIND:
  Branch (2081:5): [True: 394, False: 33]
2082
        return ucs2lib_find_max_char(startptr, endptr);
2083
    case PyUnicode_4BYTE_KIND:
  Branch (2083:5): [True: 0, False: 427]
2084
        return ucs4lib_find_max_char(startptr, endptr);
2085
    default:
  Branch (2085:5): [True: 0, False: 427]
2086
        Py_UNREACHABLE();
2087
    }
2088
}
2089
2090
/* Ensure that a string uses the most efficient storage, if it is not the
2091
   case: create a new string with of the right kind. Write NULL into *p_unicode
2092
   on error. */
2093
static void
2094
unicode_adjust_maxchar(PyObject **p_unicode)
2095
{
2096
    PyObject *unicode, *copy;
2097
    Py_UCS4 max_char;
2098
    Py_ssize_t len;
2099
    int kind;
2100
2101
    assert(p_unicode != NULL);
2102
    unicode = *p_unicode;
2103
    if (PyUnicode_IS_ASCII(unicode))
2104
        return;
2105
2106
    len = PyUnicode_GET_LENGTH(unicode);
2107
    kind = PyUnicode_KIND(unicode);
2108
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (2108:9): [True: 3, False: 138]
2109
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2110
        max_char = ucs1lib_find_max_char(u, u + len);
2111
        if (max_char >= 128)
  Branch (2111:13): [True: 0, False: 3]
2112
            return;
2113
    }
2114
    else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (2114:14): [True: 126, False: 12]
2115
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2116
        max_char = ucs2lib_find_max_char(u, u + len);
2117
        if (max_char >= 256)
  Branch (2117:13): [True: 2, False: 124]
2118
            return;
2119
    }
2120
    else if (kind == PyUnicode_4BYTE_KIND) {
  Branch (2120:14): [True: 12, False: 0]
2121
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2122
        max_char = ucs4lib_find_max_char(u, u + len);
2123
        if (max_char >= 0x10000)
  Branch (2123:13): [True: 4, False: 8]
2124
            return;
2125
    }
2126
    else
2127
        Py_UNREACHABLE();
2128
2129
    copy = PyUnicode_New(len, max_char);
2130
    if (copy != NULL)
  Branch (2130:9): [True: 135, False: 0]
2131
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2132
    Py_DECREF(unicode);
2133
    *p_unicode = copy;
2134
}
2135
2136
/* Return a new string object with the same length, maximum character
   and contents as `unicode`.

   Calls PyErr_BadInternalCall() and returns NULL if `unicode` fails
   PyUnicode_Check(); returns NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* The kind value doubles as the byte width of one character. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2158
2159
2160
/* Widen Unicode objects to larger buffers. Don't write terminating null
2161
   character. Return NULL on error. */
2162
2163
static void*
2164
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2165
{
2166
    void *result;
2167
2168
    assert(skind < kind);
2169
    switch (kind) {
2170
    case PyUnicode_2BYTE_KIND:
  Branch (2170:5): [True: 15.8k, False: 894]
2171
        result = PyMem_New(Py_UCS2, len);
2172
        if (!result)
  Branch (2172:13): [True: 0, False: 15.8k]
2173
            return PyErr_NoMemory();
2174
        assert(skind == PyUnicode_1BYTE_KIND);
2175
        _PyUnicode_CONVERT_BYTES(
2176
            Py_UCS1, Py_UCS2,
2177
            (const Py_UCS1 *)data,
2178
            ((const Py_UCS1 *)data) + len,
2179
            result);
2180
        return result;
2181
    case PyUnicode_4BYTE_KIND:
  Branch (2181:5): [True: 894, False: 15.8k]
2182
        result = PyMem_New(Py_UCS4, len);
2183
        if (!result)
  Branch (2183:13): [True: 0, False: 894]
2184
            return PyErr_NoMemory();
2185
        if (skind == PyUnicode_2BYTE_KIND) {
  Branch (2185:13): [True: 43, False: 851]
2186
            _PyUnicode_CONVERT_BYTES(
2187
                Py_UCS2, Py_UCS4,
2188
                (const Py_UCS2 *)data,
2189
                ((const Py_UCS2 *)data) + len,
2190
                result);
2191
        }
2192
        else {
2193
            assert(skind == PyUnicode_1BYTE_KIND);
2194
            _PyUnicode_CONVERT_BYTES(
2195
                Py_UCS1, Py_UCS4,
2196
                (const Py_UCS1 *)data,
2197
                ((const Py_UCS1 *)data) + len,
2198
                result);
2199
        }
2200
        return result;
2201
    default:
  Branch (2201:5): [True: 0, False: 16.7k]
2202
        Py_UNREACHABLE();
2203
        return NULL;
2204
    }
2205
}
2206
2207
static Py_UCS4*
2208
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2209
        int copy_null)
2210
{
2211
    int kind;
2212
    const void *data;
2213
    Py_ssize_t len, targetlen;
2214
    kind = PyUnicode_KIND(string);
2215
    data = PyUnicode_DATA(string);
2216
    len = PyUnicode_GET_LENGTH(string);
2217
    targetlen = len;
2218
    if (copy_null)
  Branch (2218:9): [True: 70, False: 102k]
2219
        targetlen++;
2220
    if (!target) {
  Branch (2220:9): [True: 46, False: 102k]
2221
        target = PyMem_New(Py_UCS4, targetlen);
2222
        if (!target) {
  Branch (2222:13): [True: 0, False: 46]
2223
            PyErr_NoMemory();
2224
            return NULL;
2225
        }
2226
    }
2227
    else {
2228
        if (targetsize < targetlen) {
  Branch (2228:13): [True: 12, False: 102k]
2229
            PyErr_Format(PyExc_SystemError,
2230
                         "string is longer than the buffer");
2231
            if (copy_null && 
0 < targetsize6
)
  Branch (2231:17): [True: 6, False: 6]
  Branch (2231:30): [True: 6, False: 0]
2232
                target[0] = 0;
2233
            return NULL;
2234
        }
2235
    }
2236
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (2236:9): [True: 102k, False: 243]
2237
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2238
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2239
    }
2240
    else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (2240:14): [True: 204, False: 39]
2241
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2242
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2243
    }
2244
    else if (kind == PyUnicode_4BYTE_KIND) {
  Branch (2244:14): [True: 39, False: 0]
2245
        memcpy(target, data, len * sizeof(Py_UCS4));
2246
    }
2247
    else {
2248
        Py_UNREACHABLE();
2249
    }
2250
    if (copy_null)
  Branch (2250:9): [True: 64, False: 102k]
2251
        target[len] = 0;
2252
    return target;
2253
}
2254
2255
Py_UCS4*
2256
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2257
                 int copy_null)
2258
{
2259
    if (target == NULL || targetsize < 0) {
  Branch (2259:9): [True: 0, False: 102k]
  Branch (2259:27): [True: 0, False: 102k]
2260
        PyErr_BadInternalCall();
2261
        return NULL;
2262
    }
2263
    return as_ucs4(string, target, targetsize, copy_null);
2264
}
2265
2266
Py_UCS4*
2267
PyUnicode_AsUCS4Copy(PyObject *string)
2268
{
2269
    return as_ucs4(string, NULL, 0, 1);
2270
}
2271
2272
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2275
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2276
2277
static int
2278
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2279
                             Py_ssize_t width, Py_ssize_t precision)
2280
{
2281
    Py_ssize_t length, fill, arglen;
2282
    Py_UCS4 maxchar;
2283
2284
    length = PyUnicode_GET_LENGTH(str);
2285
    if ((precision == -1 || 
precision >= length2.30k
)
  Branch (2285:10): [True: 14.2M, False: 2.30k]
  Branch (2285:29): [True: 2.28k, False: 17]
2286
        && 
width <= length14.2M
)
  Branch (2286:12): [True: 14.2M, False: 9]
2287
        return _PyUnicodeWriter_WriteStr(writer, str);
2288
2289
    if (precision != -1)
  Branch (2289:9): [True: 17, False: 9]
2290
        length = Py_MIN(precision, length);
2291
2292
    arglen = Py_MAX(length, width);
2293
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
  Branch (2293:9): [True: 5, False: 21]
2294
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2295
    else
2296
        maxchar = writer->maxchar;
2297
2298
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
  Branch (2298:9): [True: 0, False: 26]
2299
        return -1;
2300
2301
    if (width > length) {
  Branch (2301:9): [True: 14, False: 12]
2302
        fill = width - length;
2303
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
  Branch (2303:13): [True: 0, False: 14]
2304
            return -1;
2305
        writer->pos += fill;
2306
    }
2307
2308
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2309
                                  str, 0, length);
2310
    writer->pos += length;
2311
    return 0;
2312
}
2313
2314
static int
2315
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2316
                              Py_ssize_t width, Py_ssize_t precision)
2317
{
2318
    /* UTF-8 */
2319
    Py_ssize_t length;
2320
    PyObject *unicode;
2321
    int res;
2322
2323
    if (precision == -1) {
  Branch (2323:9): [True: 154k, False: 4.39M]
2324
        length = strlen(str);
2325
    }
2326
    else {
2327
        length = 0;
2328
        while (length < precision && 
str[length]34.1M
) {
  Branch (2328:16): [True: 34.1M, False: 1.09k]
  Branch (2328:38): [True: 29.7M, False: 4.39M]
2329
            length++;
2330
        }
2331
    }
2332
    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2333
    if (unicode == NULL)
  Branch (2333:9): [True: 0, False: 4.55M]
2334
        return -1;
2335
2336
    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2337
    Py_DECREF(unicode);
2338
    return res;
2339
}
2340
2341
static const char*
2342
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2343
                       const char *f, va_list *vargs)
2344
{
2345
    const char *p;
2346
    Py_ssize_t len;
2347
    int zeropad;
2348
    Py_ssize_t width;
2349
    Py_ssize_t precision;
2350
    int longflag;
2351
    int longlongflag;
2352
    int size_tflag;
2353
    Py_ssize_t fill;
2354
2355
    p = f;
2356
    f++;
2357
    zeropad = 0;
2358
    if (*f == '0') {
  Branch (2358:9): [True: 10.1k, False: 14.3M]
2359
        zeropad = 1;
2360
        f++;
2361
    }
2362
2363
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2364
    width = -1;
2365
    if (Py_ISDIGIT((unsigned)*f)) {
2366
        width = *f - '0';
2367
        f++;
2368
        while (Py_ISDIGIT((unsigned)*f)) {
2369
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
  Branch (2369:17): [True: 0, False: 15]
2370
                PyErr_SetString(PyExc_ValueError,
2371
                                "width too big");
2372
                return NULL;
2373
            }
2374
            width = (width * 10) + (*f - '0');
2375
            f++;
2376
        }
2377
    }
2378
    precision = -1;
2379
    if (*f == '.') {
  Branch (2379:9): [True: 4.40M, False: 10.0M]
2380
        f++;
2381
        if (Py_ISDIGIT((unsigned)*f)) {
2382
            precision = (*f - '0');
2383
            f++;
2384
            while (Py_ISDIGIT((unsigned)*f)) {
2385
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
  Branch (2385:21): [True: 0, False: 5.30M]
2386
                    PyErr_SetString(PyExc_ValueError,
2387
                                    "precision too big");
2388
                    return NULL;
2389
                }
2390
                precision = (precision * 10) + (*f - '0');
2391
                f++;
2392
            }
2393
        }
2394
        if (*f == '%') {
  Branch (2394:13): [True: 1, False: 4.40M]
2395
            /* "%.3%s" => f points to "3" */
2396
            f--;
2397
        }
2398
    }
2399
    if (*f == '\0') {
  Branch (2399:9): [True: 1, False: 14.4M]
2400
        /* bogus format "%.123" => go backward, f points to "3" */
2401
        f--;
2402
    }
2403
2404
    /* Handle %ld, %lu, %lld and %llu. */
2405
    longflag = 0;
2406
    longlongflag = 0;
2407
    size_tflag = 0;
2408
    if (*f == 'l') {
  Branch (2408:9): [True: 8.16k, False: 14.4M]
2409
        if (f[1] == 'd' || 
f[1] == 'u'6.22k
||
f[1] == 'i'104
) {
  Branch (2409:13): [True: 1.94k, False: 6.22k]
  Branch (2409:28): [True: 6.12k, False: 104]
  Branch (2409:43): [True: 1, False: 103]
2410
            longflag = 1;
2411
            ++f;
2412
        }
2413
        else if (f[1] == 'l' &&
  Branch (2413:18): [True: 103, False: 0]
2414
                 (f[2] == 'd' || 
f[2] == 'u'7
||
f[2] == 'i'1
)) {
  Branch (2414:19): [True: 96, False: 7]
  Branch (2414:34): [True: 6, False: 1]
  Branch (2414:49): [True: 1, False: 0]
2415
            longlongflag = 1;
2416
            f += 2;
2417
        }
2418
    }
2419
    /* handle the size_t flag. */
2420
    else if (*f == 'z' && 
(28.8k
f[1] == 'd'28.8k
||
f[1] == 'u'154
||
f[1] == 'i'6
)) {
  Branch (2420:14): [True: 28.8k, False: 14.3M]
  Branch (2420:28): [True: 28.7k, False: 154]
  Branch (2420:43): [True: 148, False: 6]
  Branch (2420:58): [True: 6, False: 0]
2421
        size_tflag = 1;
2422
        ++f;
2423
    }
2424
2425
    if (f[1] == '\0')
  Branch (2425:9): [True: 78.1k, False: 14.3M]
2426
        writer->overallocate = 0;
2427
2428
    switch (*f) {
2429
    case 'c':
  Branch (2429:5): [True: 31.2k, False: 14.3M]
2430
    {
2431
        int ordinal = va_arg(*vargs, int);
2432
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
  Branch (2432:13): [True: 0, False: 31.2k]
  Branch (2432:28): [True: 1, False: 31.2k]
2433
            PyErr_SetString(PyExc_OverflowError,
2434
                            "character argument not in range(0x110000)");
2435
            return NULL;
2436
        }
2437
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
  Branch (2437:13): [True: 0, False: 31.2k]
2438
            return NULL;
2439
        break;
2440
    }
2441
2442
    case 'i':
  Branch (2442:5): [True: 872, False: 14.4M]
2443
    case 'd':
  Branch (2443:5): [True: 92.1k, False: 14.3M]
2444
    case 'u':
  Branch (2444:5): [True: 6.77k, False: 14.4M]
2445
    case 'x':
  Branch (2445:5): [True: 123, False: 14.4M]
2446
    {
2447
        /* used by sprintf */
2448
        char buffer[MAX_LONG_LONG_CHARS];
2449
        Py_ssize_t arglen;
2450
2451
        if (*f == 'u') {
  Branch (2451:13): [True: 6.77k, False: 93.0k]
2452
            if (longflag) {
  Branch (2452:17): [True: 6.12k, False: 658]
2453
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2454
            }
2455
            else if (longlongflag) {
  Branch (2455:22): [True: 6, False: 652]
2456
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2457
            }
2458
            else if (size_tflag) {
  Branch (2458:22): [True: 148, False: 504]
2459
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2460
            }
2461
            else {
2462
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2463
            }
2464
        }
2465
        else if (*f == 'x') {
  Branch (2465:18): [True: 123, False: 92.9k]
2466
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
2467
        }
2468
        else {
2469
            if (longflag) {
  Branch (2469:17): [True: 1.94k, False: 91.0k]
2470
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
2471
            }
2472
            else if (longlongflag) {
  Branch (2472:22): [True: 97, False: 90.9k]
2473
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2474
            }
2475
            else if (size_tflag) {
  Branch (2475:22): [True: 28.7k, False: 62.1k]
2476
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2477
            }
2478
            else {
2479
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
2480
            }
2481
        }
2482
        assert(len >= 0);
2483
2484
        if (precision < len)
  Branch (2484:13): [True: 96.2k, False: 3.65k]
2485
            precision = len;
2486
2487
        arglen = Py_MAX(precision, width);
2488
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
  Branch (2488:13): [True: 0, False: 99.8k]
2489
            return NULL;
2490
2491
        if (width > precision) {
  Branch (2491:13): [True: 6.50k, False: 93.3k]
2492
            Py_UCS4 fillchar;
2493
            fill = width - precision;
2494
            fillchar = zeropad?
'0'5.27k
:
' '1.22k
;
  Branch (2494:24): [True: 5.27k, False: 1.22k]
2495
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
  Branch (2495:17): [True: 0, False: 6.50k]
2496
                return NULL;
2497
            writer->pos += fill;
2498
        }
2499
        if (precision > len) {
  Branch (2499:13): [True: 840, False: 99.0k]
2500
            fill = precision - len;
2501
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
  Branch (2501:17): [True: 0, False: 840]
2502
                return NULL;
2503
            writer->pos += fill;
2504
        }
2505
2506
        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
  Branch (2506:13): [True: 0, False: 99.8k]
2507
            return NULL;
2508
        break;
2509
    }
2510
2511
    case 'p':
  Branch (2511:5): [True: 25.6k, False: 14.3M]
2512
    {
2513
        char number[MAX_LONG_LONG_CHARS];
2514
2515
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2516
        assert(len >= 0);
2517
2518
        /* %p is ill-defined:  ensure leading 0x. */
2519
        if (number[1] == 'X')
  Branch (2519:13): [True: 0, False: 25.6k]
2520
            number[1] = 'x';
2521
        else if (number[1] != 'x') {
  Branch (2521:18): [True: 0, False: 25.6k]
2522
            memmove(number + 2, number,
2523
                    strlen(number) + 1);
2524
            number[0] = '0';
2525
            number[1] = 'x';
2526
            len += 2;
2527
        }
2528
2529
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
  Branch (2529:13): [True: 0, False: 25.6k]
2530
            return NULL;
2531
        break;
2532
    }
2533
2534
    case 's':
  Branch (2534:5): [True: 4.55M, False: 9.85M]
2535
    {
2536
        /* UTF-8 */
2537
        const char *s = va_arg(*vargs, const char*);
2538
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
  Branch (2538:13): [True: 0, False: 4.55M]
2539
            return NULL;
2540
        break;
2541
    }
2542
2543
    case 'U':
  Branch (2543:5): [True: 9.57M, False: 4.83M]
2544
    {
2545
        PyObject *obj = va_arg(*vargs, PyObject *);
2546
        assert(obj && _PyUnicode_CHECK(obj));
2547
2548
        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
  Branch (2548:13): [True: 0, False: 9.57M]
2549
            return NULL;
2550
        break;
2551
    }
2552
2553
    case 'V':
  Branch (2553:5): [True: 2.05k, False: 14.4M]
2554
    {
2555
        PyObject *obj = va_arg(*vargs, PyObject *);
2556
        const char *str = va_arg(*vargs, const char *);
2557
        if (obj) {
  Branch (2557:13): [True: 2.04k, False: 8]
2558
            assert(_PyUnicode_CHECK(obj));
2559
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
  Branch (2559:17): [True: 0, False: 2.04k]
2560
                return NULL;
2561
        }
2562
        else {
2563
            assert(str != NULL);
2564
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
  Branch (2564:17): [True: 0, False: 8]
2565
                return NULL;
2566
        }
2567
        break;
2568
    }
2569
2570
    case 'S':
  Branch (2570:5): [True: 10.2k, False: 14.3M]
2571
    {
2572
        PyObject *obj = va_arg(*vargs, PyObject *);
2573
        PyObject *str;
2574
        assert(obj);
2575
        str = PyObject_Str(obj);
2576
        if (!str)
  Branch (2576:13): [True: 0, False: 10.2k]
2577
            return NULL;
2578
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
  Branch (2578:13): [True: 0, False: 10.2k]
2579
            Py_DECREF(str);
2580
            return NULL;
2581
        }
2582
        Py_DECREF(str);
2583
        break;
2584
    }
2585
2586
    case 'R':
  Branch (2586:5): [True: 113k, False: 14.2M]
2587
    {
2588
        PyObject *obj = va_arg(*vargs, PyObject *);
2589
        PyObject *repr;
2590
        assert(obj);
2591
        repr = PyObject_Repr(obj);
2592
        if (!repr)
  Branch (2592:13): [True: 489, False: 112k]
2593
            return NULL;
2594
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
  Branch (2594:13): [True: 0, False: 112k]
2595
            Py_DECREF(repr);
2596
            return NULL;
2597
        }
2598
        Py_DECREF(repr);
2599
        break;
2600
    }
2601
2602
    case 'A':
  Branch (2602:5): [True: 6, False: 14.4M]
2603
    {
2604
        PyObject *obj = va_arg(*vargs, PyObject *);
2605
        PyObject *ascii;
2606
        assert(obj);
2607
        ascii = PyObject_ASCII(obj);
2608
        if (!ascii)
  Branch (2608:13): [True: 0, False: 6]
2609
            return NULL;
2610
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
  Branch (2610:13): [True: 0, False: 6]
2611
            Py_DECREF(ascii);
2612
            return NULL;
2613
        }
2614
        Py_DECREF(ascii);
2615
        break;
2616
    }
2617
2618
    case '%':
  Branch (2618:5): [True: 133, False: 14.4M]
2619
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
  Branch (2619:13): [True: 0, False: 133]
2620
            return NULL;
2621
        break;
2622
2623
    default:
  Branch (2623:5): [True: 3, False: 14.4M]
2624
        /* if we stumble upon an unknown formatting code, copy the rest
2625
           of the format string to the output string. (we cannot just
2626
           skip the code, since there's no way to know what's in the
2627
           argument list) */
2628
        len = strlen(p);
2629
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
  Branch (2629:13): [True: 0, False: 3]
2630
            return NULL;
2631
        f = p+len;
2632
        return f;
2633
    }
2634
2635
    f++;
2636
    return f;
2637
}
2638
2639
PyObject *
2640
PyUnicode_FromFormatV(const char *format, va_list vargs)
2641
{
2642
    va_list vargs2;
2643
    const char *f;
2644
    _PyUnicodeWriter writer;
2645
2646
    _PyUnicodeWriter_Init(&writer);
2647
    writer.min_length = strlen(format) + 100;
2648
    writer.overallocate = 1;
2649
2650
    // Copy varags to be able to pass a reference to a subfunction.
2651
    va_copy(vargs2, vargs);
2652
2653
    for (f = format; *f; ) {
  Branch (2653:22): [True: 36.2M, False: 7.65M]
2654
        if (*f == '%') {
  Branch (2654:13): [True: 14.4M, False: 21.8M]
2655
            f = unicode_fromformat_arg(&writer, f, &vargs2);
2656
            if (f == NULL)
  Branch (2656:17): [True: 490, False: 14.4M]
2657
                goto fail;
2658
        }
2659
        else {
2660
            const char *p;
2661
            Py_ssize_t len;
2662
2663
            p = f;
2664
            do
2665
            {
2666
                if ((unsigned char)*p > 127) {
  Branch (2666:21): [True: 1, False: 231M]
2667
                    PyErr_Format(PyExc_ValueError,
2668
                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2669
                        "string, got a non-ASCII byte: 0x%02x",
2670
                        (unsigned char)*p);
2671
                    goto fail;
2672
                }
2673
                p++;
2674
            }
2675
            while (*p != '\0' && 
*p != '%'223M
);
  Branch (2675:20): [True: 223M, False: 7.57M]
  Branch (2675:34): [True: 209M, False: 14.2M]
2676
            len = p - f;
2677
2678
            if (*p == '\0')
  Branch (2678:17): [True: 7.57M, False: 14.2M]
2679
                writer.overallocate = 0;
2680
2681
            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
  Branch (2681:17): [True: 0, False: 21.8M]
2682
                goto fail;
2683
2684
            f = p;
2685
        }
2686
    }
2687
    va_end(vargs2);
2688
    return _PyUnicodeWriter_Finish(&writer);
2689
2690
  fail:
2691
    va_end(vargs2);
2692
    _PyUnicodeWriter_Dealloc(&writer);
2693
    return NULL;
2694
}
2695
2696
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments
   into a va_list, delegate, and return whatever the delegate returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *formatted = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return formatted;
}
2707
2708
static Py_ssize_t
2709
unicode_get_widechar_size(PyObject *unicode)
2710
{
2711
    Py_ssize_t res;
2712
2713
    assert(unicode != NULL);
2714
    assert(_PyUnicode_CHECK(unicode));
2715
2716
    res = _PyUnicode_LENGTH(unicode);
2717
#if SIZEOF_WCHAR_T == 2
2718
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2719
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2720
        const Py_UCS4 *end = s + res;
2721
        for (; s < end; ++s) {
2722
            if (*s > 0xFFFF) {
2723
                ++res;
2724
            }
2725
        }
2726
    }
2727
#endif
2728
    return res;
2729
}
2730
2731
static void
2732
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2733
{
2734
    assert(unicode != NULL);
2735
    assert(_PyUnicode_CHECK(unicode));
2736
2737
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
  Branch (2737:9): [True: 32, False: 71.9k]
2738
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
2739
        return;
2740
    }
2741
2742
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
  Branch (2742:9): [True: 65.5k, False: 6.41k]
2743
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2744
        for (; size--; 
++s, ++w2.04M
) {
  Branch (2744:16): [True: 2.04M, False: 65.5k]
2745
            *w = *s;
2746
        }
2747
    }
2748
    else {
2749
#if SIZEOF_WCHAR_T == 4
2750
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2751
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2752
        for (; size--; 
++s, ++w17.0k
) {
  Branch (2752:16): [True: 17.0k, False: 6.41k]
2753
            *w = *s;
2754
        }
2755
#else
2756
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2757
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2758
        for (; size--; ++s, ++w) {
2759
            Py_UCS4 ch = *s;
2760
            if (ch > 0xFFFF) {
2761
                assert(ch <= MAX_UNICODE);
2762
                /* encode surrogate pair in this case */
2763
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2764
                if (!size--)
2765
                    break;
2766
                *w = Py_UNICODE_LOW_SURROGATE(ch);
2767
            }
2768
            else {
2769
                *w = ch;
2770
            }
2771
        }
2772
#endif
2773
    }
2774
}
2775
2776
#ifdef HAVE_WCHAR_H
2777
2778
/* Convert a Unicode object to a wide character string.
2779
2780
   - If w is NULL: return the number of wide characters (including the null
2781
     character) required to convert the unicode object. Ignore size argument.
2782
2783
   - Otherwise: return the number of wide characters (excluding the null
2784
     character) written into w. Write at most size wide characters (including
2785
     the null character). */
2786
Py_ssize_t
2787
PyUnicode_AsWideChar(PyObject *unicode,
2788
                     wchar_t *w,
2789
                     Py_ssize_t size)
2790
{
2791
    Py_ssize_t res;
2792
2793
    if (unicode == NULL) {
  Branch (2793:9): [True: 0, False: 20.0k]
2794
        PyErr_BadInternalCall();
2795
        return -1;
2796
    }
2797
    if (!PyUnicode_Check(unicode)) {
  Branch (2797:9): [True: 0, False: 20.0k]
2798
        PyErr_BadArgument();
2799
        return -1;
2800
    }
2801
2802
    res = unicode_get_widechar_size(unicode);
2803
    if (w == NULL) {
  Branch (2803:9): [True: 10.0k, False: 10.0k]
2804
        return res + 1;
2805
    }
2806
2807
    if (size > res) {
  Branch (2807:9): [True: 50, False: 10.0k]
2808
        size = res + 1;
2809
    }
2810
    else {
2811
        res = size;
2812
    }
2813
    unicode_copy_as_widechar(unicode, w, size);
2814
2815
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2816
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2817
       non-Unicode locales and hence needs conversion first. */
2818
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2819
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
2820
            return -1;
2821
        }
2822
    }
2823
#endif
2824
2825
    return res;
2826
}
2827
2828
wchar_t*
2829
PyUnicode_AsWideCharString(PyObject *unicode,
2830
                           Py_ssize_t *size)
2831
{
2832
    wchar_t *buffer;
2833
    Py_ssize_t buflen;
2834
2835
    if (unicode == NULL) {
  Branch (2835:9): [True: 0, False: 61.9k]
2836
        PyErr_BadInternalCall();
2837
        return NULL;
2838
    }
2839
    if (!PyUnicode_Check(unicode)) {
  Branch (2839:9): [True: 0, False: 61.9k]
2840
        PyErr_BadArgument();
2841
        return NULL;
2842
    }
2843
2844
    buflen = unicode_get_widechar_size(unicode);
2845
    buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
2846
    if (buffer == NULL) {
  Branch (2846:9): [True: 0, False: 61.9k]
2847
        PyErr_NoMemory();
2848
        return NULL;
2849
    }
2850
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
2851
2852
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2853
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2854
       non-Unicode locales and hence needs conversion first. */
2855
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2856
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
2857
            return NULL;
2858
        }
2859
    }
2860
#endif
2861
2862
    if (size != NULL) {
  Branch (2862:9): [True: 39.2k, False: 22.7k]
2863
        *size = buflen;
2864
    }
2865
    else if (wcslen(buffer) != (size_t)buflen) {
  Branch (2865:14): [True: 5, False: 22.7k]
2866
        PyMem_Free(buffer);
2867
        PyErr_SetString(PyExc_ValueError,
2868
                        "embedded null character");
2869
        return NULL;
2870
    }
2871
    return buffer;
2872
}
2873
2874
#endif /* HAVE_WCHAR_H */
2875
2876
int
2877
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
2878
{
2879
    wchar_t **p = (wchar_t **)ptr;
2880
    if (obj == NULL) {
  Branch (2880:9): [True: 0, False: 0]
2881
        PyMem_Free(*p);
2882
        *p = NULL;
2883
        return 1;
2884
    }
2885
    if (PyUnicode_Check(obj)) {
2886
        *p = PyUnicode_AsWideCharString(obj, NULL);
2887
        if (*p == NULL) {
  Branch (2887:13): [True: 0, False: 0]
2888
            return 0;
2889
        }
2890
        return Py_CLEANUP_SUPPORTED;
2891
    }
2892
    PyErr_Format(PyExc_TypeError,
2893
                 "argument must be str, not %.50s",
2894
                 Py_TYPE(obj)->tp_name);
2895
    return 0;
2896
}
2897
2898
int
2899
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
2900
{
2901
    wchar_t **p = (wchar_t **)ptr;
2902
    if (obj == NULL) {
  Branch (2902:9): [True: 0, False: 0]
2903
        PyMem_Free(*p);
2904
        *p = NULL;
2905
        return 1;
2906
    }
2907
    if (obj == Py_None) {
  Branch (2907:9): [True: 0, False: 0]
2908
        *p = NULL;
2909
        return 1;
2910
    }
2911
    if (PyUnicode_Check(obj)) {
2912
        *p = PyUnicode_AsWideCharString(obj, NULL);
2913
        if (*p == NULL) {
  Branch (2913:13): [True: 0, False: 0]
2914
            return 0;
2915
        }
2916
        return Py_CLEANUP_SUPPORTED;
2917
    }
2918
    PyErr_Format(PyExc_TypeError,
2919
                 "argument must be str or None, not %.50s",
2920
                 Py_TYPE(obj)->tp_name);
2921
    return 0;
2922
}
2923
2924
PyObject *
2925
PyUnicode_FromOrdinal(int ordinal)
2926
{
2927
    if (ordinal < 0 || 
ordinal > 7.42M
MAX_UNICODE7.42M
) {
  Branch (2927:9): [True: 1, False: 7.42M]
  Branch (2927:24): [True: 5, False: 7.42M]
2928
        PyErr_SetString(PyExc_ValueError,
2929
                        "chr() arg not in range(0x110000)");
2930
        return NULL;
2931
    }
2932
2933
    return unicode_char((Py_UCS4)ordinal);
2934
}
2935
2936
/* Return a new reference to an exact str holding the data of `obj`.

   - An exact str is returned as is (with its reference count incremented).
   - An instance of a str subtype is copied into a true str object.
   - Anything else raises TypeError and NULL is returned.

   XXX Perhaps we should make this API an alias of PyObject_Str()
   instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (!PyUnicode_CheckExact(obj)) {
        /* For a Unicode subtype that's not a Unicode object,
           return a true Unicode object with the same data. */
        return _PyUnicode_Copy(obj);
    }

    Py_INCREF(obj);
    return obj;
}
2955
2956
/* Decode a bytes or other buffer-exporting object into a str.

   - obj == NULL raises via PyErr_BadInternalCall().
   - A str argument raises TypeError ("decoding str is not supported").
   - An object that does not export a buffer raises TypeError.
   - Empty input yields the empty string, after `encoding` and `errors`
     have been checked by unicode_check_encoding_errors().

   Return a new object, or NULL with an exception set on error. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len != 0) {
        PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                             encoding, errors);
        PyBuffer_Release(&view);
        return decoded;
    }

    /* Empty buffer: release it first, then only validate the arguments. */
    PyBuffer_Release(&view);
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    _Py_RETURN_UNICODE_EMPTY();
}
3008
3009
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_', except at the very start of the
   output and at the end of the name, where it is dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    int pending_sep = 0;

    for (const char *src = encoding; *src != 0; src++) {
        const char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the pending separator, unless nothing was written yet. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3057
3058
PyObject *
3059
PyUnicode_Decode(const char *s,
3060
                 Py_ssize_t size,
3061
                 const char *encoding,
3062
                 const char *errors)
3063
{
3064
    PyObject *buffer = NULL, *unicode;
3065
    Py_buffer info;
3066
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3067
3068
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
  Branch (3068:9): [True: 0, False: 6.44M]
3069
        return NULL;
3070
    }
3071
3072
    if (size == 0) {
  Branch (3072:9): [True: 4, False: 6.44M]
3073
        _Py_RETURN_UNICODE_EMPTY();
3074
    }
3075
3076
    if (encoding == NULL) {
  Branch (3076:9): [True: 4.88k, False: 6.44M]
3077
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3078
    }
3079
3080
    /* Shortcuts for common default encodings */
3081
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
  Branch (3081:9): [True: 6.40M, False: 36.8k]
3082
        char *lower = buflower;
3083
3084
        /* Fast paths */
3085
        if (lower[0] == 'u' && 
lower[1] == 't'857k
&&
lower[2] == 'f'857k
) {
  Branch (3085:13): [True: 857k, False: 5.54M]
  Branch (3085:32): [True: 857k, False: 146]
  Branch (3085:51): [True: 857k, False: 0]
3086
            lower += 3;
3087
            if (*lower == '_') {
  Branch (3087:17): [True: 857k, False: 742]
3088
                /* Match "utf8" and "utf_8" */
3089
                lower++;
3090
            }
3091
3092
            if (lower[0] == '8' && 
lower[1] == 0853k
) {
  Branch (3092:17): [True: 853k, False: 4.48k]
  Branch (3092:36): [True: 853k, False: 38]
3093
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3094
            }
3095
            else if (lower[0] == '1' && 
lower[1] == '6'3.20k
&&
lower[2] == 03.20k
) {
  Branch (3095:22): [True: 3.20k, False: 1.31k]
  Branch (3095:41): [True: 3.20k, False: 0]
  Branch (3095:60): [True: 1.06k, False: 2.14k]
3096
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3097
            }
3098
            else if (lower[0] == '3' && 
lower[1] == '2'92
&&
lower[2] == 092
) {
  Branch (3098:22): [True: 92, False: 3.36k]
  Branch (3098:41): [True: 92, False: 0]
  Branch (3098:60): [True: 33, False: 59]
3099
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3100
            }
3101
        }
3102
        else {
3103
            if (strcmp(lower, "ascii") == 0
  Branch (3103:17): [True: 652k, False: 4.89M]
3104
                || 
strcmp(lower, "us_ascii") == 04.89M
) {
  Branch (3104:20): [True: 139, False: 4.89M]
3105
                return PyUnicode_DecodeASCII(s, size, errors);
3106
            }
3107
    #ifdef MS_WINDOWS
3108
            else if (strcmp(lower, "mbcs") == 0) {
3109
                return PyUnicode_DecodeMBCS(s, size, errors);
3110
            }
3111
    #endif
3112
            else if (strcmp(lower, "latin1") == 0
  Branch (3112:22): [True: 7.10k, False: 4.89M]
3113
                     || 
strcmp(lower, "latin_1") == 04.89M
  Branch (3113:25): [True: 4.86M, False: 26.7k]
3114
                     || 
strcmp(lower, "iso_8859_1") == 026.7k
  Branch (3114:25): [True: 1.93k, False: 24.8k]
3115
                     || 
strcmp(lower, "iso8859_1") == 024.8k
) {
  Branch (3115:25): [True: 21.5k, False: 3.30k]
3116
                return PyUnicode_DecodeLatin1(s, size, errors);
3117
            }
3118
        }
3119
    }
3120
3121
    /* Decode via the codec registry */
3122
    buffer = NULL;
3123
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
  Branch (3123:9): [True: 0, False: 43.5k]
3124
        goto onError;
3125
    buffer = PyMemoryView_FromBuffer(&info);
3126
    if (buffer == NULL)
  Branch (3126:9): [True: 0, False: 43.5k]
3127
        goto onError;
3128
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3129
    if (unicode == NULL)
  Branch (3129:9): [True: 99, False: 43.4k]
3130
        goto onError;
3131
    if (!PyUnicode_Check(unicode)) {
  Branch (3131:9): [True: 2, False: 43.4k]
3132
        PyErr_Format(PyExc_TypeError,
3133
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3134
                     "use codecs.decode() to decode to arbitrary types",
3135
                     encoding,
3136
                     Py_TYPE(unicode)->tp_name);
3137
        Py_DECREF(unicode);
3138
        goto onError;
3139
    }
3140
    Py_DECREF(buffer);
3141
    return unicode_result(unicode);
3142
3143
  onError:
3144
    Py_XDECREF(buffer);
3145
    return NULL;
3146
}
3147
3148
/* Deprecated: decode a str object through the codec registry and return
   whatever object the codec produces.

   A DeprecationWarning is issued on every call.  When `encoding` is NULL
   the default encoding is used.  Return NULL with an exception set if
   `unicode` is not a str, if the warning is turned into an error, or if
   decoding fails. */
PyObject *
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    int warn_status = PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsDecodedObject() is deprecated; "
                     "use PyCodec_Decode() to decode from str", 1);
    if (warn_status < 0) {
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3169
3170
/* Deprecated: decode a str object through the codec registry; the codec
   must return a str.

   A DeprecationWarning is issued on every call.  When `encoding` is NULL
   the default encoding is used.  Return NULL with an exception set if
   `unicode` is not a str, if the warning is turned into an error, if
   decoding fails, or if the decoder returns something other than a str
   (TypeError). */
PyObject *
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsDecodedUnicode() is deprecated; "
                     "use PyCodec_Decode() to decode from str to str", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;
    if (!PyUnicode_Check(v)) {
        /* Report the type of the object the decoder returned.  The input
           is known to be a str at this point, so naming its type would
           produce a self-contradictory message. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3208
3209
/* Deprecated: encode a str object through the codec registry and return
   whatever object the codec produces.

   A DeprecationWarning is issued on every call.  When `encoding` is NULL
   the default encoding is used.  Return NULL with an exception set if
   `unicode` is not a str, if the warning is turned into an error, or if
   encoding fails. */
PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    int warn_status = PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsEncodedObject() is deprecated; "
                     "use PyUnicode_AsEncodedString() to encode from str to bytes "
                     "or PyCodec_Encode() for generic encoding", 1);
    if (warn_status < 0) {
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; the result (NULL on failure) is
       returned unchanged. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3239
3240
3241
static PyObject *
3242
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3243
                      int current_locale)
3244
{
3245
    Py_ssize_t wlen;
3246
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3247
    if (wstr == NULL) {
  Branch (3247:9): [True: 0, False: 7.99k]
3248
        return NULL;
3249
    }
3250
3251
    if ((size_t)wlen != wcslen(wstr)) {
  Branch (3251:9): [True: 0, False: 7.99k]
3252
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3253
        PyMem_Free(wstr);
3254
        return NULL;
3255
    }
3256
3257
    char *str;
3258
    size_t error_pos;
3259
    const char *reason;
3260
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3261
                                 current_locale, error_handler);
3262
    PyMem_Free(wstr);
3263
3264
    if (res != 0) {
  Branch (3264:9): [True: 0, False: 7.99k]
3265
        if (res == -2) {
  Branch (3265:13): [True: 0, False: 0]
3266
            PyObject *exc;
3267
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3268
                    "locale", unicode,
3269
                    (Py_ssize_t)error_pos,
3270
                    (Py_ssize_t)(error_pos+1),
3271
                    reason);
3272
            if (exc != NULL) {
  Branch (3272:17): [True: 0, False: 0]
3273
                PyCodec_StrictErrors(exc);
3274
                Py_DECREF(exc);
3275
            }
3276
        }
3277
        else if (res == -3) {
  Branch (3277:18): [True: 0, False: 0]
3278
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3279
        }
3280
        else {
3281
            PyErr_NoMemory();
3282
        }
3283
        return NULL;
3284
    }
3285
3286
    PyObject *bytes = PyBytes_FromString(str);
3287
    PyMem_RawFree(str);
3288
    return bytes;
3289
}
3290
3291
PyObject *
3292
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3293
{
3294
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3295
    return unicode_encode_locale(unicode, error_handler, 1);
3296
}
3297
3298
PyObject *
3299
PyUnicode_EncodeFSDefault(PyObject *unicode)
3300
{
3301
    PyInterpreterState *interp = _PyInterpreterState_GET();
3302
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3303
    if (fs_codec->utf8) {
  Branch (3303:9): [True: 655k, False: 7.10k]
3304
        return unicode_encode_utf8(unicode,
3305
                                   fs_codec->error_handler,
3306
                                   fs_codec->errors);
3307
    }
3308
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3309
    else if (fs_codec->encoding) {
  Branch (3309:14): [True: 400, False: 6.70k]
3310
        return PyUnicode_AsEncodedString(unicode,
3311
                                         fs_codec->encoding,
3312
                                         fs_codec->errors);
3313
    }
3314
#endif
3315
    else {
3316
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3317
           machinery is not ready and so cannot be used:
3318
           use wcstombs() in this case. */
3319
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3320
        const wchar_t *filesystem_errors = config->filesystem_errors;
3321
        assert(filesystem_errors != NULL);
3322
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3323
        assert(errors != _Py_ERROR_UNKNOWN);
3324
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3325
        return unicode_encode_utf8(unicode, errors, NULL);
3326
#else
3327
        return unicode_encode_locale(unicode, errors, 0);
3328
#endif
3329
    }
3330
}
3331
3332
PyObject *
3333
PyUnicode_AsEncodedString(PyObject *unicode,
3334
                          const char *encoding,
3335
                          const char *errors)
3336
{
3337
    PyObject *v;
3338
    char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3339
3340
    if (!PyUnicode_Check(unicode)) {
  Branch (3340:9): [True: 0, False: 2.35M]
3341
        PyErr_BadArgument();
3342
        return NULL;
3343
    }
3344
3345
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
  Branch (3345:9): [True: 0, False: 2.35M]
3346
        return NULL;
3347
    }
3348
3349
    if (encoding == NULL) {
  Branch (3349:9): [True: 87.8k, False: 2.26M]
3350
        return _PyUnicode_AsUTF8String(unicode, errors);
3351
    }
3352
3353
    /* Shortcuts for common default encodings */
3354
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
  Branch (3354:9): [True: 1.20M, False: 1.06M]
3355
        char *lower = buflower;
3356
3357
        /* Fast paths */
3358
        if (lower[0] == 'u' && 
lower[1] == 't'931k
&&
lower[2] == 'f'924k
) {
  Branch (3358:13): [True: 931k, False: 268k]
  Branch (3358:32): [True: 924k, False: 7.02k]
  Branch (3358:51): [True: 924k, False: 0]
3359
            lower += 3;
3360
            if (*lower == '_') {
  Branch (3360:17): [True: 909k, False: 15.2k]
3361
                /* Match "utf8" and "utf_8" */
3362
                lower++;
3363
            }
3364
3365
            if (lower[0] == '8' && 
lower[1] == 0915k
) {
  Branch (3365:17): [True: 915k, False: 9.38k]
  Branch (3365:36): [True: 914k, False: 725]
3366
                return _PyUnicode_AsUTF8String(unicode, errors);
3367
            }
3368
            else if (lower[0] == '1' && 
lower[1] == '6'5.33k
&&
lower[2] == 05.33k
) {
  Branch (3368:22): [True: 5.33k, False: 4.77k]
  Branch (3368:41): [True: 5.33k, False: 0]
  Branch (3368:60): [True: 1.81k, False: 3.52k]
3369
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3370
            }
3371
            else if (lower[0] == '3' && 
lower[1] == '2'2.20k
&&
lower[2] == 02.20k
) {
  Branch (3371:22): [True: 2.20k, False: 6.09k]
  Branch (3371:41): [True: 2.20k, False: 0]
  Branch (3371:60): [True: 767, False: 1.43k]
3372
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3373
            }
3374
        }
3375
        else {
3376
            if (strcmp(lower, "ascii") == 0
  Branch (3376:17): [True: 252k, False: 23.5k]
3377
                || 
strcmp(lower, "us_ascii") == 023.5k
) {
  Branch (3377:20): [True: 7.02k, False: 16.5k]
3378
                return _PyUnicode_AsASCIIString(unicode, errors);
3379
            }
3380
#ifdef MS_WINDOWS
3381
            else if (strcmp(lower, "mbcs") == 0) {
3382
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3383
            }
3384
#endif
3385
            else if (strcmp(lower, "latin1") == 0 ||
  Branch (3385:22): [True: 900, False: 15.6k]
3386
                     
strcmp(lower, "latin_1") == 015.6k
||
  Branch (3386:22): [True: 10.7k, False: 4.86k]
3387
                     
strcmp(lower, "iso_8859_1") == 04.86k
||
  Branch (3387:22): [True: 2.14k, False: 2.72k]
3388
                     
strcmp(lower, "iso8859_1") == 02.72k
) {
  Branch (3388:22): [True: 45, False: 2.67k]
3389
                return _PyUnicode_AsLatin1String(unicode, errors);
3390
            }
3391
        }
3392
    }
3393
3394
    /* Encode via the codec registry */
3395
    v = _PyCodec_EncodeText(unicode, encoding, errors);
3396
    if (v == NULL)
  Branch (3396:9): [True: 60, False: 1.07M]
3397
        return NULL;
3398
3399
    /* The normal path */
3400
    if (PyBytes_Check(v))
3401
        return v;
3402
3403
    /* If the codec returns a buffer, raise a warning and convert to bytes */
3404
    if (PyByteArray_Check(v)) {
3405
        int error;
3406
        PyObject *b;
3407
3408
        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3409
            "encoder %s returned bytearray instead of bytes; "
3410
            "use codecs.encode() to encode to arbitrary types",
3411
            encoding);
3412
        if (error) {
  Branch (3412:13): [True: 0, False: 0]
3413
            Py_DECREF(v);
3414
            return NULL;
3415
        }
3416
3417
        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3418
                                      PyByteArray_GET_SIZE(v));
3419
        Py_DECREF(v);
3420
        return b;
3421
    }
3422
3423
    PyErr_Format(PyExc_TypeError,
3424
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3425
                 "use codecs.encode() to encode to arbitrary types",
3426
                 encoding,
3427
                 Py_TYPE(v)->tp_name);
3428
    Py_DECREF(v);
3429
    return NULL;
3430
}
3431
3432
PyObject *
3433
PyUnicode_AsEncodedUnicode(PyObject *unicode,
3434
                           const char *encoding,
3435
                           const char *errors)
3436
{
3437
    PyObject *v;
3438
3439
    if (!PyUnicode_Check(unicode)) {
  Branch (3439:9): [True: 0, False: 0]
3440
        PyErr_BadArgument();
3441
        goto onError;
3442
    }
3443
3444
    if (PyErr_WarnEx(PyExc_DeprecationWarning,
  Branch (3444:9): [True: 0, False: 0]
3445
                     "PyUnicode_AsEncodedUnicode() is deprecated; "
3446
                     "use PyCodec_Encode() to encode from str to str", 1) < 0)
3447
        return NULL;
3448
3449
    if (encoding == NULL)
  Branch (3449:9): [True: 0, False: 0]
3450
        encoding = PyUnicode_GetDefaultEncoding();
3451
3452
    /* Encode via the codec registry */
3453
    v = PyCodec_Encode(unicode, encoding, errors);
3454
    if (v == NULL)
  Branch (3454:9): [True: 0, False: 0]
3455
        goto onError;
3456
    if (!PyUnicode_Check(v)) {
  Branch (3456:9): [True: 0, False: 0]
3457
        PyErr_Format(PyExc_TypeError,
3458
                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3459
                     "use codecs.encode() to encode to arbitrary types",
3460
                     encoding,
3461
                     Py_TYPE(v)->tp_name);
3462
        Py_DECREF(v);
3463
        goto onError;
3464
    }
3465
    return v;
3466
3467
  onError:
3468
    return NULL;
3469
}
3470
3471
static PyObject*
3472
unicode_decode_locale(const char *str, Py_ssize_t len,
3473
                      _Py_error_handler errors, int current_locale)
3474
{
3475
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
  Branch (3475:9): [True: 0, False: 279k]
  Branch (3475:29): [True: 0, False: 279k]
3476
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3477
        return NULL;
3478
    }
3479
3480
    wchar_t *wstr;
3481
    size_t wlen;
3482
    const char *reason;
3483
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3484
                                 current_locale, errors);
3485
    if (res != 0) {
  Branch (3485:9): [True: 0, False: 279k]
3486
        if (res == -2) {
  Branch (3486:13): [True: 0, False: 0]
3487
            PyObject *exc;
3488
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3489
                                        "locale", str, len,
3490
                                        (Py_ssize_t)wlen,
3491
                                        (Py_ssize_t)(wlen + 1),
3492
                                        reason);
3493
            if (exc != NULL) {
  Branch (3493:17): [True: 0, False: 0]
3494
                PyCodec_StrictErrors(exc);
3495
                Py_DECREF(exc);
3496
            }
3497
        }
3498
        else if (res == -3) {
  Branch (3498:18): [True: 0, False: 0]
3499
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3500
        }
3501
        else {
3502
            PyErr_NoMemory();
3503
        }
3504
        return NULL;
3505
    }
3506
3507
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3508
    PyMem_RawFree(wstr);
3509
    return unicode;
3510
}
3511
3512
PyObject*
3513
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3514
                              const char *errors)
3515
{
3516
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3517
    return unicode_decode_locale(str, len, error_handler, 1);
3518
}
3519
3520
PyObject*
3521
PyUnicode_DecodeLocale(const char *str, const char *errors)
3522
{
3523
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3524
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3525
    return unicode_decode_locale(str, size, error_handler, 1);
3526
}
3527
3528
3529
PyObject*
3530
PyUnicode_DecodeFSDefault(const char *s) {
3531
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3532
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3533
}
3534
3535
PyObject*
3536
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3537
{
3538
    PyInterpreterState *interp = _PyInterpreterState_GET();
3539
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3540
    if (fs_codec->utf8) {
  Branch (3540:9): [True: 333k, False: 92.5k]
3541
        return unicode_decode_utf8(s, size,
3542
                                   fs_codec->error_handler,
3543
                                   fs_codec->errors,
3544
                                   NULL);
3545
    }
3546
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3547
    else if (fs_codec->encoding) {
  Branch (3547:14): [True: 364, False: 92.1k]
3548
        return PyUnicode_Decode(s, size,
3549
                                fs_codec->encoding,
3550
                                fs_codec->errors);
3551
    }
3552
#endif
3553
    else {
3554
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3555
           machinery is not ready and so cannot be used:
3556
           use mbstowcs() in this case. */
3557
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3558
        const wchar_t *filesystem_errors = config->filesystem_errors;
3559
        assert(filesystem_errors != NULL);
3560
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3561
        assert(errors != _Py_ERROR_UNKNOWN);
3562
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3563
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3564
#else
3565
        return unicode_decode_locale(s, size, errors, 0);
3566
#endif
3567
    }
3568
}
3569
3570
3571
int
3572
PyUnicode_FSConverter(PyObject* arg, void* addr)
3573
{
3574
    PyObject *path = NULL;
3575
    PyObject *output = NULL;
3576
    Py_ssize_t size;
3577
    const char *data;
3578
    if (arg == NULL) {
  Branch (3578:9): [True: 1, False: 698k]
3579
        Py_DECREF(*(PyObject**)addr);
3580
        *(PyObject**)addr = NULL;
3581
        return 1;
3582
    }
3583
    path = PyOS_FSPath(arg);
3584
    if (path == NULL) {
  Branch (3584:9): [True: 11, False: 698k]
3585
        return 0;
3586
    }
3587
    if (PyBytes_Check(path)) {
3588
        output = path;
3589
    }
3590
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3591
        output = PyUnicode_EncodeFSDefault(path);
3592
        Py_DECREF(path);
3593
        if (!output) {
  Branch (3593:13): [True: 54, False: 661k]
3594
            return 0;
3595
        }
3596
        assert(PyBytes_Check(output));
3597
    }
3598
3599
    size = PyBytes_GET_SIZE(output);
3600
    data = PyBytes_AS_STRING(output);
3601
    if ((size_t)size != strlen(data)) {
  Branch (3601:9): [True: 72, False: 698k]
3602
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3603
        Py_DECREF(output);
3604
        return 0;
3605
    }
3606
    *(PyObject**)addr = output;
3607
    return Py_CLEANUP_SUPPORTED;
3608
}
3609
3610
3611
int
3612
PyUnicode_FSDecoder(PyObject* arg, void* addr)
3613
{
3614
    int is_buffer = 0;
3615
    PyObject *path = NULL;
3616
    PyObject *output = NULL;
3617
    if (arg == NULL) {
  Branch (3617:9): [True: 0, False: 17.8k]
3618
        Py_DECREF(*(PyObject**)addr);
3619
        *(PyObject**)addr = NULL;
3620
        return 1;
3621
    }
3622
3623
    is_buffer = PyObject_CheckBuffer(arg);
3624
    if (!is_buffer) {
  Branch (3624:9): [True: 17.8k, False: 6]
3625
        path = PyOS_FSPath(arg);
3626
        if (path == NULL) {
  Branch (3626:13): [True: 2, False: 17.8k]
3627
            return 0;
3628
        }
3629
    }
3630
    else {
3631
        path = arg;
3632
        Py_INCREF(arg);
3633
    }
3634
3635
    if (PyUnicode_Check(path)) {
3636
        output = path;
3637
    }
3638
    else if (PyBytes_Check(path) || 
is_buffer4
) {
  Branch (3638:37): [True: 4, False: 0]
3639
        PyObject *path_bytes = NULL;
3640
3641
        if (!PyBytes_Check(path) &&
  Branch (3641:13): [True: 4, False: 2]
3642
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
  Branch (3642:13): [True: 0, False: 4]
3643
            "path should be string, bytes, or os.PathLike, not %.200s",
3644
            Py_TYPE(arg)->tp_name)) {
3645
                Py_DECREF(path);
3646
            return 0;
3647
        }
3648
        path_bytes = PyBytes_FromObject(path);
3649
        Py_DECREF(path);
3650
        if (!path_bytes) {
  Branch (3650:13): [True: 0, False: 6]
3651
            return 0;
3652
        }
3653
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3654
                                                  PyBytes_GET_SIZE(path_bytes));
3655
        Py_DECREF(path_bytes);
3656
        if (!output) {
  Branch (3656:13): [True: 0, False: 6]
3657
            return 0;
3658
        }
3659
    }
3660
    else {
3661
        PyErr_Format(PyExc_TypeError,
3662
                     "path should be string, bytes, or os.PathLike, not %.200s",
3663
                     Py_TYPE(arg)->tp_name);
3664
        Py_DECREF(path);
3665
        return 0;
3666
    }
3667
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
  Branch (3667:9): [True: 0, False: 17.8k]
3668
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3669
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3670
        Py_DECREF(output);
3671
        return 0;
3672
    }
3673
    *(PyObject**)addr = output;
3674
    return Py_CLEANUP_SUPPORTED;
3675
}
3676
3677
3678
static int unicode_fill_utf8(PyObject *unicode);
3679
3680
const char *
3681
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3682
{
3683
    if (!PyUnicode_Check(unicode)) {
  Branch (3683:9): [True: 0, False: 14.6M]
3684
        PyErr_BadArgument();
3685
        return NULL;
3686
    }
3687
3688
    if (PyUnicode_UTF8(unicode) == NULL) {
  Branch (3688:9): [True: 3.26k, False: 14.6M]
3689
        if (unicode_fill_utf8(unicode) == -1) {
  Branch (3689:13): [True: 69, False: 3.19k]
3690
            return NULL;
3691
        }
3692
    }
3693
3694
    if (psize)
  Branch (3694:9): [True: 14.4M, False: 241k]
3695
        *psize = PyUnicode_UTF8_LENGTH(unicode);
3696
    return PyUnicode_UTF8(unicode);
3697
}
3698
3699
const char *
3700
PyUnicode_AsUTF8(PyObject *unicode)
3701
{
3702
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3703
}
3704
3705
/*
3706
PyUnicode_GetSize() has been deprecated since Python 3.3
3707
because it returned length of Py_UNICODE.
3708
3709
But this function is part of stable abi, because it don't
3710
include Py_UNICODE in signature and it was not excluded from
3711
stable abi in PEP 384.
3712
*/
3713
PyAPI_FUNC(Py_ssize_t)
3714
PyUnicode_GetSize(PyObject *unicode)
3715
{
3716
    PyErr_SetString(PyExc_RuntimeError,
3717
                    "PyUnicode_GetSize has been removed.");
3718
    return -1;
3719
}
3720
3721
Py_ssize_t
3722
PyUnicode_GetLength(PyObject *unicode)
3723
{
3724
    if (!PyUnicode_Check(unicode)) {
  Branch (3724:9): [True: 0, False: 14.5k]
3725
        PyErr_BadArgument();
3726
        return -1;
3727
    }
3728
    return PyUnicode_GET_LENGTH(unicode);
3729
}
3730
3731
Py_UCS4
3732
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3733
{
3734
    const void *data;
3735
    int kind;
3736
3737
    if (!PyUnicode_Check(unicode)) {
  Branch (3737:9): [True: 0, False: 17]
3738
        PyErr_BadArgument();
3739
        return (Py_UCS4)-1;
3740
    }
3741
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
  Branch (3741:9): [True: 0, False: 17]
  Branch (3741:22): [True: 0, False: 17]
3742
        PyErr_SetString(PyExc_IndexError, "string index out of range");
3743
        return (Py_UCS4)-1;
3744
    }
3745
    data = PyUnicode_DATA(unicode);
3746
    kind = PyUnicode_KIND(unicode);
3747
    return PyUnicode_READ(kind, data, index);
3748
}
3749
3750
int
3751
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3752
{
3753
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
  Branch (3753:9): [True: 0, False: 12]
  Branch (3753:38): [True: 0, False: 12]
3754
        PyErr_BadArgument();
3755
        return -1;
3756
    }
3757
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
  Branch (3757:9): [True: 0, False: 12]
  Branch (3757:22): [True: 0, False: 12]
3758
        PyErr_SetString(PyExc_IndexError, "string index out of range");
3759
        return -1;
3760
    }
3761
    if (unicode_check_modifiable(unicode))
  Branch (3761:9): [True: 0, False: 12]
3762
        return -1;
3763
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
  Branch (3763:9): [True: 0, False: 12]
3764
        PyErr_SetString(PyExc_ValueError, "character out of range");
3765
        return -1;
3766
    }
3767
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3768
                    index, ch);
3769
    return 0;
3770
}
3771
3772
const char *
3773
PyUnicode_GetDefaultEncoding(void)
3774
{
3775
    return "utf-8";
3776
}
3777
3778
/* create or adjust a UnicodeDecodeError */
3779
static void
3780
make_decode_exception(PyObject **exceptionObject,
3781
                      const char *encoding,
3782
                      const char *input, Py_ssize_t length,
3783
                      Py_ssize_t startpos, Py_ssize_t endpos,
3784
                      const char *reason)
3785
{
3786
    if (*exceptionObject == NULL) {
  Branch (3786:9): [True: 2.12k, False: 1.82k]
3787
        *exceptionObject = PyUnicodeDecodeError_Create(
3788
            encoding, input, length, startpos, endpos, reason);
3789
    }
3790
    else {
3791
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
  Branch (3791:13): [True: 0, False: 1.82k]
3792
            goto onError;
3793
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
  Branch (3793:13): [True: 0, False: 1.82k]
3794
            goto onError;
3795
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
  Branch (3795:13): [True: 0, False: 1.82k]
3796
            goto onError;
3797
    }
3798
    return;
3799
3800
onError:
3801
    Py_CLEAR(*exceptionObject);
3802
}
3803
3804
#ifdef MS_WINDOWS
3805
static int
3806
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
3807
{
3808
    if (newsize > *size) {
3809
        wchar_t *newbuf = *buf;
3810
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
3811
            PyErr_NoMemory();
3812
            return -1;
3813
        }
3814
        *buf = newbuf;
3815
    }
3816
    *size = newsize;
3817
    return 0;
3818
}
3819
3820
/* error handling callback helper:
3821
   build arguments, call the callback and check the arguments,
3822
   if no exception occurred, copy the replacement to the output
3823
   and adjust various state variables.
3824
   return 0 on success, -1 on error
3825
*/
3826
3827
static int
3828
unicode_decode_call_errorhandler_wchar(
3829
    const char *errors, PyObject **errorHandler,
3830
    const char *encoding, const char *reason,
3831
    const char **input, const char **inend, Py_ssize_t *startinpos,
3832
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3833
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
3834
{
3835
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
3836
3837
    PyObject *restuple = NULL;
3838
    PyObject *repunicode = NULL;
3839
    Py_ssize_t outsize;
3840
    Py_ssize_t insize;
3841
    Py_ssize_t requiredsize;
3842
    Py_ssize_t newpos;
3843
    PyObject *inputobj = NULL;
3844
    Py_ssize_t repwlen;
3845
3846
    if (*errorHandler == NULL) {
3847
        *errorHandler = PyCodec_LookupError(errors);
3848
        if (*errorHandler == NULL)
3849
            goto onError;
3850
    }
3851
3852
    make_decode_exception(exceptionObject,
3853
        encoding,
3854
        *input, *inend - *input,
3855
        *startinpos, *endinpos,
3856
        reason);
3857
    if (*exceptionObject == NULL)
3858
        goto onError;
3859
3860
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
3861
    if (restuple == NULL)
3862
        goto onError;
3863
    if (!PyTuple_Check(restuple)) {
3864
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
3865
        goto onError;
3866
    }
3867
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
3868
        goto onError;
3869
3870
    /* Copy back the bytes variables, which might have been modified by the
3871
       callback */
3872
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3873
    if (!inputobj)
3874
        goto onError;
3875
    *input = PyBytes_AS_STRING(inputobj);
3876
    insize = PyBytes_GET_SIZE(inputobj);
3877
    *inend = *input + insize;
3878
    /* we can DECREF safely, as the exception has another reference,
3879
       so the object won't go away. */
3880
    Py_DECREF(inputobj);
3881
3882
    if (newpos<0)
3883
        newpos = insize+newpos;
3884
    if (newpos<0 || newpos>insize) {
3885
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3886
        goto onError;
3887
    }
3888
3889
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
3890
    if (repwlen < 0)
3891
        goto onError;
3892
    repwlen--;
3893
    /* need more space? (at least enough for what we
3894
       have+the replacement+the rest of the string (starting
3895
       at the new input position), so we won't have to check space
3896
       when there are no errors in the rest of the string) */
3897
    requiredsize = *outpos;
3898
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
3899
        goto overflow;
3900
    requiredsize += repwlen;
3901
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
3902
        goto overflow;
3903
    requiredsize += insize - newpos;
3904
    outsize = *bufsize;
3905
    if (requiredsize > outsize) {
3906
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
3907
            requiredsize = 2*outsize;
3908
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
3909
            goto onError;
3910
        }
3911
    }
3912
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
3913
    *outpos += repwlen;
3914
    *endinpos = newpos;
3915
    *inptr = *input + newpos;
3916
3917
    /* we made it! */
3918
    Py_DECREF(restuple);
3919
    return 0;
3920
3921
  overflow:
3922
    PyErr_SetString(PyExc_OverflowError,
3923
                    "decoded result is too long for a Python string");
3924
3925
  onError:
3926
    Py_XDECREF(restuple);
3927
    return -1;
3928
}
3929
#endif   /* MS_WINDOWS */
3930
3931
static int
3932
unicode_decode_call_errorhandler_writer(
3933
    const char *errors, PyObject **errorHandler,
3934
    const char *encoding, const char *reason,
3935
    const char **input, const char **inend, Py_ssize_t *startinpos,
3936
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3937
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
3938
{
3939
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
3940
3941
    PyObject *restuple = NULL;
3942
    PyObject *repunicode = NULL;
3943
    Py_ssize_t insize;
3944
    Py_ssize_t newpos;
3945
    Py_ssize_t replen;
3946
    Py_ssize_t remain;
3947
    PyObject *inputobj = NULL;
3948
    int need_to_grow = 0;
3949
    const char *new_inptr;
3950
3951
    if (*errorHandler == NULL) {
  Branch (3951:9): [True: 2.12k, False: 1.82k]
3952
        *errorHandler = PyCodec_LookupError(errors);
3953
        if (*errorHandler == NULL)
  Branch (3953:13): [True: 1, False: 2.12k]
3954
            goto onError;
3955
    }
3956
3957
    make_decode_exception(exceptionObject,
3958
        encoding,
3959
        *input, *inend - *input,
3960
        *startinpos, *endinpos,
3961
        reason);
3962
    if (*exceptionObject == NULL)
  Branch (3962:9): [True: 0, False: 3.95k]
3963
        goto onError;
3964
3965
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
3966
    if (restuple == NULL)
  Branch (3966:9): [True: 1.46k, False: 2.48k]
3967
        goto onError;
3968
    if (!PyTuple_Check(restuple)) {
  Branch (3968:9): [True: 12, False: 2.47k]
3969
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
3970
        goto onError;
3971
    }
3972
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
  Branch (3972:9): [True: 22, False: 2.45k]
3973
        goto onError;
3974
3975
    /* Copy back the bytes variables, which might have been modified by the
3976
       callback */
3977
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3978
    if (!inputobj)
  Branch (3978:9): [True: 7, False: 2.44k]
3979
        goto onError;
3980
    remain = *inend - *input - *endinpos;
3981
    *input = PyBytes_AS_STRING(inputobj);
3982
    insize = PyBytes_GET_SIZE(inputobj);
3983
    *inend = *input + insize;
3984
    /* we can DECREF safely, as the exception has another reference,
3985
       so the object won't go away. */
3986
    Py_DECREF(inputobj);
3987
3988
    if (newpos<0)
  Branch (3988:9): [True: 3, False: 2.44k]
3989
        newpos = insize+newpos;
3990
    if (newpos<0 || 
newpos>insize2.44k
) {
  Branch (3990:9): [True: 1, False: 2.44k]
  Branch (3990:21): [True: 1, False: 2.44k]
3991
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3992
        goto onError;
3993
    }
3994
3995
    replen = PyUnicode_GET_LENGTH(repunicode);
3996
    if (replen > 1) {
  Branch (3996:9): [True: 71, False: 2.37k]
3997
        writer->min_length += replen - 1;
3998
        need_to_grow = 1;
3999
    }
4000
    new_inptr = *input + newpos;
4001
    if (*inend - new_inptr > remain) {
  Branch (4001:9): [True: 267, False: 2.17k]
4002
        /* We don't know the decoding algorithm here so we make the worst
4003
           assumption that one byte decodes to one unicode character.
4004
           If unfortunately one byte could decode to more unicode characters,
4005
           the decoder may write out-of-bound then.  Is it possible for the
4006
           algorithms using this function? */
4007
        writer->min_length += *inend - new_inptr - remain;
4008
        need_to_grow = 1;
4009
    }
4010
    if (need_to_grow) {
  Branch (4010:9): [True: 337, False: 2.10k]
4011
        writer->overallocate = 1;
4012
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
  Branch (4012:13): [True: 0, False: 337]
4013
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4014
            goto onError;
4015
    }
4016
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
  Branch (4016:9): [True: 0, False: 2.44k]
4017
        goto onError;
4018
4019
    *endinpos = newpos;
4020
    *inptr = new_inptr;
4021
4022
    /* we made it! */
4023
    Py_DECREF(restuple);
4024
    return 0;
4025
4026
  onError:
4027
    Py_XDECREF(restuple);
4028
    return -1;
4029
}
4030
4031
/* --- UTF-7 Codec -------------------------------------------------------- */
4032
4033
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4034
4035
/* Three simple macros defining base-64. */
4036
4037
/* Is c a base-64 character? */
4038
4039
#define IS_BASE64(c) \
4040
    (
(69.7k
(c) >= 'A'69.7k
&&
(c) <= 'Z'66.5k
) || \
4041
     
(29.8k
(c) >= 'a'29.8k
&&
(c) <= 'z'26.7k
) || \
4042
     
(3.16k
(c) >= '0'3.16k
&&
(c) <= '9'474
) || \
4043
     
(c) == '+'2.70k
||
(c) == '/'2.68k
)
4044
4045
/* given that c is a base-64 character, what is its base-64 value? */
4046
4047
#define FROM_BASE64(c)                                                  \
4048
    (((c) >= 'A' && 
(c) <= 'Z'50.1k
) ?
(c) - 'A'37.0k
: \
4049
     
(13.5k
(c) >= 'a'13.5k
&&
(c) <= 'z'13.0k
) ?
(c) - 'a' + 2613.0k
: \
4050
     
(456
(c) >= '0'456
&&
(c) <= '9'402
) ?
(c) - '0' + 52402
: \
4051
     
(c) == '+'54
?
6218
:
6336
)
4052
4053
/* What is the base-64 character of the bottom 6 bits of n? */
4054
4055
#define TO_BASE64(n)  \
4056
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4057
4058
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4059
 * decoded as itself.  We are permissive on decoding; the only ASCII
4060
 * byte not decoding to itself is the + which begins a base64
4061
 * string. */
4062
4063
#define DECODE_DIRECT(c)                                \
4064
    ((c) <= 127 && 
(c) != '+'294k
)
4065
4066
/* The UTF-7 encoder treats ASCII characters differently according to
4067
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4068
 * the above).  See RFC2152.  This array identifies these different
4069
 * sets:
4070
 * 0 : "Set D"
4071
 *     alphanumeric and '(),-./:?
4072
 * 1 : "Set O"
4073
 *     !"#$%&*;<=>@[]^_`{|}
4074
 * 2 : "whitespace"
4075
 *     ht nl cr sp
4076
 * 3 : special (must be base64 encoded)
4077
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4078
 */
4079
4080
static
4081
char utf7_category[128] = {
4082
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
4083
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
4084
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
4085
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
4086
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
4087
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
4088
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
4089
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
4090
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
4091
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4092
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
4093
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
4094
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
4095
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4096
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
4097
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
4098
};
4099
4100
/* ENCODE_DIRECT: this character should be encoded as itself.  The
4101
 * answer depends on whether we are encoding set O as itself, and also
4102
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
4103
 * clear that the answers to these questions vary between
4104
 * applications, so this code needs to be flexible.  */
4105
4106
#define ENCODE_DIRECT(c, directO, directWS)             \
4107
    ((c) < 128 && 
(c) > 0319k
&& \
4108
     
(319k
(utf7_category[(c)] == 0)319k
|| \
4109
      
(13.7k
directWS13.7k
&&
(utf7_category[(c)] == 2)13.7k
) || \
4110
      
(352
directO352
&&
(utf7_category[(c)] == 1)352
)))
4111
4112
PyObject *
4113
PyUnicode_DecodeUTF7(const char *s,
4114
                     Py_ssize_t size,
4115
                     const char *errors)
4116
{
4117
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4118
}
4119
4120
/* The decoder.  The only state we preserve is our read position,
4121
 * i.e. how many characters we have consumed.  So if we end in the
4122
 * middle of a shift sequence we have to back off the read position
4123
 * and the output to the beginning of the sequence, otherwise we lose
4124
 * all the shift state (seen bits, number of bits seen, high
4125
 * surrogate). */
4126
4127
PyObject *
4128
PyUnicode_DecodeUTF7Stateful(const char *s,
4129
                             Py_ssize_t size,
4130
                             const char *errors,
4131
                             Py_ssize_t *consumed)
4132
{
4133
    const char *starts = s;
4134
    Py_ssize_t startinpos;
4135
    Py_ssize_t endinpos;
4136
    const char *e;
4137
    _PyUnicodeWriter writer;
4138
    const char *errmsg = "";
4139
    int inShift = 0;
4140
    Py_ssize_t shiftOutStart;
4141
    unsigned int base64bits = 0;
4142
    unsigned long base64buffer = 0;
4143
    Py_UCS4 surrogate = 0;
4144
    PyObject *errorHandler = NULL;
4145
    PyObject *exc = NULL;
4146
4147
    if (size == 0) {
  Branch (4147:9): [True: 18, False: 5.64k]
4148
        if (consumed)
  Branch (4148:13): [True: 8, False: 10]
4149
            *consumed = 0;
4150
        _Py_RETURN_UNICODE_EMPTY();
4151
    }
4152
4153
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4154
    _PyUnicodeWriter_Init(&writer);
4155
    writer.min_length = size;
4156
4157
    shiftOutStart = 0;
4158
    e = s + size;
4159
4160
    while (s < e) {
  Branch (4160:12): [True: 351k, False: 5.61k]
4161
        Py_UCS4 ch;
4162
      restart:
4163
        ch = (unsigned char) *s;
4164
4165
        if (inShift) { /* in a base-64 section */
  Branch (4165:13): [True: 53.2k, False: 298k]
4166
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4167
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4168
                base64bits += 6;
4169
                s++;
4170
                if (base64bits >= 16) {
  Branch (4170:21): [True: 18.5k, False: 32.0k]
4171
                    /* we have enough bits for a UTF-16 value */
4172
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4173
                    base64bits -= 16;
4174
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4175
                    assert(outCh <= 0xffff);
4176
                    if (surrogate) {
  Branch (4176:25): [True: 17, False: 18.5k]
4177
                        /* expecting a second surrogate */
4178
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
  Branch (4178:29): [True: 16, False: 1]
4179
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4180
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
  Branch (4180:33): [True: 0, False: 16]
4181
                                goto onError;
4182
                            surrogate = 0;
4183
                            continue;
4184
                        }
4185
                        else {
4186
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
  Branch (4186:33): [True: 0, False: 1]
4187
                                goto onError;
4188
                            surrogate = 0;
4189
                        }
4190
                    }
4191
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
  Branch (4191:25): [True: 52, False: 18.4k]
4192
                        /* first surrogate */
4193
                        surrogate = outCh;
4194
                    }
4195
                    else {
4196
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
  Branch (4196:29): [True: 0, False: 18.4k]
4197
                            goto onError;
4198
                    }
4199
                }
4200
            }
4201
            else { /* now leaving a base-64 section */
4202
                inShift = 0;
4203
                if (base64bits > 0) { /* left-over bits */
  Branch (4203:21): [True: 2.61k, False: 10]
4204
                    if (base64bits >= 6) {
  Branch (4204:25): [True: 28, False: 2.58k]
4205
                        /* We've seen at least one base-64 character */
4206
                        s++;
4207
                        errmsg = "partial character in shift sequence";
4208
                        goto utf7Error;
4209
                    }
4210
                    else {
4211
                        /* Some bits remain; they should be zero */
4212
                        if (base64buffer != 0) {
  Branch (4212:29): [True: 8, False: 2.57k]
4213
                            s++;
4214
                            errmsg = "non-zero padding bits in shift sequence";
4215
                            goto utf7Error;
4216
                        }
4217
                    }
4218
                }
4219
                if (surrogate && 
DECODE_DIRECT12
(ch)) {
  Branch (4219:21): [True: 12, False: 2.57k]
4220
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
  Branch (4220:25): [True: 0, False: 9]
4221
                        goto onError;
4222
                }
4223
                surrogate = 0;
4224
                if (ch == '-') {
  Branch (4224:21): [True: 2.57k, False: 11]
4225
                    /* '-' is absorbed; other terminating
4226
                       characters are preserved */
4227
                    s++;
4228
                }
4229
            }
4230
        }
4231
        else if ( ch == '+' ) {
  Branch (4231:19): [True: 2.89k, False: 295k]
4232
            startinpos = s-starts;
4233
            s++; /* consume '+' */
4234
            if (s < e && 
*s == '-'2.83k
) { /* '+-' encodes '+' */
  Branch (4234:17): [True: 2.83k, False: 67]
  Branch (4234:26): [True: 7, False: 2.82k]
4235
                s++;
4236
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
  Branch (4236:21): [True: 0, False: 7]
4237
                    goto onError;
4238
            }
4239
            else if (s < e && 
!2.82k
IS_BASE642.82k
(*s)) {
  Branch (4239:22): [True: 2.82k, False: 67]
4240
                s++;
4241
                errmsg = "ill-formed sequence";
4242
                goto utf7Error;
4243
            }
4244
            else { /* begin base64-encoded section */
4245
                inShift = 1;
4246
                surrogate = 0;
4247
                shiftOutStart = writer.pos;
4248
                base64bits = 0;
4249
                base64buffer = 0;
4250
            }
4251
        }
4252
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4253
            s++;
4254
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  Branch (4254:17): [True: 0, False: 294k]
4255
                goto onError;
4256
        }
4257
        else {
4258
            startinpos = s-starts;
4259
            s++;
4260
            errmsg = "unexpected special character";
4261
            goto utf7Error;
4262
        }
4263
        continue;
4264
utf7Error:
4265
        endinpos = s-starts;
4266
        if (unicode_decode_call_errorhandler_writer(
  Branch (4266:13): [True: 27, False: 285]
4267
                errors, &errorHandler,
4268
                "utf7", errmsg,
4269
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4270
                &writer))
4271
            goto onError;
4272
    }
4273
4274
    /* end of string */
4275
4276
    if (inShift && 
!consumed261
) { /* in shift sequence, no more to follow */
  Branch (4276:9): [True: 261, False: 5.35k]
  Branch (4276:20): [True: 15, False: 246]
4277
        /* if we're in an inconsistent state, that's an error */
4278
        inShift = 0;
4279
        if (surrogate ||
  Branch (4279:13): [True: 2, False: 13]
4280
                
(base64bits >= 6)13
||
  Branch (4280:17): [True: 6, False: 7]
4281
                
(7
base64bits > 07
&&
base64buffer != 06
)) {
  Branch (4281:18): [True: 6, False: 1]
  Branch (4281:36): [True: 4, False: 2]
4282
            endinpos = size;
4283
            if (unicode_decode_call_errorhandler_writer(
  Branch (4283:17): [True: 5, False: 7]
4284
                    errors, &errorHandler,
4285
                    "utf7", "unterminated shift sequence",
4286
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4287
                    &writer))
4288
                goto onError;
4289
            if (s < e)
  Branch (4289:17): [True: 0, False: 7]
4290
                goto restart;
4291
        }
4292
    }
4293
4294
    /* return state */
4295
    if (consumed) {
  Branch (4295:9): [True: 4.42k, False: 1.18k]
4296
        if (inShift) {
  Branch (4296:13): [True: 246, False: 4.18k]
4297
            *consumed = startinpos;
4298
            if (writer.pos != shiftOutStart && 
writer.maxchar > 12777
) {
  Branch (4298:17): [True: 77, False: 169]
  Branch (4298:48): [True: 72, False: 5]
4299
                PyObject *result = PyUnicode_FromKindAndData(
4300
                        writer.kind, writer.data, shiftOutStart);
4301
                Py_XDECREF(errorHandler);
4302
                Py_XDECREF(exc);
4303
                _PyUnicodeWriter_Dealloc(&writer);
4304
                return result;
4305
            }
4306
            writer.pos = shiftOutStart; /* back off output */
4307
        }
4308
        else {
4309
            *consumed = s-starts;
4310
        }
4311
    }
4312
4313
    Py_XDECREF(errorHandler);
4314
    Py_XDECREF(exc);
4315
    return _PyUnicodeWriter_Finish(&writer);
4316
4317
  onError:
4318
    Py_XDECREF(errorHandler);
4319
    Py_XDECREF(exc);
4320
    _PyUnicodeWriter_Dealloc(&writer);
4321
    return NULL;
4322
}
4323
4324
4325
PyObject *
4326
_PyUnicode_EncodeUTF7(PyObject *str,
4327
                      int base64SetO,
4328
                      int base64WhiteSpace,
4329
                      const char *errors)
4330
{
4331
    int kind;
4332
    const void *data;
4333
    Py_ssize_t len;
4334
    PyObject *v;
4335
    int inShift = 0;
4336
    Py_ssize_t i;
4337
    unsigned int base64bits = 0;
4338
    unsigned long base64buffer = 0;
4339
    char * out;
4340
    const char * start;
4341
4342
    kind = PyUnicode_KIND(str);
4343
    data = PyUnicode_DATA(str);
4344
    len = PyUnicode_GET_LENGTH(str);
4345
4346
    if (len == 0)
  Branch (4346:9): [True: 20, False: 1.88k]
4347
        return PyBytes_FromStringAndSize(NULL, 0);
4348
4349
    /* It might be possible to tighten this worst case */
4350
    if (len > PY_SSIZE_T_MAX / 8)
  Branch (4350:9): [True: 0, False: 1.88k]
4351
        return PyErr_NoMemory();
4352
    v = PyBytes_FromStringAndSize(NULL, len * 8);
4353
    if (v == NULL)
  Branch (4353:9): [True: 0, False: 1.88k]
4354
        return NULL;
4355
4356
    start = out = PyBytes_AS_STRING(v);
4357
    for (i = 0; i < len; 
++i349k
) {
  Branch (4357:17): [True: 349k, False: 1.88k]
4358
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4359
4360
        if (inShift) {
  Branch (4360:13): [True: 29.6k, False: 319k]
4361
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4362
                /* shifting out */
4363
                if (base64bits) { /* output remaining bits */
  Branch (4363:21): [True: 13.6k, False: 3]
4364
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4365
                    base64buffer = 0;
4366
                    base64bits = 0;
4367
                }
4368
                inShift = 0;
4369
                /* Characters not in the BASE64 set implicitly unshift the sequence
4370
                   so no '-' is required, except if the character is itself a '-' */
4371
                if (IS_BASE64(ch) || 
ch == '-'11
) {
  Branch (4371:38): [True: 1, False: 10]
4372
                    *out++ = '-';
4373
                }
4374
                *out++ = (char) ch;
4375
            }
4376
            else {
4377
                goto encode_char;
4378
            }
4379
        }
4380
        else { /* not in a shift sequence */
4381
            if (ch == '+') {
  Branch (4381:17): [True: 13, False: 319k]
4382
                *out++ = '+';
4383
                        *out++ = '-';
4384
            }
4385
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4386
                *out++ = (char) ch;
4387
            }
4388
            else {
4389
                *out++ = '+';
4390
                inShift = 1;
4391
                goto encode_char;
4392
            }
4393
        }
4394
        continue;
4395
encode_char:
4396
        if (ch >= 0x10000) {
  Branch (4396:13): [True: 9, False: 30.6k]
4397
            assert(ch <= MAX_UNICODE);
4398
4399
            /* code first surrogate */
4400
            base64bits += 16;
4401
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4402
            while (base64bits >= 6) {
  Branch (4402:20): [True: 21, False: 9]
4403
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4404
                base64bits -= 6;
4405
            }
4406
            /* prepare second surrogate */
4407
            ch = Py_UNICODE_LOW_SURROGATE(ch);
4408
        }
4409
        base64bits += 16;
4410
        base64buffer = (base64buffer << 16) | ch;
4411
        while (base64bits >= 6) {
  Branch (4411:16): [True: 71.9k, False: 30.6k]
4412
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4413
            base64bits -= 6;
4414
        }
4415
    }
4416
    if (base64bits)
  Branch (4416:9): [True: 961, False: 926]
4417
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4418
    if (inShift)
  Branch (4418:9): [True: 970, False: 917]
4419
        *out++ = '-';
4420
    if (_PyBytes_Resize(&v, out - start) < 0)
  Branch (4420:9): [True: 0, False: 1.88k]
4421
        return NULL;
4422
    return v;
4423
}
4424
4425
#undef IS_BASE64
4426
#undef FROM_BASE64
4427
#undef TO_BASE64
4428
#undef DECODE_DIRECT
4429
#undef ENCODE_DIRECT
4430
4431
/* --- UTF-8 Codec -------------------------------------------------------- */
4432
4433
PyObject *
4434
PyUnicode_DecodeUTF8(const char *s,
4435
                     Py_ssize_t size,
4436
                     const char *errors)
4437
{
4438
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4439
}
4440
4441
#include "stringlib/asciilib.h"
4442
#include "stringlib/codecs.h"
4443
#include "stringlib/undef.h"
4444
4445
#include "stringlib/ucs1lib.h"
4446
#include "stringlib/codecs.h"
4447
#include "stringlib/undef.h"
4448
4449
#include "stringlib/ucs2lib.h"
4450
#include "stringlib/codecs.h"
4451
#include "stringlib/undef.h"
4452
4453
#include "stringlib/ucs4lib.h"
4454
#include "stringlib/codecs.h"
4455
#include "stringlib/undef.h"
4456
4457
/* Mask to quickly check whether a C 'size_t' contains a
4458
   non-ASCII, UTF8-encoded char. */
4459
#if (SIZEOF_SIZE_T == 8)
4460
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4461
#elif (SIZEOF_SIZE_T == 4)
4462
# define ASCII_CHAR_MASK 0x80808080U
4463
#else
4464
# error C 'size_t' size should be either 4 or 8!
4465
#endif
4466
4467
static Py_ssize_t
4468
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4469
{
4470
    const char *p = start;
4471
4472
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4473
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
4474
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4475
        /* Fast path, see in STRINGLIB(utf8_decode) for
4476
           an explanation. */
4477
        /* Help allocation */
4478
        const char *_p = p;
4479
        Py_UCS1 * q = dest;
4480
        while (_p + SIZEOF_SIZE_T <= end) {
  Branch (4480:16): [True: 37.3M, False: 8.84M]
4481
            size_t value = *(const size_t *) _p;
4482
            if (value & ASCII_CHAR_MASK)
  Branch (4482:17): [True: 122k, False: 37.2M]
4483
                break;
4484
            *((size_t *)q) = value;
4485
            _p += SIZEOF_SIZE_T;
4486
            q += SIZEOF_SIZE_T;
4487
        }
4488
        p = _p;
4489
        while (p < end) {
  Branch (4489:16): [True: 32.5M, False: 8.71M]
4490
            if ((unsigned char)*p & 0x80)
  Branch (4490:17): [True: 253k, False: 32.3M]
4491
                break;
4492
            *q++ = *p++;
4493
        }
4494
        return p - start;
4495
    }
4496
#endif
4497
    
while (14.2M
p < end) {
  Branch (4497:12): [True: 86.1M, False: 11.5M]
4498
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4499
           for an explanation. */
4500
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4501
            /* Help allocation */
4502
            const char *_p = p;
4503
            while (_p + SIZEOF_SIZE_T <= end) {
  Branch (4503:20): [True: 8.10M, False: 12.6M]
4504
                size_t value = *(const size_t *) _p;
4505
                if (value & ASCII_CHAR_MASK)
  Branch (4505:21): [True: 684, False: 8.10M]
4506
                    break;
4507
                _p += SIZEOF_SIZE_T;
4508
            }
4509
            p = _p;
4510
            if (_p == end)
  Branch (4510:17): [True: 2.74M, False: 9.94M]
4511
                break;
4512
        }
4513
        if ((unsigned char)*p & 0x80)
  Branch (4513:13): [True: 9.21k, False: 83.3M]
4514
            break;
4515
        ++p;
4516
    }
4517
    memcpy(dest, start, p - start);
4518
    return p - start;
4519
}
4520
4521
static PyObject *
4522
unicode_decode_utf8(const char *s, Py_ssize_t size,
4523
                    _Py_error_handler error_handler, const char *errors,
4524
                    Py_ssize_t *consumed)
4525
{
4526
    if (size == 0) {
  Branch (4526:9): [True: 1.21M, False: 27.9M]
4527
        if (consumed)
  Branch (4527:13): [True: 38, False: 1.21M]
4528
            *consumed = 0;
4529
        _Py_RETURN_UNICODE_EMPTY();
4530
    }
4531
4532
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4533
    if (size == 1 && 
(unsigned char)s[0] < 1284.88M
) {
  Branch (4533:9): [True: 4.88M, False: 23.0M]
  Branch (4533:22): [True: 4.86M, False: 16.3k]
4534
        if (consumed) {
  Branch (4534:13): [True: 6.97k, False: 4.85M]
4535
            *consumed = 1;
4536
        }
4537
        return get_latin1_char((unsigned char)s[0]);
4538
    }
4539
4540
    const char *starts = s;
4541
    const char *end = s + size;
4542
4543
    // fast path: try ASCII string.
4544
    PyObject *u = PyUnicode_New(size, 127);
4545
    if (u == NULL) {
  Branch (4545:9): [True: 0, False: 23.1M]
4546
        return NULL;
4547
    }
4548
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4549
    if (s == end) {
  Branch (4549:9): [True: 22.8M, False: 262k]
4550
        return u;
4551
    }
4552
4553
    // Use _PyUnicodeWriter after fast path is failed.
4554
    _PyUnicodeWriter writer;
4555
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
4556
    writer.pos = s - starts;
4557
4558
    Py_ssize_t startinpos, endinpos;
4559
    const char *errmsg = "";
4560
    PyObject *error_handler_obj = NULL;
4561
    PyObject *exc = NULL;
4562
4563
    while (s < end) {
  Branch (4563:12): [True: 442k, False: 49.6k]
4564
        Py_UCS4 ch;
4565
        int kind = writer.kind;
4566
4567
        if (kind == PyUnicode_1BYTE_KIND) {
  Branch (4567:13): [True: 307k, False: 135k]
4568
            if (PyUnicode_IS_ASCII(writer.buffer))
4569
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4570
            else
4571
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4572
        } else 
if (135k
kind == PyUnicode_2BYTE_KIND135k
) {
  Branch (4572:20): [True: 134k, False: 998]
4573
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4574
        } else {
4575
            assert(kind == PyUnicode_4BYTE_KIND);
4576
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4577
        }
4578
4579
        switch (ch) {
4580
        case 0:
  Branch (4580:9): [True: 216k, False: 226k]
4581
            if (s == end || 
consumed126k
)
  Branch (4581:17): [True: 89.8k, False: 126k]
  Branch (4581:29): [True: 121k, False: 4.20k]
4582
                goto End;
4583
            errmsg = "unexpected end of data";
4584
            startinpos = s - starts;
4585
            endinpos = end - starts;
4586
            break;
4587
        case 1:
  Branch (4587:9): [True: 2.14k, False: 440k]
4588
            errmsg = "invalid start byte";
4589
            startinpos = s - starts;
4590
            endinpos = startinpos + 1;
4591
            break;
4592
        case 2:
  Branch (4592:9): [True: 6.30k, False: 436k]
4593
            if (consumed && 
(unsigned char)s[0] == 0xED49
&&
end - s == 215
  Branch (4593:17): [True: 49, False: 6.25k]
  Branch (4593:29): [True: 15, False: 34]
  Branch (4593:60): [True: 4, False: 11]
4594
                && 
(unsigned char)s[1] >= 0xA04
&&
(unsigned char)s[1] <= 0xBF4
)
  Branch (4594:20): [True: 4, False: 0]
  Branch (4594:51): [True: 4, False: 0]
4595
            {
4596
                /* Truncated surrogate code in range D800-DFFF */
4597
                goto End;
4598
            }
4599
            /* fall through */
4600
        case 3:
  Branch (4600:9): [True: 497, False: 441k]
4601
        case 4:
  Branch (4601:9): [True: 369, False: 442k]
4602
            errmsg = "invalid continuation byte";
4603
            startinpos = s - starts;
4604
            endinpos = startinpos + ch - 1;
4605
            break;
4606
        default:
  Branch (4606:9): [True: 217k, False: 225k]
4607
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  Branch (4607:17): [True: 0, False: 217k]
4608
                goto onError;
4609
            continue;
4610
        }
4611
4612
        if (error_handler == _Py_ERROR_UNKNOWN)
  Branch (4612:13): [True: 7.04k, False: 6.48k]
4613
            error_handler = _Py_GetErrorHandler(errors);
4614
4615
        switch (error_handler) {
4616
        case _Py_ERROR_IGNORE:
  Branch (4616:9): [True: 894, False: 12.6k]
4617
            s += (endinpos - startinpos);
4618
            break;
4619
4620
        case _Py_ERROR_REPLACE:
  Branch (4620:9): [True: 1.28k, False: 12.2k]
4621
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
  Branch (4621:17): [True: 0, False: 1.28k]
4622
                goto onError;
4623
            s += (endinpos - startinpos);
4624
            break;
4625
4626
        case _Py_ERROR_SURROGATEESCAPE:
  Branch (4626:9): [True: 9.44k, False: 4.07k]
4627
        {
4628
            Py_ssize_t i;
4629
4630
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
  Branch (4630:17): [True: 0, False: 9.44k]
4631
                goto onError;
4632
            
for (i=startinpos; 9.44k
i<endinpos;
i++9.44k
) {
  Branch (4632:32): [True: 9.44k, False: 9.44k]
4633
                ch = (Py_UCS4)(unsigned char)(starts[i]);
4634
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4635
                                ch + 0xdc00);
4636
                writer.pos++;
4637
            }
4638
            s += (endinpos - startinpos);
4639
            break;
4640
        }
4641
4642
        default:
  Branch (4642:9): [True: 1.89k, False: 11.6k]
4643
            if (unicode_decode_call_errorhandler_writer(
  Branch (4643:17): [True: 1.28k, False: 611]
4644
                    errors, &error_handler_obj,
4645
                    "utf-8", errmsg,
4646
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4647
                    &writer))
4648
                goto onError;
4649
        }
4650
    }
4651
4652
End:
4653
    if (consumed)
  Branch (4653:9): [True: 199k, False: 61.8k]
4654
        *consumed = s - starts;
4655
4656
    Py_XDECREF(error_handler_obj);
4657
    Py_XDECREF(exc);
4658
    return _PyUnicodeWriter_Finish(&writer);
4659
4660
onError:
4661
    Py_XDECREF(error_handler_obj);
4662
    Py_XDECREF(exc);
4663
    _PyUnicodeWriter_Dealloc(&writer);
4664
    return NULL;
4665
}
4666
4667
4668
PyObject *
4669
PyUnicode_DecodeUTF8Stateful(const char *s,
4670
                             Py_ssize_t size,
4671
                             const char *errors,
4672
                             Py_ssize_t *consumed)
4673
{
4674
    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
4675
}
4676
4677
4678
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4679
   non-zero, use strict error handler otherwise.
4680
4681
   On success, write a pointer to a newly allocated wide character string into
4682
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
4683
   (in number of wchar_t units) into *wlen (if wlen is set).
4684
4685
   On memory allocation failure, return -1.
4686
4687
   On decoding error (if surrogateescape is zero), return -2. If wlen is
4688
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4689
   is not NULL, write the decoding error message into *reason. */
4690
int
4691
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4692
                 const char **reason, _Py_error_handler errors)
4693
{
4694
    const char *orig_s = s;
4695
    const char *e;
4696
    wchar_t *unicode;
4697
    Py_ssize_t outpos;
4698
4699
    int surrogateescape = 0;
4700
    int surrogatepass = 0;
4701
    switch (errors)
4702
    {
4703
    case _Py_ERROR_STRICT:
  Branch (4703:5): [True: 0, False: 1.97k]
4704
        break;
4705
    case _Py_ERROR_SURROGATEESCAPE:
  Branch (4705:5): [True: 1.97k, False: 0]
4706
        surrogateescape = 1;
4707
        break;
4708
    case _Py_ERROR_SURROGATEPASS:
  Branch (4708:5): [True: 0, False: 1.97k]
4709
        surrogatepass = 1;
4710
        break;
4711
    default:
  Branch (4711:5): [True: 0, False: 1.97k]
4712
        return -3;
4713
    }
4714
4715
    /* Note: size will always be longer than the resulting Unicode
4716
       character count */
4717
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
  Branch (4717:9): [True: 0, False: 1.97k]
4718
        return -1;
4719
    }
4720
4721
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4722
    if (!unicode) {
  Branch (4722:9): [True: 0, False: 1.97k]
4723
        return -1;
4724
    }
4725
4726
    /* Unpack UTF-8 encoded data */
4727
    e = s + size;
4728
    outpos = 0;
4729
    while (s < e) {
  Branch (4729:12): [True: 1.97k, False: 0]
4730
        Py_UCS4 ch;
4731
#if SIZEOF_WCHAR_T == 4
4732
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4733
#else
4734
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4735
#endif
4736
        if (ch > 0xFF) {
  Branch (4736:13): [True: 0, False: 1.97k]
4737
#if SIZEOF_WCHAR_T == 4
4738
            Py_UNREACHABLE();
4739
#else
4740
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
4741
            /* write a surrogate pair */
4742
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4743
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4744
#endif
4745
        }
4746
        else {
4747
            if (!ch && s == e) {
  Branch (4747:17): [True: 1.97k, False: 0]
  Branch (4747:24): [True: 1.97k, False: 0]
4748
                break;
4749
            }
4750
4751
            if (surrogateescape) {
  Branch (4751:17): [True: 0, False: 0]
4752
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4753
            }
4754
            else {
4755
                /* Is it a valid three-byte code? */
4756
                if (surrogatepass
  Branch (4756:21): [True: 0, False: 0]
4757
                    && (e - s) >= 3
  Branch (4757:24): [True: 0, False: 0]
4758
                    && (s[0] & 0xf0) == 0xe0
  Branch (4758:24): [True: 0, False: 0]
4759
                    && (s[1] & 0xc0) == 0x80
  Branch (4759:24): [True: 0, False: 0]
4760
                    && (s[2] & 0xc0) == 0x80)
  Branch (4760:24): [True: 0, False: 0]
4761
                {
4762
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4763
                    s += 3;
4764
                    unicode[outpos++] = ch;
4765
                }
4766
                else {
4767
                    PyMem_RawFree(unicode );
4768
                    if (reason != NULL) {
  Branch (4768:25): [True: 0, False: 0]
4769
                        switch (ch) {
4770
                        case 0:
  Branch (4770:25): [True: 0, False: 0]
4771
                            *reason = "unexpected end of data";
4772
                            break;
4773
                        case 1:
  Branch (4773:25): [True: 0, False: 0]
4774
                            *reason = "invalid start byte";
4775
                            break;
4776
                        /* 2, 3, 4 */
4777
                        default:
  Branch (4777:25): [True: 0, False: 0]
4778
                            *reason = "invalid continuation byte";
4779
                            break;
4780
                        }
4781
                    }
4782
                    if (wlen != NULL) {
  Branch (4782:25): [True: 0, False: 0]
4783
                        *wlen = s - orig_s;
4784
                    }
4785
                    return -2;
4786
                }
4787
            }
4788
        }
4789
    }
4790
    unicode[outpos] = L'\0';
4791
    if (wlen) {
  Branch (4791:9): [True: 1.97k, False: 0]
4792
        *wlen = outpos;
4793
    }
4794
    *wstr = unicode;
4795
    return 0;
4796
}
4797
4798
4799
wchar_t*
4800
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
4801
                               size_t *wlen)
4802
{
4803
    wchar_t *wstr;
4804
    int res = _Py_DecodeUTF8Ex(arg, arglen,
4805
                               &wstr, wlen,
4806
                               NULL, _Py_ERROR_SURROGATEESCAPE);
4807
    if (res != 0) {
  Branch (4807:9): [True: 0, False: 274]
4808
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
4809
        assert(res != -3);
4810
        if (wlen) {
  Branch (4810:13): [True: 0, False: 0]
4811
            *wlen = (size_t)res;
4812
        }
4813
        return NULL;
4814
    }
4815
    return wstr;
4816
}
4817
4818
4819
/* UTF-8 encoder with a selectable error handler (strict, surrogateescape or surrogatepass).
4820
4821
   On success, return 0 and write the newly allocated character string (use
4822
   PyMem_RawFree() if raw_malloc is non-zero, else PyMem_Free(), to free the memory) into *str.
4823
4824
   On encoding failure, return -2 and write the position of the invalid
4825
   surrogate character into *error_pos (if error_pos is set) and the encoding
4826
   error message into *reason (if reason is set).
4827
4828
   On memory allocation failure, return -1.  Return -3 if the error handler is not supported. */
4829
int
4830
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
4831
                 const char **reason, int raw_malloc, _Py_error_handler errors)
4832
{
4833
    const Py_ssize_t max_char_size = 4;
4834
    Py_ssize_t len = wcslen(text);
4835
4836
    assert(len >= 0);
4837
4838
    int surrogateescape = 0;
4839
    int surrogatepass = 0;
4840
    switch (errors)
4841
    {
4842
    case _Py_ERROR_STRICT:
  Branch (4842:5): [True: 1.11k, False: 167]
4843
        break;
4844
    case _Py_ERROR_SURROGATEESCAPE:
  Branch (4844:5): [True: 167, False: 1.11k]
4845
        surrogateescape = 1;
4846
        break;
4847
    case _Py_ERROR_SURROGATEPASS:
  Branch (4847:5): [True: 0, False: 1.27k]
4848
        surrogatepass = 1;
4849
        break;
4850
    default:
  Branch (4850:5): [True: 0, False: 1.27k]
4851
        return -3;
4852
    }
4853
4854
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
  Branch (4854:9): [True: 0, False: 1.27k]
4855
        return -1;
4856
    }
4857
    char *bytes;
4858
    if (raw_malloc) {
  Branch (4858:9): [True: 1.27k, False: 0]
4859
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
4860
    }
4861
    else {
4862
        bytes = PyMem_Malloc((len + 1) * max_char_size);
4863
    }
4864
    if (bytes == NULL) {
  Branch (4864:9): [True: 0, False: 1.27k]
4865
        return -1;
4866
    }
4867
4868
    char *p = bytes;
4869
    Py_ssize_t i;
4870
    for (i = 0; i < len; ) {
  Branch (4870:17): [True: 16.7k, False: 1.27k]
4871
        Py_ssize_t ch_pos = i;
4872
        Py_UCS4 ch = text[i];
4873
        i++;
4874
#if Py_UNICODE_SIZE == 2
4875
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
4876
            && i < len
4877
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
4878
        {
4879
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
4880
            i++;
4881
        }
4882
#endif
4883
4884
        if (ch < 0x80) {
  Branch (4884:13): [True: 16.7k, False: 0]
4885
            /* Encode ASCII */
4886
            *p++ = (char) ch;
4887
4888
        }
4889
        else if (ch < 0x0800) {
  Branch (4889:18): [True: 0, False: 0]
4890
            /* Encode as a two-byte sequence (U+0080..U+07FF) */
4891
            *p++ = (char)(0xc0 | (ch >> 6));
4892
            *p++ = (char)(0x80 | (ch & 0x3f));
4893
        }
4894
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
  Branch (4894:18): [True: 0, False: 0]
  Branch (4894:49): [True: 0, False: 0]
4895
            /* surrogateescape error handler */
4896
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
  Branch (4896:17): [True: 0, False: 0]
  Branch (4896:39): [True: 0, False: 0]
  Branch (4896:55): [True: 0, False: 0]
4897
                if (error_pos != NULL) {
  Branch (4897:21): [True: 0, False: 0]
4898
                    *error_pos = (size_t)ch_pos;
4899
                }
4900
                if (reason != NULL) {
  Branch (4900:21): [True: 0, False: 0]
4901
                    *reason = "encoding error";
4902
                }
4903
                if (raw_malloc) {
  Branch (4903:21): [True: 0, False: 0]
4904
                    PyMem_RawFree(bytes);
4905
                }
4906
                else {
4907
                    PyMem_Free(bytes);
4908
                }
4909
                return -2;
4910
            }
4911
            *p++ = (char)(ch & 0xff);
4912
        }
4913
        else if (ch < 0x10000) {
  Branch (4913:18): [True: 0, False: 0]
4914
            *p++ = (char)(0xe0 | (ch >> 12));
4915
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4916
            *p++ = (char)(0x80 | (ch & 0x3f));
4917
        }
4918
        else {  /* ch >= 0x10000 */
4919
            assert(ch <= MAX_UNICODE);
4920
            /* Encode UCS4 Unicode ordinals */
4921
            *p++ = (char)(0xf0 | (ch >> 18));
4922
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4923
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4924
            *p++ = (char)(0x80 | (ch & 0x3f));
4925
        }
4926
    }
4927
    *p++ = '\0';
4928
4929
    size_t final_size = (p - bytes);
4930
    char *bytes2;
4931
    if (raw_malloc) {
  Branch (4931:9): [True: 1.27k, False: 0]
4932
        bytes2 = PyMem_RawRealloc(bytes, final_size);
4933
    }
4934
    else {
4935
        bytes2 = PyMem_Realloc(bytes, final_size);
4936
    }
4937
    if (bytes2 == NULL) {
  Branch (4937:9): [True: 0, False: 1.27k]
4938
        if (error_pos != NULL) {
  Branch (4938:13): [True: 0, False: 0]
4939
            *error_pos = (size_t)-1;
4940
        }
4941
        if (raw_malloc) {
  Branch (4941:13): [True: 0, False: 0]
4942
            PyMem_RawFree(bytes);
4943
        }
4944
        else {
4945
            PyMem_Free(bytes);
4946
        }
4947
        return -1;
4948
    }
4949
    *str = bytes2;
4950
    return 0;
4951
}
4952
4953
4954
/* Primary internal function which creates utf8 encoded bytes objects.
4955
4956
   Allocation strategy:  if the string is short, convert into a stack buffer
4957
   and allocate exactly as much space as needed at the end.  Else allocate the
4958
   maximum possible needed (4 result bytes per Unicode character), and return
4959
   the excess memory at the end.
4960
*/
4961
static PyObject *
4962
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
4963
                    const char *errors)
4964
{
4965
    if (!PyUnicode_Check(unicode)) {
  Branch (4965:9): [True: 0, False: 1.87M]
4966
        PyErr_BadArgument();
4967
        return NULL;
4968
    }
4969
4970
    if (PyUnicode_UTF8(unicode))
4971
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4972
                                         PyUnicode_UTF8_LENGTH(unicode));
4973
4974
    int kind = PyUnicode_KIND(unicode);
4975
    const void *data = PyUnicode_DATA(unicode);
4976
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
4977
4978
    _PyBytesWriter writer;
4979
    char *end;
4980
4981
    switch (kind) {
4982
    default:
  Branch (4982:5): [True: 0, False: 230k]
4983
        Py_UNREACHABLE();
4984
    case PyUnicode_1BYTE_KIND:
  Branch (4984:5): [True: 73.8k, False: 156k]
4985
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4986
        assert(!PyUnicode_IS_ASCII(unicode));
4987
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4988
        break;
4989
    case PyUnicode_2BYTE_KIND:
  Branch (4989:5): [True: 154k, False: 75.1k]
4990
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4991
        break;
4992
    case PyUnicode_4BYTE_KIND:
  Branch (4992:5): [True: 1.27k, False: 228k]
4993
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4994
        break;
4995
    }
4996
4997
    if (end == NULL) {
  Branch (4997:9): [True: 313, False: 229k]
4998
        _PyBytesWriter_Dealloc(&writer);
4999
        return NULL;
5000
    }
5001
    return _PyBytesWriter_Finish(&writer, end);
5002
}
5003
5004
static int
5005
unicode_fill_utf8(PyObject *unicode)
5006
{
5007
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5008
    assert(!PyUnicode_IS_ASCII(unicode));
5009
5010
    int kind = PyUnicode_KIND(unicode);
5011
    const void *data = PyUnicode_DATA(unicode);
5012
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5013
5014
    _PyBytesWriter writer;
5015
    char *end;
5016
5017
    switch (kind) {
5018
    default:
  Branch (5018:5): [True: 0, False: 3.26k]
5019
        Py_UNREACHABLE();
5020
    case PyUnicode_1BYTE_KIND:
  Branch (5020:5): [True: 2.88k, False: 380]
5021
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5022
                                   _Py_ERROR_STRICT, NULL);
5023
        break;
5024
    case PyUnicode_2BYTE_KIND:
  Branch (5024:5): [True: 315, False: 2.95k]
5025
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5026
                                   _Py_ERROR_STRICT, NULL);
5027
        break;
5028
    case PyUnicode_4BYTE_KIND:
  Branch (5028:5): [True: 65, False: 3.20k]
5029
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5030
                                   _Py_ERROR_STRICT, NULL);
5031
        break;
5032
    }
5033
    if (end == NULL) {
  Branch (5033:9): [True: 69, False: 3.19k]
5034
        _PyBytesWriter_Dealloc(&writer);
5035
        return -1;
5036
    }
5037
5038
    const char *start = writer.use_small_buffer ? 
writer.small_buffer2.91k
:
  Branch (5038:25): [True: 2.91k, False: 284]
5039
                    
PyBytes_AS_STRING284
(writer.buffer);
5040
    Py_ssize_t len = end - start;
5041
5042
    char *cache = PyObject_Malloc(len + 1);
5043
    if (cache == NULL) {
  Branch (5043:9): [True: 0, False: 3.19k]
5044
        _PyBytesWriter_Dealloc(&writer);
5045
        PyErr_NoMemory();
5046
        return -1;
5047
    }
5048
    _PyUnicode_UTF8(unicode) = cache;
5049
    _PyUnicode_UTF8_LENGTH(unicode) = len;
5050
    memcpy(cache, start, len);
5051
    cache[len] = '\0';
5052
    _PyBytesWriter_Dealloc(&writer);
5053
    return 0;
5054
}
5055
5056
PyObject *
5057
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5058
{
5059
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5060
}
5061
5062
5063
PyObject *
5064
PyUnicode_AsUTF8String(PyObject *unicode)
5065
{
5066
    return _PyUnicode_AsUTF8String(unicode, NULL);
5067
}
5068
5069
/* --- UTF-32 Codec ------------------------------------------------------- */
5070
5071
PyObject *
5072
PyUnicode_DecodeUTF32(const char *s,
5073
                      Py_ssize_t size,
5074
                      const char *errors,
5075
                      int *byteorder)
5076
{
5077
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5078
}
5079
5080
PyObject *
5081
PyUnicode_DecodeUTF32Stateful(const char *s,
5082
                              Py_ssize_t size,
5083
                              const char *errors,
5084
                              int *byteorder,
5085
                              Py_ssize_t *consumed)
5086
{
5087
    const char *starts = s;
5088
    Py_ssize_t startinpos;
5089
    Py_ssize_t endinpos;
5090
    _PyUnicodeWriter writer;
5091
    const unsigned char *q, *e;
5092
    int le, bo = 0;       /* assume native ordering by default */
5093
    const char *encoding;
5094
    const char *errmsg = "";
5095
    PyObject *errorHandler = NULL;
5096
    PyObject *exc = NULL;
5097
5098
    q = (const unsigned char *)s;
5099
    e = q + size;
5100
5101
    if (byteorder)
  Branch (5101:9): [True: 57.9k, False: 33]
5102
        bo = *byteorder;
5103
5104
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5105
       byte order setting accordingly. In native mode, the leading BOM
5106
       mark is skipped, in all other modes, it is copied to the output
5107
       stream as-is (giving a ZWNBSP character). */
5108
    if (bo == 0 && 
size >= 41.06k
) {
  Branch (5108:9): [True: 1.06k, False: 56.9k]
  Branch (5108:20): [True: 977, False: 91]
5109
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5110
        if (bom == 0x0000FEFF) {
  Branch (5110:13): [True: 935, False: 42]
5111
            bo = -1;
5112
            q += 4;
5113
        }
5114
        else if (bom == 0xFFFE0000) {
  Branch (5114:18): [True: 40, False: 2]
5115
            bo = 1;
5116
            q += 4;
5117
        }
5118
        if (byteorder)
  Branch (5118:13): [True: 947, False: 30]
5119
            *byteorder = bo;
5120
    }
5121
5122
    if (q == e) {
  Branch (5122:9): [True: 1.14k, False: 56.8k]
5123
        if (consumed)
  Branch (5123:13): [True: 51, False: 1.09k]
5124
            *consumed = size;
5125
        _Py_RETURN_UNICODE_EMPTY();
5126
    }
5127
5128
#ifdef WORDS_BIGENDIAN
5129
    le = bo < 0;
5130
#else
5131
    le = bo <= 0;
5132
#endif
5133
    encoding = le ? 
"utf-32-le"37.9k
:
"utf-32-be"18.9k
;
  Branch (5133:16): [True: 37.9k, False: 18.9k]
5134
5135
    _PyUnicodeWriter_Init(&writer);
5136
    writer.min_length = (e - q + 3) / 4;
5137
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  Branch (5137:9): [True: 0, False: 56.8k]
5138
        goto onError;
5139
5140
    
while (56.8k
1) {
  Branch (5140:12): [Folded - Ignored]
5141
        Py_UCS4 ch = 0;
5142
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5143
5144
        if (e - q >= 4) {
  Branch (5144:13): [True: 57.0k, False: 4.76k]
5145
            int kind = writer.kind;
5146
            void *data = writer.data;
5147
            const unsigned char *last = e - 4;
5148
            Py_ssize_t pos = writer.pos;
5149
            if (le) {
  Branch (5149:17): [True: 38.0k, False: 19.0k]
5150
                do {
5151
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5152
                    if (ch > maxch)
  Branch (5152:25): [True: 3.26k, False: 597k]
5153
                        break;
5154
                    if (kind != PyUnicode_1BYTE_KIND &&
  Branch (5154:25): [True: 37.2k, False: 559k]
5155
                        
Py_UNICODE_IS_SURROGATE(ch)37.2k
)
  Branch (5155:25): [True: 10, False: 37.2k]
5156
                        break;
5157
                    PyUnicode_WRITE(kind, data, pos++, ch);
5158
                    q += 4;
5159
                } while (q <= last);
  Branch (5159:26): [True: 562k, False: 34.7k]
5160
            }
5161
            else {
5162
                do {
5163
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5164
                    if (ch > maxch)
  Branch (5164:25): [True: 1.68k, False: 299k]
5165
                        break;
5166
                    if (kind != PyUnicode_1BYTE_KIND &&
  Branch (5166:25): [True: 19.9k, False: 279k]
5167
                        
Py_UNICODE_IS_SURROGATE(ch)19.9k
)
  Branch (5167:25): [True: 5, False: 19.9k]
5168
                        break;
5169
                    PyUnicode_WRITE(kind, data, pos++, ch);
5170
                    q += 4;
5171
                } while (q <= last);
  Branch (5171:26): [True: 282k, False: 17.3k]
5172
            }
5173
            writer.pos = pos;
5174
        }
5175
5176
        if (Py_UNICODE_IS_SURROGATE(ch)) {
  Branch (5176:13): [True: 122, False: 61.6k]
5177
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5178
            startinpos = ((const char *)q) - starts;
5179
            endinpos = startinpos + 4;
5180
        }
5181
        else if (ch <= maxch) {
  Branch (5181:18): [True: 56.8k, False: 4.84k]
5182
            if (q == e || 
consumed6.41k
)
  Branch (5182:17): [True: 50.4k, False: 6.41k]
  Branch (5182:27): [True: 6.40k, False: 8]
5183
                break;
5184
            /* remaining bytes at the end? (size should be divisible by 4) */
5185
            errmsg = "truncated data";
5186
            startinpos = ((const char *)q) - starts;
5187
            endinpos = ((const char *)e) - starts;
5188
        }
5189
        else {
5190
            if (ch < 0x110000) {
  Branch (5190:17): [True: 4.83k, False: 8]
5191
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  Branch (5191:21): [True: 0, False: 4.83k]
5192
                    goto onError;
5193
                q += 4;
5194
                continue;
5195
            }
5196
            errmsg = "code point not in range(0x110000)";
5197
            startinpos = ((const char *)q) - starts;
5198
            endinpos = startinpos + 4;
5199
        }
5200
5201
        /* The remaining input chars are ignored if the callback
5202
           chooses to skip the input */
5203
        if (unicode_decode_call_errorhandler_writer(
  Branch (5203:13): [True: 15, False: 123]
5204
                errors, &errorHandler,
5205
                encoding, errmsg,
5206
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5207
                &writer))
5208
            goto onError;
5209
    }
5210
5211
    if (consumed)
  Branch (5211:9): [True: 56.6k, False: 182]
5212
        *consumed = (const char *)q-starts;
5213
5214
    Py_XDECREF(errorHandler);
5215
    Py_XDECREF(exc);
5216
    return _PyUnicodeWriter_Finish(&writer);
5217
5218
  onError:
5219
    _PyUnicodeWriter_Dealloc(&writer);
5220
    Py_XDECREF(errorHandler);
5221
    Py_XDECREF(exc);
5222
    return NULL;
5223
}
5224
5225
PyObject *
5226
_PyUnicode_EncodeUTF32(PyObject *str,
5227
                       const char *errors,
5228
                       int byteorder)
5229
{
5230
    int kind;
5231
    const void *data;
5232
    Py_ssize_t len;
5233
    PyObject *v;
5234
    uint32_t *out;
5235
#if PY_LITTLE_ENDIAN
5236
    int native_ordering = byteorder <= 0;
5237
#else
5238
    int native_ordering = byteorder >= 0;
5239
#endif
5240
    const char *encoding;
5241
    Py_ssize_t nsize, pos;
5242
    PyObject *errorHandler = NULL;
5243
    PyObject *exc = NULL;
5244
    PyObject *rep = NULL;
5245
5246
    if (!PyUnicode_Check(str)) {
  Branch (5246:9): [True: 0, False: 2.45k]
5247
        PyErr_BadArgument();
5248
        return NULL;
5249
    }
5250
    kind = PyUnicode_KIND(str);
5251
    data = PyUnicode_DATA(str);
5252
    len = PyUnicode_GET_LENGTH(str);
5253
5254
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
  Branch (5254:9): [True: 0, False: 2.45k]
5255
        return PyErr_NoMemory();
5256
    nsize = len + (byteorder == 0);
5257
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5258
    if (v == NULL)
  Branch (5258:9): [True: 0, False: 2.45k]
5259
        return NULL;
5260
5261
    /* output buffer is 4-bytes aligned */
5262
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5263
    out = (uint32_t *)PyBytes_AS_STRING(v);
5264
    if (byteorder == 0)
  Branch (5264:9): [True: 806, False: 1.64k]
5265
        *out++ = 0xFEFF;
5266
    if (len == 0)
  Branch (5266:9): [True: 5, False: 2.45k]
5267
        goto done;
5268
5269
    if (byteorder == -1)
  Branch (5269:9): [True: 868, False: 1.58k]
5270
        encoding = "utf-32-le";
5271
    else if (byteorder == 1)
  Branch (5271:14): [True: 777, False: 805]
5272
        encoding = "utf-32-be";
5273
    else
5274
        encoding = "utf-32";
5275
5276
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (5276:9): [True: 1.81k, False: 636]
5277
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5278
        goto done;
5279
    }
5280
5281
    pos = 0;
5282
    while (pos < len) {
  Branch (5282:12): [True: 724, False: 7]
5283
        Py_ssize_t newpos, repsize, moreunits;
5284
5285
        if (kind == PyUnicode_2BYTE_KIND) {
  Branch (5285:13): [True: 690, False: 34]
5286
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5287
                                        &out, native_ordering);
5288
        }
5289
        else {
5290
            assert(kind == PyUnicode_4BYTE_KIND);
5291
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5292
                                        &out, native_ordering);
5293
        }
5294
        if (pos == len)
  Branch (5294:13): [True: 615, False: 109]
5295
            break;
5296
5297
        rep = unicode_encode_call_errorhandler(
5298
                errors, &errorHandler,
5299
                encoding, "surrogates not allowed",
5300
                str, &exc, pos, pos + 1, &newpos);
5301
        if (!rep)
  Branch (5301:13): [True: 3, False: 106]
5302
            goto error;
5303
5304
        if (PyBytes_Check(rep)) {
5305
            repsize = PyBytes_GET_SIZE(rep);
5306
            if (repsize & 3) {
  Branch (5306:17): [True: 8, False: 17]
5307
                raise_encode_exception(&exc, encoding,
5308
                                       str, pos, pos + 1,
5309
                                       "surrogates not allowed");
5310
                goto error;
5311
            }
5312
            moreunits = repsize / 4;
5313
        }
5314
        else {
5315
            assert(PyUnicode_Check(rep));
5316
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5317
            if (!PyUnicode_IS_ASCII(rep)) {
  Branch (5317:17): [True: 3, False: 78]
5318
                raise_encode_exception(&exc, encoding,
5319
                                       str, pos, pos + 1,
5320
                                       "surrogates not allowed");
5321
                goto error;
5322
            }
5323
        }
5324
        moreunits += pos - newpos;
5325
        pos = newpos;
5326
5327
        /* four bytes are reserved for each surrogate */
5328
        if (moreunits > 0) {
  Branch (5328:13): [True: 59, False: 36]
5329
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5330
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
  Branch (5330:17): [True: 0, False: 59]
5331
                /* integer overflow */
5332
                PyErr_NoMemory();
5333
                goto error;
5334
            }
5335
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
  Branch (5335:17): [True: 0, False: 59]
5336
                goto error;
5337
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5338
        }
5339
5340
        if (PyBytes_Check(rep)) {
5341
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
5342
            out += repsize / 4;
5343
        } else /* rep is unicode */ {
5344
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5345
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5346
                                 &out, native_ordering);
5347
        }
5348
5349
        Py_CLEAR(rep);
5350
    }
5351
5352
    /* Cut back to size actually needed. This is necessary for, for example,
5353
       encoding of a string containing isolated surrogates and the 'ignore'
5354
       handler is used. */
5355
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5356
    if (nsize != PyBytes_GET_SIZE(v))
  Branch (5356:9): [True: 6, False: 616]
5357
      _PyBytes_Resize(&v, nsize);
5358
    Py_XDECREF(errorHandler);
5359
    Py_XDECREF(exc);
5360
  done:
5361
    return v;
5362
  error:
5363
    Py_XDECREF(rep);
5364
    Py_XDECREF(errorHandler);
5365
    Py_XDECREF(exc);
5366
    Py_XDECREF(v);
5367
    return NULL;
5368
}
5369
5370
PyObject *
5371
PyUnicode_AsUTF32String(PyObject *unicode)
5372
{
5373
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5374
}
5375
5376
/* --- UTF-16 Codec ------------------------------------------------------- */
5377
5378
PyObject *
5379
PyUnicode_DecodeUTF16(const char *s,
5380
                      Py_ssize_t size,
5381
                      const char *errors,
5382
                      int *byteorder)
5383
{
5384
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5385
}
5386
5387
PyObject *
5388
PyUnicode_DecodeUTF16Stateful(const char *s,
5389
                              Py_ssize_t size,
5390
                              const char *errors,
5391
                              int *byteorder,
5392
                              Py_ssize_t *consumed)
5393
{
5394
    const char *starts = s;
5395
    Py_ssize_t startinpos;
5396
    Py_ssize_t endinpos;
5397
    _PyUnicodeWriter writer;
5398
    const unsigned char *q, *e;
5399
    int bo = 0;       /* assume native ordering by default */
5400
    int native_ordering;
5401
    const char *errmsg = "";
5402
    PyObject *errorHandler = NULL;
5403
    PyObject *exc = NULL;
5404
    const char *encoding;
5405
5406
    q = (const unsigned char *)s;
5407
    e = q + size;
5408
5409
    if (byteorder)
  Branch (5409:9): [True: 32.7k, False: 1.06k]
5410
        bo = *byteorder;
5411
5412
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5413
       byte order setting accordingly. In native mode, the leading BOM
5414
       mark is skipped, in all other modes, it is copied to the output
5415
       stream as-is (giving a ZWNBSP character). */
5416
    if (bo == 0 && 
size >= 22.03k
) {
  Branch (5416:9): [True: 2.03k, False: 31.8k]
  Branch (5416:20): [True: 2.00k, False: 38]
5417
        const Py_UCS4 bom = (q[1] << 8) | q[0];
5418
        if (bom == 0xFEFF) {
  Branch (5418:13): [True: 1.97k, False: 23]
5419
            q += 2;
5420
            bo = -1;
5421
        }
5422
        else if (bom == 0xFFFE) {
  Branch (5422:18): [True: 21, False: 2]
5423
            q += 2;
5424
            bo = 1;
5425
        }
5426
        if (byteorder)
  Branch (5426:13): [True: 940, False: 1.06k]
5427
            *byteorder = bo;
5428
    }
5429
5430
    if (q == e) {
  Branch (5430:9): [True: 1.19k, False: 32.6k]
5431
        if (consumed)
  Branch (5431:13): [True: 72, False: 1.11k]
5432
            *consumed = size;
5433
        _Py_RETURN_UNICODE_EMPTY();
5434
    }
5435
5436
#if PY_LITTLE_ENDIAN
5437
    native_ordering = bo <= 0;
5438
    encoding = bo <= 0 ? 
"utf-16-le"21.7k
:
"utf-16-be"10.8k
;
  Branch (5438:16): [True: 21.7k, False: 10.8k]
5439
#else
5440
    native_ordering = bo >= 0;
5441
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5442
#endif
5443
5444
    /* Note: size will always be longer than the resulting Unicode
5445
       character count normally.  Error handler will take care of
5446
       resizing when needed. */
5447
    _PyUnicodeWriter_Init(&writer);
5448
    writer.min_length = (e - q + 1) / 2;
5449
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  Branch (5449:9): [True: 0, False: 32.6k]
5450
        goto onError;
5451
5452
    
while (32.6k
1) {
  Branch (5452:12): [Folded - Ignored]
5453
        Py_UCS4 ch = 0;
5454
        if (e - q >= 2) {
  Branch (5454:13): [True: 35.6k, False: 4.15k]
5455
            int kind = writer.kind;
5456
            if (kind == PyUnicode_1BYTE_KIND) {
  Branch (5456:17): [True: 31.4k, False: 4.13k]
5457
                if (PyUnicode_IS_ASCII(writer.buffer))
5458
                    ch = asciilib_utf16_decode(&q, e,
5459
                            (Py_UCS1*)writer.data, &writer.pos,
5460
                            native_ordering);
5461
                else
5462
                    ch = ucs1lib_utf16_decode(&q, e,
5463
                            (Py_UCS1*)writer.data, &writer.pos,
5464
                            native_ordering);
5465
            } else 
if (4.13k
kind == PyUnicode_2BYTE_KIND4.13k
) {
  Branch (5465:24): [True: 4.09k, False: 38]
5466
                ch = ucs2lib_utf16_decode(&q, e,
5467
                        (Py_UCS2*)writer.data, &writer.pos,
5468
                        native_ordering);
5469
            } else {
5470
                assert(kind == PyUnicode_4BYTE_KIND);
5471
                ch = ucs4lib_utf16_decode(&q, e,
5472
                        (Py_UCS4*)writer.data, &writer.pos,
5473
                        native_ordering);
5474
            }
5475
        }
5476
5477
        switch (ch)
5478
        {
5479
        case 0:
  Branch (5479:9): [True: 32.6k, False: 7.14k]
5480
            /* remaining byte at the end? (size should be even) */
5481
            if (q == e || 
consumed1.92k
)
  Branch (5481:17): [True: 30.7k, False: 1.92k]
  Branch (5481:27): [True: 1.90k, False: 19]
5482
                goto End;
5483
            errmsg = "truncated data";
5484
            startinpos = ((const char *)q) - starts;
5485
            endinpos = ((const char *)e) - starts;
5486
            break;
5487
            /* The remaining input chars are ignored if the callback
5488
               chooses to skip the input */
5489
        case 1:
  Branch (5489:9): [True: 44, False: 39.7k]
5490
            q -= 2;
5491
            if (consumed)
  Branch (5491:17): [True: 30, False: 14]
5492
                goto End;
5493
            errmsg = "unexpected end of data";
5494
            startinpos = ((const char *)q) - starts;
5495
            endinpos = ((const char *)e) - starts;
5496
            break;
5497
        case 2:
  Branch (5497:9): [True: 105, False: 39.6k]
5498
            errmsg = "illegal encoding";
5499
            startinpos = ((const char *)q) - 2 - starts;
5500
            endinpos = startinpos + 2;
5501
            break;
5502
        case 3:
  Branch (5502:9): [True: 12, False: 39.7k]
5503
            errmsg = "illegal UTF-16 surrogate";
5504
            startinpos = ((const char *)q) - 4 - starts;
5505
            endinpos = startinpos + 2;
5506
            break;
5507
        default:
  Branch (5507:9): [True: 6.98k, False: 32.7k]
5508
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  Branch (5508:17): [True: 0, False: 6.98k]
5509
                goto onError;
5510
            continue;
5511
        }
5512
5513
        if (unicode_decode_call_errorhandler_writer(
  Branch (5513:13): [True: 27, False: 123]
5514
                errors,
5515
                &errorHandler,
5516
                encoding, errmsg,
5517
                &starts,
5518
                (const char **)&e,
5519
                &startinpos,
5520
                &endinpos,
5521
                &exc,
5522
                (const char **)&q,
5523
                &writer))
5524
            goto onError;
5525
    }
5526
5527
End:
5528
    if (consumed)
  Branch (5528:9): [True: 29.3k, False: 3.30k]
5529
        *consumed = (const char *)q-starts;
5530
5531
    Py_XDECREF(errorHandler);
5532
    Py_XDECREF(exc);
5533
    return _PyUnicodeWriter_Finish(&writer);
5534
5535
  onError:
5536
    _PyUnicodeWriter_Dealloc(&writer);
5537
    Py_XDECREF(errorHandler);
5538
    Py_XDECREF(exc);
5539
    return NULL;
5540
}
5541
5542
/* Encode `str` as UTF-16 and return a new bytes object, or NULL with an
   exception set.

   byteorder < 0 produces little-endian output ("utf-16-le"), byteorder > 0
   big-endian output ("utf-16-be"), and byteorder == 0 writes a 0xFEFF BOM
   followed by data in the host's native order ("utf-16").

   `errors` names the error handler that is consulted whenever the low-level
   encoder stops before the end of the input; every such stop is reported as
   "surrogates not allowed".  A handler may return bytes (must have an even
   length, copied verbatim) or an ASCII-only str (encoded like ordinary
   text); anything else is turned into an encode exception. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
    PyObject *v;
    unsigned short *out;
    const char *encoding;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* One extra code unit is needed for the BOM when no order was forced. */
    const Py_ssize_t bom_units = (byteorder == 0);

    /* Every non-BMP character needs a surrogate pair, i.e. one extra unit.
       Only 4-byte strings can contain such characters. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Refuse sizes whose byte count (units * 2) would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - bom_units) {
        return PyErr_NoMemory();
    }
    v = PyBytes_FromStringAndSize(NULL, (len + pairs + bom_units) * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }

    /* Nothing (more) to encode: the buffer already has its final size. */
    if (len == 0) {
        return v;
    }

    /* Latin-1 data cannot contain surrogates, so no error handling and no
       trimming is required on this path. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        return v;
    }

    /* Codec name reported to the error handler. */
    encoding = (byteorder < 0) ? "utf-16-le"
             : (byteorder > 0) ? "utf-16-be"
             : "utf-16";

    Py_ssize_t pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as possible; the helper's return value is the number
           of input characters it consumed before stopping. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped at str[pos]: ask the error handler. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* Validate the replacement and work out how many 16-bit units it
           occupies in the output. */
        const int rep_is_bytes = PyBytes_Check(rep);
        int rep_usable;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 2;
            rep_usable = !(repsize & 1);        /* whole code units only */
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            rep_usable = PyUnicode_IS_ASCII(rep);
        }
        if (!rep_usable) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Units are already reserved for the input the handler skipped
           (two bytes are reserved for each surrogate), so only the surplus
           has to be added to the buffer. */
        moreunits += pos - newpos;
        pos = newpos;

        if (moreunits > 0) {
            /* Remember the write offset: resizing may move the buffer. */
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) {
                goto error;
            }
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is an ASCII str */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Trim to the number of bytes actually written.  This matters, for
       example, when isolated surrogates were dropped by the 'ignore'
       handler.  The result of the resize is deliberately not inspected
       here: `v` is returned as the call left it. */
    Py_ssize_t nbytes = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nbytes != PyBytes_GET_SIZE(v)) {
        _PyBytes_Resize(&v, nbytes);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return v;

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
#undef STORECHAR
}
5705
5706
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16() with
   no explicit error handler (errors == NULL) and byteorder 0. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    return encoded;
}
5711
5712
/* --- Unicode Escape Codec ----------------------------------------------- */
5713
5714
/* Name-lookup C API used for \N{name} escapes.  Starts out NULL and is
   filled in via PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) the first
   time the unicode-escape decoder meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
5715
5716
PyObject *
5717
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
5718
                               Py_ssize_t size,
5719
                               const char *errors,
5720
                               Py_ssize_t *consumed,
5721
                               const char **first_invalid_escape)
5722
{
5723
    const char *starts = s;
5724
    _PyUnicodeWriter writer;
5725
    const char *end;
5726
    PyObject *errorHandler = NULL;
5727
    PyObject *exc = NULL;
5728
5729
    // so we can remember if we've seen an invalid escape char or not
5730
    *first_invalid_escape = NULL;
5731
5732
    if (size == 0) {
  Branch (5732:9): [True: 17, False: 92.5k]
5733
        if (consumed) {
  Branch (5733:13): [True: 8, False: 9]
5734
            *consumed = 0;
5735
        }
5736
        _Py_RETURN_UNICODE_EMPTY();
5737
    }
5738
    /* Escaped strings will always be longer than the resulting
5739
       Unicode string, so we start with size here and then reduce the
5740
       length after conversion to the true value.
5741
       (but if the error callback returns a long replacement string
5742
       we'll have to allocate more space) */
5743
    _PyUnicodeWriter_Init(&writer);
5744
    writer.min_length = size;
5745
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
  Branch (5745:9): [True: 0, False: 92.5k]
5746
        goto onError;
5747
    }
5748
5749
    end = s + size;
5750
    while (s < end) {
  Branch (5750:12): [True: 683k, False: 91.9k]
5751
        unsigned char c = (unsigned char) *s++;
5752
        Py_UCS4 ch;
5753
        int count;
5754
        const char *message;
5755
5756
#define WRITE_ASCII_CHAR(ch)                                                  \
5757
            
do 25.4k
{ \
5758
                assert(ch <= 127);                                            \
5759
                assert(writer.pos < writer.size);                             \
5760
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
5761
            } while(0)
5762
5763
#define WRITE_CHAR(ch)                                                        \
5764
            
do 657k
{ \
5765
                if (ch <= writer.maxchar) {                                   \
5766
                    assert(writer.pos < writer.size);                         \
5767
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5768
                }                                                             \
5769
                else 
if (7.06k
_PyUnicodeWriter_WriteCharInline(&writer, ch) < 07.06k
) { \
5770
                    goto onError;                                             \
5771
                }                                                             \
5772
            } while(0)
5773
5774
        /* Non-escape characters are interpreted as Unicode ordinals */
5775
        if (c != '\\') {
  Branch (5775:13): [True: 645k, False: 38.4k]
5776
            WRITE_CHAR(c);
5777
            continue;
5778
        }
5779
5780
        Py_ssize_t startinpos = s - starts - 1;
5781
        /* \ - Escapes */
5782
        if (s >= end) {
  Branch (5782:13): [True: 226, False: 38.1k]
5783
            message = "\\ at end of string";
5784
            goto incomplete;
5785
        }
5786
        c = (unsigned char) *s++;
5787
5788
        assert(writer.pos < writer.size);
5789
        switch (c) {
5790
5791
            /* \x escapes */
5792
        case '\n': continue;
  Branch (5792:9): [True: 320, False: 37.8k]
5793
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
  Branch (5793:9): [True: 1.07k, False: 37.1k]
5794
        case '\'': WRITE_ASCII_CHAR('\''); continue;
  Branch (5794:9): [True: 93, False: 38.0k]
5795
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
  Branch (5795:9): [True: 82, False: 38.1k]
5796
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
  Branch (5796:9): [True: 52, False: 38.1k]
5797
        /* FF */
5798
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
  Branch (5798:9): [True: 37, False: 38.1k]
5799
        case 't': WRITE_ASCII_CHAR('\t'); continue;
  Branch (5799:9): [True: 3.59k, False: 34.5k]
5800
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
  Branch (5800:9): [True: 16.7k, False: 21.3k]
5801
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
  Branch (5801:9): [True: 3.57k, False: 34.6k]
5802
        /* VT */
5803
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
  Branch (5803:9): [True: 12, False: 38.1k]
5804
        /* BEL, not classic C */
5805
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
  Branch (5805:9): [True: 6, False: 38.1k]
5806
5807
            /* \OOO (octal) escapes */
5808
        
case '0': 38
case '1': case '2': case '3':
  Branch (5808:9): [True: 38, False: 38.1k]
  Branch (5808:19): [True: 17, False: 38.1k]
  Branch (5808:29): [True: 0, False: 38.1k]
  Branch (5808:39): [True: 0, False: 38.1k]
5809
        
case '4': 187
case '5': 315
case '6': 443
case '7':
  Branch (5809:9): [True: 132, False: 38.0k]
  Branch (5809:19): [True: 128, False: 38.0k]
  Branch (5809:29): [True: 128, False: 38.0k]
  Branch (5809:39): [True: 130, False: 38.0k]
5810
            ch = c - '0';
5811
            if (s < end && 
'0' <= *s533
&&
*s <= '7'525
) {
  Branch (5811:17): [True: 533, False: 40]
  Branch (5811:28): [True: 525, False: 8]
  Branch (5811:41): [True: 519, False: 6]
5812
                ch = (ch<<3) + *s++ - '0';
5813
                if (s < end && '0' <= *s && *s <= '7') {
  Branch (5813:21): [True: 519, False: 0]
  Branch (5813:32): [True: 519, False: 0]
  Branch (5813:45): [True: 517, False: 2]
5814
                    ch = (ch<<3) + *s++ - '0';
5815
                }
5816
            }
5817
            if (ch > 0377) {
  Branch (5817:17): [True: 514, False: 59]
5818
                if (*first_invalid_escape == NULL) {
  Branch (5818:21): [True: 514, False: 0]
5819
                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
5820
                                                    already incremented s. */
5821
                }
5822
            }
5823
            WRITE_CHAR(ch);
5824
            continue;
5825
5826
            /* hex escapes */
5827
            /* \xXX */
5828
        case 'x':
  Branch (5828:9): [True: 785, False: 37.4k]
5829
            count = 2;
5830
            message = "truncated \\xXX escape";
5831
            goto hexescape;
5832
5833
            /* \uXXXX */
5834
        case 'u':
  Branch (5834:9): [True: 9.06k, False: 29.1k]
5835
            count = 4;
5836
            message = "truncated \\uXXXX escape";
5837
            goto hexescape;
5838
5839
            /* \UXXXXXXXX */
5840
        case 'U':
  Branch (5840:9): [True: 1.87k, False: 36.3k]
5841
            count = 8;
5842
            message = "truncated \\UXXXXXXXX escape";
5843
        hexescape:
5844
            for (ch = 0; count; 
++s, --count51.8k
) {
  Branch (5844:26): [True: 52.2k, False: 11.4k]
5845
                if (s >= end) {
  Branch (5845:21): [True: 292, False: 51.9k]
5846
                    goto incomplete;
5847
                }
5848
                c = (unsigned char)*s;
5849
                ch <<= 4;
5850
                if (c >= '0' && c <= '9') {
  Branch (5850:21): [True: 51.9k, False: 0]
  Branch (5850:33): [True: 46.5k, False: 5.36k]
5851
                    ch += c - '0';
5852
                }
5853
                else if (c >= 'a' && 
c <= 'f'5.31k
) {
  Branch (5853:26): [True: 5.31k, False: 54]
  Branch (5853:38): [True: 5.30k, False: 5]
5854
                    ch += c - ('a' - 10);
5855
                }
5856
                else if (c >= 'A' && c <= 'F') {
  Branch (5856:26): [True: 59, False: 0]
  Branch (5856:38): [True: 24, False: 35]
5857
                    ch += c - ('A' - 10);
5858
                }
5859
                else {
5860
                    goto error;
5861
                }
5862
            }
5863
5864
            /* when we get here, ch is a 32-bit unicode character */
5865
            if (ch > MAX_UNICODE) {
  Branch (5865:17): [True: 10, False: 11.3k]
5866
                message = "illegal Unicode character";
5867
                goto error;
5868
            }
5869
5870
            WRITE_CHAR(ch);
5871
            continue;
5872
5873
            /* \N{name} */
5874
        case 'N':
  Branch (5874:9): [True: 100, False: 38.0k]
5875
            if (ucnhash_capi == NULL) {
  Branch (5875:17): [True: 5, False: 95]
5876
                /* load the unicode data module */
5877
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5878
                                                PyUnicodeData_CAPSULE_NAME, 1);
5879
                if (ucnhash_capi == NULL) {
  Branch (5879:21): [True: 0, False: 5]
5880
                    PyErr_SetString(
5881
                        PyExc_UnicodeError,
5882
                        "\\N escapes not supported (can't load unicodedata module)"
5883
                        );
5884
                    goto onError;
5885
                }
5886
            }
5887
5888
            message = "malformed \\N character escape";
5889
            if (s >= end) {
  Branch (5889:17): [True: 2, False: 98]
5890
                goto incomplete;
5891
            }
5892
            if (*s == '{') {
  Branch (5892:17): [True: 93, False: 5]
5893
                const char *start = ++s;
5894
                size_t namelen;
5895
                /* look for the closing brace */
5896
                while (s < end && 
*s != '}'101k
)
  Branch (5896:24): [True: 101k, False: 5]
  Branch (5896:35): [True: 101k, False: 88]
5897
                    s++;
5898
                if (s >= end) {
  Branch (5898:21): [True: 5, False: 88]
5899
                    goto incomplete;
5900
                }
5901
                namelen = s - start;
5902
                if (namelen) {
  Branch (5902:21): [True: 88, False: 0]
5903
                    /* found a name.  look it up in the unicode database */
5904
                    s++;
5905
                    ch = 0xffffffff; /* in case 'getcode' messes up */
5906
                    if (namelen <= INT_MAX &&
  Branch (5906:25): [True: 88, False: 0]
5907
                        ucnhash_capi->getcode(start, (int)namelen,
  Branch (5907:25): [True: 80, False: 8]
5908
                                              &ch, 0)) {
5909
                        assert(ch <= MAX_UNICODE);
5910
                        WRITE_CHAR(ch);
5911
                        continue;
5912
                    }
5913
                    message = "unknown Unicode character name";
5914
                }
5915
            }
5916
            goto error;
5917
5918
        default:
  Branch (5918:9): [True: 150, False: 38.0k]
5919
            if (*first_invalid_escape == NULL) {
  Branch (5919:17): [True: 150, False: 0]
5920
                *first_invalid_escape = s-1; /* Back up one char, since we've
5921
                                                already incremented s. */
5922
            }
5923
            WRITE_ASCII_CHAR('\\');
5924
            WRITE_CHAR(c);
5925
            continue;
5926
        }
5927
5928
      incomplete:
5929
        if (consumed) {
  Branch (5929:13): [True: 470, False: 55]
5930
            *consumed = startinpos;
5931
            break;
5932
        }
5933
      error:;
5934
        Py_ssize_t endinpos = s-starts;
5935
        writer.min_length = end - s + writer.pos;
5936
        if (unicode_decode_call_errorhandler_writer(
  Branch (5936:13): [True: 65, False: 48]
5937
                errors, &errorHandler,
5938
                "unicodeescape", message,
5939
                &starts, &end, &startinpos, &endinpos, &exc, &s,
5940
                &writer)) {
5941
            goto onError;
5942
        }
5943
        assert(end - s <= writer.size - writer.pos);
5944
5945
#undef WRITE_ASCII_CHAR
5946
#undef WRITE_CHAR
5947
    }
5948
5949
    Py_XDECREF(errorHandler);
5950
    Py_XDECREF(exc);
5951
    return _PyUnicodeWriter_Finish(&writer);
5952
5953
  onError:
5954
    _PyUnicodeWriter_Dealloc(&writer);
5955
    Py_XDECREF(errorHandler);
5956
    Py_XDECREF(exc);
5957
    return NULL;
5958
}
5959
5960
PyObject *
5961
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
5962
                              Py_ssize_t size,
5963
                              const char *errors,
5964
                              Py_ssize_t *consumed)
5965
{
5966
    const char *first_invalid_escape;
5967
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
5968
                                                      consumed,
5969
                                                      &first_invalid_escape);
5970
    if (result == NULL)
  Branch (5970:9): [True: 32, False: 6.28k]
5971
        return NULL;
5972
    if (first_invalid_escape != NULL) {
  Branch (5972:9): [True: 300, False: 5.98k]
5973
        unsigned char c = *first_invalid_escape;
5974
        if ('4' <= c && c <= '7') {
  Branch (5974:13): [True: 300, False: 0]
  Branch (5974:25): [True: 256, False: 44]
5975
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
  Branch (5975:17): [True: 0, False: 256]
5976
                                 "invalid octal escape sequence '\\%.3s'",
5977
                                 first_invalid_escape) < 0)
5978
            {
5979
                Py_DECREF(result);
5980
                return NULL;
5981
            }
5982
        }
5983
        else {
5984
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
  Branch (5984:17): [True: 0, False: 44]
5985
                                 "invalid escape sequence '\\%c'",
5986
                                 c) < 0)
5987
            {
5988
                Py_DECREF(result);
5989
                return NULL;
5990
            }
5991
        }
5992
    }
5993
    return result;
5994
}
5995
5996
PyObject *
5997
PyUnicode_DecodeUnicodeEscape(const char *s,
5998
                              Py_ssize_t size,
5999
                              const char *errors)
6000
{
6001
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6002
}
6003
6004
/* Return a Unicode-Escape string version of the Unicode object. */
6005
6006
/* Return a new bytes object holding the Unicode-Escape form of `unicode`,
   or NULL with an exception set.

   Printable ASCII is copied unchanged except that a backslash is doubled;
   TAB, LF and CR become \t, \n and \r; every other character below U+0100
   becomes \xHH; U+0100..U+FFFF become \uHHHH; anything above becomes
   \U00HHHHHH.  An empty string yields an empty bytes object. */
PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    if (len == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* The buffer is sized for the longest escape the string kind can need:
       '\xHH' (4 bytes) for UCS1, '\uHHHH' (6 bytes) for UCS2 and
       '\U00HHHHHH' (10 bytes) for UCS4, i.e. kind * 2 + 2 bytes per source
       character.  It is shrunk to the real length at the end. */
    const Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }
    PyObject *repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    if (repr == NULL) {
        return NULL;
    }

    char *p = PyBytes_AS_STRING(repr);
    for (Py_ssize_t i = 0; i < len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int shift;

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff range: Map 21-bit characters to
               '\U00HHHHHH'.  Make sure that the first two digits are zero */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            for (shift = 20; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x0000000F];
            }
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
            *p++ = '\\';
            *p++ = 'u';
            for (shift = 12; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x000F];
            }
        }
        else if (ch == '\\') {
            /* Escape backslashes */
            *p++ = '\\';
            *p++ = '\\';
        }
        else if (ch >= ' ' && ch < 127) {
            /* Copy printable US ASCII as-is */
            *p++ = (char) ch;
        }
        else {
            /* Remaining U+0000-U+00ff characters */
            *p++ = '\\';
            switch (ch) {
            /* Map special whitespace to '\t', '\n', '\r' */
            case '\t':
                *p++ = 't';
                break;
            case '\n':
                *p++ = 'n';
                break;
            case '\r':
                *p++ = 'r';
                break;
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
            default:
                *p++ = 'x';
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
                *p++ = Py_hexdigits[ch & 0x000F];
                break;
            }
        }
    }

    /* Shrink the over-allocated buffer to what was actually written. */
    assert(p - PyBytes_AS_STRING(repr) > 0);
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
6120
6121
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6122
6123
PyObject *
6124
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6125
                                          Py_ssize_t size,
6126
                                          const char *errors,
6127
                                          Py_ssize_t *consumed)
6128
{
6129
    const char *starts = s;
6130
    _PyUnicodeWriter writer;
6131
    const char *end;
6132
    PyObject *errorHandler = NULL;
6133
    PyObject *exc = NULL;
6134
6135
    if (size == 0) {
  Branch (6135:9): [True: 34, False: 82.6k]
6136
        if (consumed) {
  Branch (6136:13): [True: 8, False: 26]
6137
            *consumed = 0;
6138
        }
6139
        _Py_RETURN_UNICODE_EMPTY();
6140
    }
6141
6142
    /* Escaped strings will always be longer than the resulting
6143
       Unicode string, so we start with size here and then reduce the
6144
       length after conversion to the true value. (But decoding error
6145
       handler might have to resize the string) */
6146
    _PyUnicodeWriter_Init(&writer);
6147
    writer.min_length = size;
6148
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
  Branch (6148:9): [True: 0, False: 82.6k]
6149
        goto onError;
6150
    }
6151
6152
    end = s + size;
6153
    while (s < end) {
  Branch (6153:12): [True: 1.03M, False: 82.3k]
6154
        unsigned char c = (unsigned char) *s++;
6155
        Py_UCS4 ch;
6156
        int count;
6157
        const char *message;
6158
6159
#define WRITE_CHAR(ch)                                                        \
6160
            
do 1.03M
{ \
6161
                if (ch <= writer.maxchar) {                                   \
6162
                    assert(writer.pos < writer.size);                         \
6163
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6164
                }                                                             \
6165
                else 
if (4.22k
_PyUnicodeWriter_WriteCharInline(&writer, ch) < 04.22k
) { \
6166
                    goto onError;                                             \
6167
                }                                                             \
6168
            } while(0)
6169
6170
        /* Non-escape characters are interpreted as Unicode ordinals */
6171
        if (c != '\\' || 
(7.41k
s >= end7.41k
&&
!consumed50
)) {
  Branch (6171:13): [True: 1.03M, False: 7.41k]
  Branch (6171:27): [True: 50, False: 7.36k]
  Branch (6171:39): [True: 1, False: 49]
6172
            WRITE_CHAR(c);
6173
            continue;
6174
        }
6175
6176
        Py_ssize_t startinpos = s - starts - 1;
6177
        /* \ - Escapes */
6178
        if (s >= end) {
  Branch (6178:13): [True: 49, False: 7.36k]
6179
            assert(consumed);
6180
            // Set message to silent compiler warning.
6181
            // Actually it is never used.
6182
            message = "\\ at end of string";
6183
            goto incomplete;
6184
        }
6185
6186
        c = (unsigned char) *s++;
6187
        if (c == 'u') {
  Branch (6187:13): [True: 7.00k, False: 360]
6188
            count = 4;
6189
            message = "truncated \\uXXXX escape";
6190
        }
6191
        else if (c == 'U') {
  Branch (6191:18): [True: 101, False: 259]
6192
            count = 8;
6193
            message = "truncated \\UXXXXXXXX escape";
6194
        }
6195
        else {
6196
            assert(writer.pos < writer.size);
6197
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6198
            WRITE_CHAR(c);
6199
            continue;
6200
        }
6201
6202
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6203
        
for (ch = 0; 7.10k
count;
++s, --count27.8k
) {
  Branch (6203:22): [True: 28.1k, False: 6.81k]
6204
            if (s >= end) {
  Branch (6204:17): [True: 256, False: 27.9k]
6205
                goto incomplete;
6206
            }
6207
            c = (unsigned char)*s;
6208
            ch <<= 4;
6209
            if (c >= '0' && c <= '9') {
  Branch (6209:17): [True: 27.9k, False: 0]
  Branch (6209:29): [True: 27.0k, False: 890]
6210
                ch += c - '0';
6211
            }
6212
            else if (c >= 'a' && 
c <= 'f'866
) {
  Branch (6212:22): [True: 866, False: 24]
  Branch (6212:34): [True: 862, False: 4]
6213
                ch += c - ('a' - 10);
6214
            }
6215
            else if (c >= 'A' && c <= 'F') {
  Branch (6215:22): [True: 28, False: 0]
  Branch (6215:34): [True: 0, False: 28]
6216
                ch += c - ('A' - 10);
6217
            }
6218
            else {
6219
                goto error;
6220
            }
6221
        }
6222
        if (ch > MAX_UNICODE) {
  Branch (6222:13): [True: 7, False: 6.81k]
6223
            message = "\\Uxxxxxxxx out of range";
6224
            goto error;
6225
        }
6226
        WRITE_CHAR(ch);
6227
        continue;
6228
6229
      incomplete:
6230
        if (consumed) {
  Branch (6230:13): [True: 281, False: 24]
6231
            *consumed = startinpos;
6232
            break;
6233
        }
6234
      error:;
6235
        Py_ssize_t endinpos = s-starts;
6236
        writer.min_length = end - s + writer.pos;
6237
        if (unicode_decode_call_errorhandler_writer(
  Branch (6237:13): [True: 21, False: 38]
6238
                errors, &errorHandler,
6239
                "rawunicodeescape", message,
6240
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6241
                &writer)) {
6242
            goto onError;
6243
        }
6244
        assert(end - s <= writer.size - writer.pos);
6245
6246
#undef WRITE_CHAR
6247
    }
6248
    Py_XDECREF(errorHandler);
6249
    Py_XDECREF(exc);
6250
    return _PyUnicodeWriter_Finish(&writer);
6251
6252
  onError:
6253
    _PyUnicodeWriter_Dealloc(&writer);
6254
    Py_XDECREF(errorHandler);
6255
    Py_XDECREF(exc);
6256
    return NULL;
6257
}
6258
6259
PyObject *
6260
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6261
                                 Py_ssize_t size,
6262
                                 const char *errors)
6263
{
6264
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6265
}
6266
6267
6268
/* Encode `unicode` with the raw-unicode-escape codec.

   Returns a new bytes object, or NULL with an exception set when `unicode`
   is not a str object or memory cannot be obtained.

   Code points below U+0100 are written as the single byte of the same
   value, U+0100..U+FFFF as \uHHHH, and everything above as \U00HHHHHH. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    /* One-byte strings need no escaping at all: hand the character buffer
       straight to the bytes constructor. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per character: 6 bytes (\uHHHH) for 2-byte strings and
       10 bytes (\U00HHHHHH) for 4-byte strings. */
    const Py_ssize_t expandsize = kind * 2 + 2;

    /* Refuse sizes whose worst-case product would overflow Py_ssize_t. */
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyObject *repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    if (repr == NULL) {
        return NULL;
    }
    if (len == 0) {
        return repr;
    }

    char *out = PyBytes_AS_STRING(repr);
    Py_ssize_t i = 0;
    while (i < len) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        i++;

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff: '\U00' followed by six hex digits */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff: '\u' followed by four hex digits */
            *out++ = '\\';
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+0000-U+00ff: copied through unchanged */
            *out++ = (char) ch;
        }
    }

    /* Trim the worst-case allocation down to what was actually written. */
    assert(out > PyBytes_AS_STRING(repr));
    if (_PyBytes_Resize(&repr, out - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
6343
6344
/* --- Latin-1 Codec ------------------------------------------------------ */
6345
6346
PyObject *
6347
PyUnicode_DecodeLatin1(const char *s,
6348
                       Py_ssize_t size,
6349
                       const char *errors)
6350
{
6351
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6352
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6353
}
6354
6355
/* create or adjust a UnicodeEncodeError */
6356
static void
6357
make_encode_exception(PyObject **exceptionObject,
6358
                      const char *encoding,
6359
                      PyObject *unicode,
6360
                      Py_ssize_t startpos, Py_ssize_t endpos,
6361
                      const char *reason)
6362
{
6363
    if (*exceptionObject == NULL) {
  Branch (6363:9): [True: 1.76k, False: 6.35k]
6364
        *exceptionObject = PyObject_CallFunction(
6365
            PyExc_UnicodeEncodeError, "sOnns",
6366
            encoding, unicode, startpos, endpos, reason);
6367
    }
6368
    else {
6369
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
  Branch (6369:13): [True: 0, False: 6.35k]
6370
            goto onError;
6371
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
  Branch (6371:13): [True: 0, False: 6.35k]
6372
            goto onError;
6373
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
  Branch (6373:13): [True: 0, False: 6.35k]
6374
            goto onError;
6375
        return;
6376
      onError:
6377
        Py_CLEAR(*exceptionObject);
6378
    }
6379
}
6380
6381
/* raises a UnicodeEncodeError */
6382
static void
6383
raise_encode_exception(PyObject **exceptionObject,
6384
                       const char *encoding,
6385
                       PyObject *unicode,
6386
                       Py_ssize_t startpos, Py_ssize_t endpos,
6387
                       const char *reason)
6388
{
6389
    make_encode_exception(exceptionObject,
6390
                          encoding, unicode, startpos, endpos, reason);
6391
    if (*exceptionObject != NULL)
  Branch (6391:9): [True: 1.17k, False: 0]
6392
        PyCodec_StrictErrors(*exceptionObject);
6393
}
6394
6395
/* error handling callback helper:
6396
   build arguments, call the callback and check the arguments,
6397
   put the result into newpos and return the replacement string, which
6398
   has to be freed by the caller */
6399
static PyObject *
6400
unicode_encode_call_errorhandler(const char *errors,
6401
                                 PyObject **errorHandler,
6402
                                 const char *encoding, const char *reason,
6403
                                 PyObject *unicode, PyObject **exceptionObject,
6404
                                 Py_ssize_t startpos, Py_ssize_t endpos,
6405
                                 Py_ssize_t *newpos)
6406
{
6407
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6408
    Py_ssize_t len;
6409
    PyObject *restuple;
6410
    PyObject *resunicode;
6411
6412
    if (*errorHandler == NULL) {
  Branch (6412:9): [True: 616, False: 6.32k]
6413
        *errorHandler = PyCodec_LookupError(errors);
6414
        if (*errorHandler == NULL)
  Branch (6414:13): [True: 1, False: 615]
6415
            return NULL;
6416
    }
6417
6418
    len = PyUnicode_GET_LENGTH(unicode);
6419
6420
    make_encode_exception(exceptionObject,
6421
                          encoding, unicode, startpos, endpos, reason);
6422
    if (*exceptionObject == NULL)
  Branch (6422:9): [True: 0, False: 6.93k]
6423
        return NULL;
6424
6425
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6426
    if (restuple == NULL)
  Branch (6426:9): [True: 396, False: 6.53k]
6427
        return NULL;
6428
    if (!PyTuple_Check(restuple)) {
  Branch (6428:9): [True: 9, False: 6.53k]
6429
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6430
        Py_DECREF(restuple);
6431
        return NULL;
6432
    }
6433
    if (!PyArg_ParseTuple(restuple, argparse,
  Branch (6433:9): [True: 29, False: 6.50k]
6434
                          &resunicode, newpos)) {
6435
        Py_DECREF(restuple);
6436
        return NULL;
6437
    }
6438
    if (!PyUnicode_Check(resunicode) && 
!51
PyBytes_Check51
(resunicode)) {
  Branch (6438:9): [True: 51, False: 6.45k]
  Branch (6438:41): [True: 0, False: 51]
6439
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6440
        Py_DECREF(restuple);
6441
        return NULL;
6442
    }
6443
    if (*newpos<0)
  Branch (6443:9): [True: 3, False: 6.49k]
6444
        *newpos = len + *newpos;
6445
    if (*newpos<0 || 
*newpos>len6.50k
) {
  Branch (6445:9): [True: 1, False: 6.50k]
  Branch (6445:22): [True: 1, False: 6.49k]
6446
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6447
        Py_DECREF(restuple);
6448
        return NULL;
6449
    }
6450
    Py_INCREF(resunicode);
6451
    Py_DECREF(restuple);
6452
    return resunicode;
6453
}
6454
6455
static PyObject *
6456
unicode_encode_ucs1(PyObject *unicode,
6457
                    const char *errors,
6458
                    const Py_UCS4 limit)
6459
{
6460
    /* input state */
6461
    Py_ssize_t pos=0, size;
6462
    int kind;
6463
    const void *data;
6464
    /* pointer into the output */
6465
    char *str;
6466
    const char *encoding = (limit == 256) ? 
"latin-1"138
:
"ascii"5.75k
;
  Branch (6466:28): [True: 138, False: 5.75k]
6467
    const char *reason = (limit == 256) ? 
"ordinal not in range(256)"138
:
"ordinal not in range(128)"5.75k
;
  Branch (6467:26): [True: 138, False: 5.75k]
6468
    PyObject *error_handler_obj = NULL;
6469
    PyObject *exc = NULL;
6470
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6471
    PyObject *rep = NULL;
6472
    /* output object */
6473
    _PyBytesWriter writer;
6474
6475
    size = PyUnicode_GET_LENGTH(unicode);
6476
    kind = PyUnicode_KIND(unicode);
6477
    data = PyUnicode_DATA(unicode);
6478
    /* allocate enough for a simple encoding without
6479
       replacements, if we need more, we'll resize */
6480
    if (size == 0)
  Branch (6480:9): [True: 6, False: 5.88k]
6481
        return PyBytes_FromStringAndSize(NULL, 0);
6482
6483
    _PyBytesWriter_Init(&writer);
6484
    str = _PyBytesWriter_Alloc(&writer, size);
6485
    if (str == NULL)
  Branch (6485:9): [True: 0, False: 5.88k]
6486
        return NULL;
6487
6488
    
while (5.88k
pos < size) {
  Branch (6488:12): [True: 316k, False: 4.70k]
6489
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6490
6491
        /* can we encode this? */
6492
        if (ch < limit) {
  Branch (6492:13): [True: 276k, False: 39.6k]
6493
            /* no overflow check, because we know that the space is enough */
6494
            *str++ = (char)ch;
6495
            ++pos;
6496
        }
6497
        else {
6498
            Py_ssize_t newpos, i;
6499
            /* startpos for collecting unencodable chars */
6500
            Py_ssize_t collstart = pos;
6501
            Py_ssize_t collend = collstart + 1;
6502
            /* find all unecodable characters */
6503
6504
            while ((collend < size) && 
(225k
PyUnicode_READ225k
(kind, data, collend) >= limit))
  Branch (6504:20): [True: 225k, False: 1.14k]
  Branch (6504:40): [True: 187k, False: 38.4k]
6505
                ++collend;
6506
6507
            /* Only overallocate the buffer if it's not the last write */
6508
            writer.overallocate = (collend < size);
6509
6510
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6511
            if (error_handler == _Py_ERROR_UNKNOWN)
  Branch (6511:17): [True: 5.83k, False: 33.7k]
6512
                error_handler = _Py_GetErrorHandler(errors);
6513
6514
            switch (error_handler) {
6515
            case _Py_ERROR_STRICT:
  Branch (6515:13): [True: 1.14k, False: 38.4k]
6516
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6517
                goto onError;
6518
6519
            case _Py_ERROR_REPLACE:
  Branch (6519:13): [True: 1.13k, False: 38.4k]
6520
                memset(str, '?', collend - collstart);
6521
                str += (collend - collstart);
6522
                /* fall through */
6523
            case _Py_ERROR_IGNORE:
  Branch (6523:13): [True: 1.01k, False: 38.5k]
6524
                pos = collend;
6525
                break;
6526
6527
            case _Py_ERROR_BACKSLASHREPLACE:
  Branch (6527:13): [True: 8.19k, False: 31.4k]
6528
                /* subtract preallocated bytes */
6529
                writer.min_size -= (collend - collstart);
6530
                str = backslashreplace(&writer, str,
6531
                                       unicode, collstart, collend);
6532
                if (str == NULL)
  Branch (6532:21): [True: 0, False: 8.19k]
6533
                    goto onError;
6534
                pos = collend;
6535
                break;
6536
6537
            case _Py_ERROR_XMLCHARREFREPLACE:
  Branch (6537:13): [True: 1.24k, False: 38.3k]
6538
                /* subtract preallocated bytes */
6539
                writer.min_size -= (collend - collstart);
6540
                str = xmlcharrefreplace(&writer, str,
6541
                                        unicode, collstart, collend);
6542
                if (str == NULL)
  Branch (6542:21): [True: 0, False: 1.24k]
6543
                    goto onError;
6544
                pos = collend;
6545
                break;
6546
6547
            case _Py_ERROR_SURROGATEESCAPE:
  Branch (6547:13): [True: 20.6k, False: 18.9k]
6548
                for (i = collstart; i < collend; 
++i159k
) {
  Branch (6548:37): [True: 159k, False: 20.6k]
6549
                    ch = PyUnicode_READ(kind, data, i);
6550
                    if (ch < 0xdc80 || 
0xdcff < ch159k
) {
  Branch (6550:25): [True: 2, False: 159k]
  Branch (6550:40): [True: 0, False: 159k]
6551
                        /* Not a UTF-8b surrogate */
6552
                        break;
6553
                    }
6554
                    *str++ = (char)(ch - 0xdc00);
6555
                    ++pos;
6556
                }
6557
                if (i >= collend)
  Branch (6557:21): [True: 20.6k, False: 2]
6558
                    break;
6559
                collstart = pos;
6560
                assert(collstart != collend);
6561
                /* fall through */
6562
6563
            default:
  Branch (6563:13): [True: 6.18k, False: 33.4k]
6564
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6565
                                                       encoding, reason, unicode, &exc,
6566
                                                       collstart, collend, &newpos);
6567
                if (rep == NULL)
  Branch (6567:21): [True: 38, False: 6.14k]
6568
                    goto onError;
6569
6570
                if (newpos < collstart) {
  Branch (6570:21): [True: 102, False: 6.04k]
6571
                    writer.overallocate = 1;
6572
                    str = _PyBytesWriter_Prepare(&writer, str,
6573
                                                 collstart - newpos);
6574
                    if (str == NULL)
  Branch (6574:25): [True: 0, False: 102]
6575
                        goto onError;
6576
                }
6577
                else {
6578
                    /* subtract preallocated bytes */
6579
                    writer.min_size -= newpos - collstart;
6580
                    /* Only overallocate the buffer if it's not the last write */
6581
                    writer.overallocate = (newpos < size);
6582
                }
6583
6584
                if (PyBytes_Check(rep)) {
6585
                    /* Directly copy bytes result to output. */
6586
                    str = _PyBytesWriter_WriteBytes(&writer, str,
6587
                                                    PyBytes_AS_STRING(rep),
6588
                                                    PyBytes_GET_SIZE(rep));
6589
                }
6590
                else {
6591
                    assert(PyUnicode_Check(rep));
6592
6593
                    if (limit == 256 ?
  Branch (6593:25): [True: 70, False: 6.07k]
  Branch (6593:25): [True: 4, False: 6.13k]
6594
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6595
                        
!6.07k
PyUnicode_IS_ASCII6.07k
(rep))
6596
                    {
6597
                        /* Not all characters are smaller than limit */
6598
                        raise_encode_exception(&exc, encoding, unicode,
6599
                                               collstart, collend, reason);
6600
                        goto onError;
6601
                    }
6602
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6603
                    str = _PyBytesWriter_WriteBytes(&writer, str,
6604
                                                    PyUnicode_DATA(rep),
6605
                                                    PyUnicode_GET_LENGTH(rep));
6606
                }
6607
                if (str == NULL)
  Branch (6607:21): [True: 0, False: 6.14k]
6608
                    goto onError;
6609
6610
                pos = newpos;
6611
                Py_CLEAR(rep);
6612
            }
6613
6614
            /* If overallocation was disabled, ensure that it was the last
6615
               write. Otherwise, we missed an optimization */
6616
            assert(writer.overallocate || pos == size);
6617
        }
6618
    }
6619
6620
    Py_XDECREF(error_handler_obj);
6621
    Py_XDECREF(exc);
6622
    return _PyBytesWriter_Finish(&writer, str);
6623
6624
  onError:
6625
    Py_XDECREF(rep);
6626
    _PyBytesWriter_Dealloc(&writer);
6627
    Py_XDECREF(error_handler_obj);
6628
    Py_XDECREF(exc);
6629
    return NULL;
6630
}
6631
6632
/* Encode `unicode` to Latin-1 using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (including when
   `unicode` is not a str object). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder applies the
           error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is copied into the bytes object
       directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
6648
6649
/* Encode `unicode` to Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);

    return encoded;
}
6654
6655
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6656
6657
PyObject *
6658
PyUnicode_DecodeASCII(const char *s,
6659
                      Py_ssize_t size,
6660
                      const char *errors)
6661
{
6662
    const char *starts = s;
6663
    const char *e = s + size;
6664
    PyObject *error_handler_obj = NULL;
6665
    PyObject *exc = NULL;
6666
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6667
6668
    if (size == 0)
  Branch (6668:9): [True: 614, False: 664k]
6669
        _Py_RETURN_UNICODE_EMPTY();
6670
6671
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6672
    if (size == 1 && 
(unsigned char)s[0] < 128517k
) {
  Branch (6672:9): [True: 517k, False: 147k]
  Branch (6672:22): [True: 517k, False: 16]
6673
        return get_latin1_char((unsigned char)s[0]);
6674
    }
6675
6676
    // Shortcut for simple case
6677
    PyObject *u = PyUnicode_New(size, 127);
6678
    if (u == NULL) {
  Branch (6678:9): [True: 0, False: 147k]
6679
        return NULL;
6680
    }
6681
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
6682
    if (outpos == size) {
  Branch (6682:9): [True: 147k, False: 368]
6683
        return u;
6684
    }
6685
6686
    _PyUnicodeWriter writer;
6687
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
6688
    writer.pos = outpos;
6689
6690
    s += outpos;
6691
    int kind = writer.kind;
6692
    void *data = writer.data;
6693
    Py_ssize_t startinpos, endinpos;
6694
6695
    while (s < e) {
  Branch (6695:12): [True: 227k, False: 311]
6696
        unsigned char c = (unsigned char)*s;
6697
        if (c < 128) {
  Branch (6697:13): [True: 68.5k, False: 158k]
6698
            PyUnicode_WRITE(kind, data, writer.pos, c);
6699
            writer.pos++;
6700
            ++s;
6701
            continue;
6702
        }
6703
6704
        /* byte outsize range 0x00..0x7f: call the error handler */
6705
6706
        if (error_handler == _Py_ERROR_UNKNOWN)
  Branch (6706:13): [True: 368, False: 158k]
6707
            error_handler = _Py_GetErrorHandler(errors);
6708
6709
        switch (error_handler)
6710
        {
6711
        case _Py_ERROR_REPLACE:
  Branch (6711:9): [True: 762, False: 158k]
6712
        case _Py_ERROR_SURROGATEESCAPE:
  Branch (6712:9): [True: 157k, False: 894]
6713
            /* Fast-path: the error handler only writes one character,
6714
               but we may switch to UCS2 at the first write */
6715
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
  Branch (6715:17): [True: 0, False: 158k]
6716
                goto onError;
6717
            kind = writer.kind;
6718
            data = writer.data;
6719
6720
            if (error_handler == _Py_ERROR_REPLACE)
  Branch (6720:17): [True: 762, False: 157k]
6721
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6722
            else
6723
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6724
            writer.pos++;
6725
            ++s;
6726
            break;
6727
6728
        case _Py_ERROR_IGNORE:
  Branch (6728:9): [True: 11, False: 158k]
6729
            ++s;
6730
            break;
6731
6732
        default:
  Branch (6732:9): [True: 121, False: 158k]
6733
            startinpos = s-starts;
6734
            endinpos = startinpos + 1;
6735
            if (unicode_decode_call_errorhandler_writer(
  Branch (6735:17): [True: 57, False: 64]
6736
                    errors, &error_handler_obj,
6737
                    "ascii", "ordinal not in range(128)",
6738
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6739
                    &writer))
6740
                goto onError;
6741
            kind = writer.kind;
6742
            data = writer.data;
6743
        }
6744
    }
6745
    Py_XDECREF(error_handler_obj);
6746
    Py_XDECREF(exc);
6747
    return _PyUnicodeWriter_Finish(&writer);
6748
6749
  onError:
6750
    _PyUnicodeWriter_Dealloc(&writer);
6751
    Py_XDECREF(error_handler_obj);
6752
    Py_XDECREF(exc);
6753
    return NULL;
6754
}
6755
6756
/* Encode `unicode` to ASCII using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (including when
   `unicode` is not a str object). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is copied into the bytes object
       directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
6770
6771
/* Encode `unicode` to ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);

    return encoded;
}
6776
6777
#ifdef MS_WINDOWS
6778
6779
/* --- MBCS codecs for Windows -------------------------------------------- */
6780
6781
#if SIZEOF_INT < SIZEOF_SIZE_T
6782
#define NEED_RETRY
6783
#endif
6784
6785
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
6789
#define DECODING_CHUNK_SIZE (INT_MAX/4)
6790
6791
#ifndef WC_ERR_INVALID_CHARS
6792
#  define WC_ERR_INVALID_CHARS 0x0080
6793
#endif
6794
6795
static const char*
6796
code_page_name(UINT code_page, PyObject **obj)
6797
{
6798
    *obj = NULL;
6799
    if (code_page == CP_ACP)
6800
        return "mbcs";
6801
    if (code_page == CP_UTF7)
6802
        return "CP_UTF7";
6803
    if (code_page == CP_UTF8)
6804
        return "CP_UTF8";
6805
6806
    *obj = PyBytes_FromFormat("cp%u", code_page);
6807
    if (*obj == NULL)
6808
        return NULL;
6809
    return PyBytes_AS_STRING(*obj);
6810
}
6811
6812
static DWORD
6813
decode_code_page_flags(UINT code_page)
6814
{
6815
    if (code_page == CP_UTF7) {
6816
        /* The CP_UTF7 decoder only supports flags=0 */
6817
        return 0;
6818
    }
6819
    else
6820
        return MB_ERR_INVALID_CHARS;
6821
}
6822
6823
/*
6824
 * Decode a byte string from a Windows code page into unicode object in strict
6825
 * mode.
6826
 *
6827
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6828
 * OSError and returns -1 on other error.
6829
 */
6830
static int
6831
decode_code_page_strict(UINT code_page,
6832
                        wchar_t **buf,
6833
                        Py_ssize_t *bufsize,
6834
                        const char *in,
6835
                        int insize)
6836
{
6837
    DWORD flags = MB_ERR_INVALID_CHARS;
6838
    wchar_t *out;
6839
    DWORD outsize;
6840
6841
    /* First get the size of the result */
6842
    assert(insize > 0);
6843
    while ((outsize = MultiByteToWideChar(code_page, flags,
6844
                                          in, insize, NULL, 0)) <= 0)
6845
    {
6846
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
6847
            goto error;
6848
        }
6849
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
6850
        flags = 0;
6851
    }
6852
6853
    /* Extend a wchar_t* buffer */
6854
    Py_ssize_t n = *bufsize;   /* Get the current length */
6855
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
6856
        return -1;
6857
    }
6858
    out = *buf + n;
6859
6860
    /* Do the conversion */
6861
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6862
    if (outsize <= 0)
6863
        goto error;
6864
    return insize;
6865
6866
error:
6867
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6868
        return -2;
6869
    PyErr_SetFromWindowsErr(0);
6870
    return -1;
6871
}
6872
6873
/*
6874
 * Decode a byte string from a code page into unicode object with an error
6875
 * handler.
6876
 *
6877
 * Returns consumed size if succeed, or raise an OSError or
6878
 * UnicodeDecodeError exception and returns -1 on error.
6879
 */
6880
static int
6881
decode_code_page_errors(UINT code_page,
6882
                        wchar_t **buf,
6883
                        Py_ssize_t *bufsize,
6884
                        const char *in, const int size,
6885
                        const char *errors, int final)
6886
{
6887
    const char *startin = in;
6888
    const char *endin = in + size;
6889
    DWORD flags = MB_ERR_INVALID_CHARS;
6890
    /* Ideally, we should get reason from FormatMessage. This is the Windows
6891
       2000 English version of the message. */
6892
    const char *reason = "No mapping for the Unicode character exists "
6893
                         "in the target code page.";
6894
    /* each step cannot decode more than 1 character, but a character can be
6895
       represented as a surrogate pair */
6896
    wchar_t buffer[2], *out;
6897
    int insize;
6898
    Py_ssize_t outsize;
6899
    PyObject *errorHandler = NULL;
6900
    PyObject *exc = NULL;
6901
    PyObject *encoding_obj = NULL;
6902
    const char *encoding;
6903
    DWORD err;
6904
    int ret = -1;
6905
6906
    assert(size > 0);
6907
6908
    encoding = code_page_name(code_page, &encoding_obj);
6909
    if (encoding == NULL)
6910
        return -1;
6911
6912
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6913
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6914
           UnicodeDecodeError. */
6915
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6916
        if (exc != NULL) {
6917
            PyCodec_StrictErrors(exc);
6918
            Py_CLEAR(exc);
6919
        }
6920
        goto error;
6921
    }
6922
6923
    /* Extend a wchar_t* buffer */
6924
    Py_ssize_t n = *bufsize;   /* Get the current length */
6925
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6926
        PyErr_NoMemory();
6927
        goto error;
6928
    }
6929
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
6930
        goto error;
6931
    }
6932
    out = *buf + n;
6933
6934
    /* Decode the byte string character per character */
6935
    while (in < endin)
6936
    {
6937
        /* Decode a character */
6938
        insize = 1;
6939
        do
6940
        {
6941
            outsize = MultiByteToWideChar(code_page, flags,
6942
                                          in, insize,
6943
                                          buffer, Py_ARRAY_LENGTH(buffer));
6944
            if (outsize > 0)
6945
                break;
6946
            err = GetLastError();
6947
            if (err == ERROR_INVALID_FLAGS && flags) {
6948
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
6949
                flags = 0;
6950
                continue;
6951
            }
6952
            if (err != ERROR_NO_UNICODE_TRANSLATION
6953
                && err != ERROR_INSUFFICIENT_BUFFER)
6954
            {
6955
                PyErr_SetFromWindowsErr(0);
6956
                goto error;
6957
            }
6958
            insize++;
6959
        }
6960
        /* 4=maximum length of a UTF-8 sequence */
6961
        while (insize <= 4 && (in + insize) <= endin);
6962
6963
        if (outsize <= 0) {
6964
            Py_ssize_t startinpos, endinpos, outpos;
6965
6966
            /* last character in partial decode? */
6967
            if (in + insize >= endin && !final)
6968
                break;
6969
6970
            startinpos = in - startin;
6971
            endinpos = startinpos + 1;
6972
            outpos = out - *buf;
6973
            if (unicode_decode_call_errorhandler_wchar(
6974
                    errors, &errorHandler,
6975
                    encoding, reason,
6976
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6977
                    buf, bufsize, &outpos))
6978
            {
6979
                goto error;
6980
            }
6981
            out = *buf + outpos;
6982
        }
6983
        else {
6984
            in += insize;
6985
            memcpy(out, buffer, outsize * sizeof(wchar_t));
6986
            out += outsize;
6987
        }
6988
    }
6989
6990
    /* Shrink the buffer */
6991
    assert(out - *buf <= *bufsize);
6992
    *bufsize = out - *buf;
6993
    /* (in - startin) <= size and size is an int */
6994
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
6995
6996
error:
6997
    Py_XDECREF(encoding_obj);
6998
    Py_XDECREF(errorHandler);
6999
    Py_XDECREF(exc);
7000
    return ret;
7001
}
7002
7003
static PyObject *
7004
decode_code_page_stateful(int code_page,
7005
                          const char *s, Py_ssize_t size,
7006
                          const char *errors, Py_ssize_t *consumed)
7007
{
7008
    wchar_t *buf = NULL;
7009
    Py_ssize_t bufsize = 0;
7010
    int chunk_size, final, converted, done;
7011
7012
    if (code_page < 0) {
7013
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7014
        return NULL;
7015
    }
7016
    if (size < 0) {
7017
        PyErr_BadInternalCall();
7018
        return NULL;
7019
    }
7020
7021
    if (consumed)
7022
        *consumed = 0;
7023
7024
    do
7025
    {
7026
#ifdef NEED_RETRY
7027
        if (size > DECODING_CHUNK_SIZE) {
7028
            chunk_size = DECODING_CHUNK_SIZE;
7029
            final = 0;
7030
            done = 0;
7031
        }
7032
        else
7033
#endif
7034
        {
7035
            chunk_size = (int)size;
7036
            final = (consumed == NULL);
7037
            done = 1;
7038
        }
7039
7040
        if (chunk_size == 0 && done) {
7041
            if (buf != NULL)
7042
                break;
7043
            _Py_RETURN_UNICODE_EMPTY();
7044
        }
7045
7046
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7047
                                            s, chunk_size);
7048
        if (converted == -2)
7049
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7050
                                                s, chunk_size,
7051
                                                errors, final);
7052
        assert(converted != 0 || done);
7053
7054
        if (converted < 0) {
7055
            PyMem_Free(buf);
7056
            return NULL;
7057
        }
7058
7059
        if (consumed)
7060
            *consumed += converted;
7061
7062
        s += converted;
7063
        size -= converted;
7064
    } while (!done);
7065
7066
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7067
    PyMem_Free(buf);
7068
    return v;
7069
}
7070
7071
PyObject *
7072
PyUnicode_DecodeCodePageStateful(int code_page,
7073
                                 const char *s,
7074
                                 Py_ssize_t size,
7075
                                 const char *errors,
7076
                                 Py_ssize_t *consumed)
7077
{
7078
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7079
}
7080
7081
PyObject *
7082
PyUnicode_DecodeMBCSStateful(const char *s,
7083
                             Py_ssize_t size,
7084
                             const char *errors,
7085
                             Py_ssize_t *consumed)
7086
{
7087
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7088
}
7089
7090
PyObject *
7091
PyUnicode_DecodeMBCS(const char *s,
7092
                     Py_ssize_t size,
7093
                     const char *errors)
7094
{
7095
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7096
}
7097
7098
static DWORD
7099
encode_code_page_flags(UINT code_page, const char *errors)
7100
{
7101
    if (code_page == CP_UTF8) {
7102
        return WC_ERR_INVALID_CHARS;
7103
    }
7104
    else if (code_page == CP_UTF7) {
7105
        /* CP_UTF7 only supports flags=0 */
7106
        return 0;
7107
    }
7108
    else {
7109
        if (errors != NULL && strcmp(errors, "replace") == 0)
7110
            return 0;
7111
        else
7112
            return WC_NO_BEST_FIT_CHARS;
7113
    }
7114
}
7115
7116
/*
7117
 * Encode a Unicode string to a Windows code page into a byte string in strict
7118
 * mode.
7119
 *
7120
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7121
 * an OSError and returns -1 on other error.
7122
 */
7123
static int
7124
encode_code_page_strict(UINT code_page, PyObject **outbytes,
7125
                        PyObject *unicode, Py_ssize_t offset, int len,
7126
                        const char* errors)
7127
{
7128
    BOOL usedDefaultChar = FALSE;
7129
    BOOL *pusedDefaultChar = &usedDefaultChar;
7130
    int outsize;
7131
    wchar_t *p;
7132
    Py_ssize_t size;
7133
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7134
    char *out;
7135
    /* Create a substring so that we can get the UTF-16 representation
7136
       of just the slice under consideration. */
7137
    PyObject *substring;
7138
    int ret = -1;
7139
7140
    assert(len > 0);
7141
7142
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7143
        pusedDefaultChar = &usedDefaultChar;
7144
    else
7145
        pusedDefaultChar = NULL;
7146
7147
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7148
    if (substring == NULL)
7149
        return -1;
7150
    p = PyUnicode_AsWideCharString(substring, &size);
7151
    Py_CLEAR(substring);
7152
    if (p == NULL) {
7153
        return -1;
7154
    }
7155
    assert(size <= INT_MAX);
7156
7157
    /* First get the size of the result */
7158
    outsize = WideCharToMultiByte(code_page, flags,
7159
                                  p, (int)size,
7160
                                  NULL, 0,
7161
                                  NULL, pusedDefaultChar);
7162
    if (outsize <= 0)
7163
        goto error;
7164
    /* If we used a default char, then we failed! */
7165
    if (pusedDefaultChar && *pusedDefaultChar) {
7166
        ret = -2;
7167
        goto done;
7168
    }
7169
7170
    if (*outbytes == NULL) {
7171
        /* Create string object */
7172
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7173
        if (*outbytes == NULL) {
7174
            goto done;
7175
        }
7176
        out = PyBytes_AS_STRING(*outbytes);
7177
    }
7178
    else {
7179
        /* Extend string object */
7180
        const Py_ssize_t n = PyBytes_Size(*outbytes);
7181
        if (outsize > PY_SSIZE_T_MAX - n) {
7182
            PyErr_NoMemory();
7183
            goto done;
7184
        }
7185
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7186
            goto done;
7187
        }
7188
        out = PyBytes_AS_STRING(*outbytes) + n;
7189
    }
7190
7191
    /* Do the conversion */
7192
    outsize = WideCharToMultiByte(code_page, flags,
7193
                                  p, (int)size,
7194
                                  out, outsize,
7195
                                  NULL, pusedDefaultChar);
7196
    if (outsize <= 0)
7197
        goto error;
7198
    if (pusedDefaultChar && *pusedDefaultChar) {
7199
        ret = -2;
7200
        goto done;
7201
    }
7202
    ret = 0;
7203
7204
done:
7205
    PyMem_Free(p);
7206
    return ret;
7207
7208
error:
7209
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7210
        ret = -2;
7211
        goto done;
7212
    }
7213
    PyErr_SetFromWindowsErr(0);
7214
    goto done;
7215
}
7216
7217
/*
7218
 * Encode a Unicode string to a Windows code page into a byte string using an
7219
 * error handler.
7220
 *
7221
 * Returns consumed characters if succeed, or raise an OSError and returns
7222
 * -1 on other error.
7223
 */
7224
static int
7225
encode_code_page_errors(UINT code_page, PyObject **outbytes,
7226
                        PyObject *unicode, Py_ssize_t unicode_offset,
7227
                        Py_ssize_t insize, const char* errors)
7228
{
7229
    const DWORD flags = encode_code_page_flags(code_page, errors);
7230
    Py_ssize_t pos = unicode_offset;
7231
    Py_ssize_t endin = unicode_offset + insize;
7232
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7233
       2000 English version of the message. */
7234
    const char *reason = "invalid character";
7235
    /* 4=maximum length of a UTF-8 sequence */
7236
    char buffer[4];
7237
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7238
    Py_ssize_t outsize;
7239
    char *out;
7240
    PyObject *errorHandler = NULL;
7241
    PyObject *exc = NULL;
7242
    PyObject *encoding_obj = NULL;
7243
    const char *encoding;
7244
    Py_ssize_t newpos, newoutsize;
7245
    PyObject *rep;
7246
    int ret = -1;
7247
7248
    assert(insize > 0);
7249
7250
    encoding = code_page_name(code_page, &encoding_obj);
7251
    if (encoding == NULL)
7252
        return -1;
7253
7254
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7255
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7256
           then we raise a UnicodeEncodeError. */
7257
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7258
        if (exc != NULL) {
7259
            PyCodec_StrictErrors(exc);
7260
            Py_DECREF(exc);
7261
        }
7262
        Py_XDECREF(encoding_obj);
7263
        return -1;
7264
    }
7265
7266
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7267
        pusedDefaultChar = &usedDefaultChar;
7268
    else
7269
        pusedDefaultChar = NULL;
7270
7271
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7272
        PyErr_NoMemory();
7273
        goto error;
7274
    }
7275
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7276
7277
    if (*outbytes == NULL) {
7278
        /* Create string object */
7279
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7280
        if (*outbytes == NULL)
7281
            goto error;
7282
        out = PyBytes_AS_STRING(*outbytes);
7283
    }
7284
    else {
7285
        /* Extend string object */
7286
        Py_ssize_t n = PyBytes_Size(*outbytes);
7287
        if (n > PY_SSIZE_T_MAX - outsize) {
7288
            PyErr_NoMemory();
7289
            goto error;
7290
        }
7291
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7292
            goto error;
7293
        out = PyBytes_AS_STRING(*outbytes) + n;
7294
    }
7295
7296
    /* Encode the string character per character */
7297
    while (pos < endin)
7298
    {
7299
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7300
        wchar_t chars[2];
7301
        int charsize;
7302
        if (ch < 0x10000) {
7303
            chars[0] = (wchar_t)ch;
7304
            charsize = 1;
7305
        }
7306
        else {
7307
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7308
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7309
            charsize = 2;
7310
        }
7311
7312
        outsize = WideCharToMultiByte(code_page, flags,
7313
                                      chars, charsize,
7314
                                      buffer, Py_ARRAY_LENGTH(buffer),
7315
                                      NULL, pusedDefaultChar);
7316
        if (outsize > 0) {
7317
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7318
            {
7319
                pos++;
7320
                memcpy(out, buffer, outsize);
7321
                out += outsize;
7322
                continue;
7323
            }
7324
        }
7325
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7326
            PyErr_SetFromWindowsErr(0);
7327
            goto error;
7328
        }
7329
7330
        rep = unicode_encode_call_errorhandler(
7331
                  errors, &errorHandler, encoding, reason,
7332
                  unicode, &exc,
7333
                  pos, pos + 1, &newpos);
7334
        if (rep == NULL)
7335
            goto error;
7336
7337
        Py_ssize_t morebytes = pos - newpos;
7338
        if (PyBytes_Check(rep)) {
7339
            outsize = PyBytes_GET_SIZE(rep);
7340
            morebytes += outsize;
7341
            if (morebytes > 0) {
7342
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7343
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7344
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7345
                    Py_DECREF(rep);
7346
                    goto error;
7347
                }
7348
                out = PyBytes_AS_STRING(*outbytes) + offset;
7349
            }
7350
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7351
            out += outsize;
7352
        }
7353
        else {
7354
            Py_ssize_t i;
7355
            int kind;
7356
            const void *data;
7357
7358
            outsize = PyUnicode_GET_LENGTH(rep);
7359
            morebytes += outsize;
7360
            if (morebytes > 0) {
7361
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7362
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7363
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7364
                    Py_DECREF(rep);
7365
                    goto error;
7366
                }
7367
                out = PyBytes_AS_STRING(*outbytes) + offset;
7368
            }
7369
            kind = PyUnicode_KIND(rep);
7370
            data = PyUnicode_DATA(rep);
7371
            for (i=0; i < outsize; i++) {
7372
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7373
                if (ch > 127) {
7374
                    raise_encode_exception(&exc,
7375
                        encoding, unicode,
7376
                        pos, pos + 1,
7377
                        "unable to encode error handler result to ASCII");
7378
                    Py_DECREF(rep);
7379
                    goto error;
7380
                }
7381
                *out = (unsigned char)ch;
7382
                out++;
7383
            }
7384
        }
7385
        pos = newpos;
7386
        Py_DECREF(rep);
7387
    }
7388
    /* write a NUL byte */
7389
    *out = 0;
7390
    outsize = out - PyBytes_AS_STRING(*outbytes);
7391
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7392
    if (_PyBytes_Resize(outbytes, outsize) < 0)
7393
        goto error;
7394
    ret = 0;
7395
7396
error:
7397
    Py_XDECREF(encoding_obj);
7398
    Py_XDECREF(errorHandler);
7399
    Py_XDECREF(exc);
7400
    return ret;
7401
}
7402
7403
/* Encode the str object `unicode` to the Windows code page `code_page`.
 *
 * The string is processed in chunks.  Each chunk goes through
 * encode_code_page_strict() first; a -2 result sends the same chunk to
 * encode_code_page_errors() with the `errors` handler.
 *
 * Returns a new bytes object, or NULL with an exception set. */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    PyObject *outbytes = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    for (;;) {
        int chunk_len, last_chunk, status;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        status = encode_code_page_strict(code_page, &outbytes,
                                         unicode, offset, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &outbytes,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            Py_XDECREF(outbytes);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last_chunk) {
            break;
        }
    }

    return outbytes;
}
7461
7462
/* Public wrapper: forwards every argument unchanged to encode_code_page()
   and returns its result. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyObject *encoded = encode_code_page(code_page, unicode, errors);
    return encoded;
}
7469
7470
/* Encode `unicode` with the CP_ACP code page, passing NULL as the error
   handler name. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
7475
7476
#undef NEED_RETRY
7477
7478
#endif /* MS_WINDOWS */
7479
7480
/* --- Character Mapping Codec -------------------------------------------- */
7481
7482
static int
7483
charmap_decode_string(const char *s,
7484
                      Py_ssize_t size,
7485
                      PyObject *mapping,
7486
                      const char *errors,
7487
                      _PyUnicodeWriter *writer)
7488
{
7489
    const char *starts = s;
7490
    const char *e;
7491
    Py_ssize_t startinpos, endinpos;
7492
    PyObject *errorHandler = NULL, *exc = NULL;
7493
    Py_ssize_t maplen;
7494
    int mapkind;
7495
    const void *mapdata;
7496
    Py_UCS4 x;
7497
    unsigned char ch;
7498
7499
    maplen = PyUnicode_GET_LENGTH(mapping);
7500
    mapdata = PyUnicode_DATA(mapping);
7501
    mapkind = PyUnicode_KIND(mapping);
7502
7503
    e = s + size;
7504
7505
    if (mapkind == PyUnicode_1BYTE_KIND && 
maplen >= 256118
) {
  Branch (7505:9): [True: 118, False: 6.54k]
  Branch (7505:44): [True: 112, False: 6]
7506
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7507
         * is disabled in encoding aliases, latin1 is preferred because
7508
         * its implementation is faster. */
7509
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
7510
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7511
        Py_UCS4 maxchar = writer->maxchar;
7512
7513
        assert (writer->kind == PyUnicode_1BYTE_KIND);
7514
        while (s < e) {
  Branch (7514:16): [True: 12.7k, False: 112]
7515
            ch = *s;
7516
            x = mapdata_ucs1[ch];
7517
            if (x > maxchar) {
  Branch (7517:17): [True: 4, False: 12.6k]
7518
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
  Branch (7518:21): [True: 0, False: 4]
7519
                    goto onError;
7520
                maxchar = writer->maxchar;
7521
                outdata = (Py_UCS1 *)writer->data;
7522
            }
7523
            outdata[writer->pos] = x;
7524
            writer->pos++;
7525
            ++s;
7526
        }
7527
        return 0;
7528
    }
7529
7530
    
while (6.55k
s < e) {
  Branch (7530:12): [True: 7.73k, False: 35]
7531
        if (mapkind == PyUnicode_2BYTE_KIND && 
maplen >= 2567.46k
) {
  Branch (7531:13): [True: 7.46k, False: 274]
  Branch (7531:48): [True: 7.44k, False: 12]
7532
            int outkind = writer->kind;
7533
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
7534
            if (outkind == PyUnicode_1BYTE_KIND) {
  Branch (7534:17): [True: 6.71k, False: 737]
7535
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7536
                Py_UCS4 maxchar = writer->maxchar;
7537
                while (s < e) {
  Branch (7537:24): [True: 458k, False: 6.33k]
7538
                    ch = *s;
7539
                    x = mapdata_ucs2[ch];
7540
                    if (x > maxchar)
  Branch (7540:25): [True: 376, False: 457k]
7541
                        goto Error;
7542
                    outdata[writer->pos] = x;
7543
                    writer->pos++;
7544
                    ++s;
7545
                }
7546
                break;
7547
            }
7548
            else if (outkind == PyUnicode_2BYTE_KIND) {
  Branch (7548:22): [True: 737, False: 0]
7549
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7550
                while (s < e) {
  Branch (7550:24): [True: 25.5k, False: 183]
7551
                    ch = *s;
7552
                    x = mapdata_ucs2[ch];
7553
                    if (x == 0xFFFE)
  Branch (7553:25): [True: 554, False: 25.0k]
7554
                        goto Error;
7555
                    outdata[writer->pos] = x;
7556
                    writer->pos++;
7557
                    ++s;
7558
                }
7559
                break;
7560
            }
7561
        }
7562
        ch = *s;
7563
7564
        if (ch < maplen)
  Branch (7564:13): [True: 26, False: 260]
7565
            x = PyUnicode_READ(mapkind, mapdata, ch);
7566
        else
7567
            x = 0xfffe; /* invalid value */
7568
Error:
7569
        if (x == 0xfffe)
  Branch (7569:13): [True: 876, False: 340]
7570
        {
7571
            /* undefined mapping */
7572
            startinpos = s-starts;
7573
            endinpos = startinpos+1;
7574
            if (unicode_decode_call_errorhandler_writer(
  Branch (7574:17): [True: 2, False: 874]
7575
                    errors, &errorHandler,
7576
                    "charmap", "character maps to <undefined>",
7577
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7578
                    writer)) {
7579
                goto onError;
7580
            }
7581
            continue;
7582
        }
7583
7584
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
  Branch (7584:13): [True: 0, False: 340]
7585
            goto onError;
7586
        ++s;
7587
    }
7588
    Py_XDECREF(errorHandler);
7589
    Py_XDECREF(exc);
7590
    return 0;
7591
7592
onError:
7593
    Py_XDECREF(errorHandler);
7594
    Py_XDECREF(exc);
7595
    return -1;
7596
}
7597
7598
static int
7599
charmap_decode_mapping(const char *s,
7600
                       Py_ssize_t size,
7601
                       PyObject *mapping,
7602
                       const char *errors,
7603
                       _PyUnicodeWriter *writer)
7604
{
7605
    const char *starts = s;
7606
    const char *e;
7607
    Py_ssize_t startinpos, endinpos;
7608
    PyObject *errorHandler = NULL, *exc = NULL;
7609
    unsigned char ch;
7610
    PyObject *key, *item = NULL;
7611
7612
    e = s + size;
7613
7614
    while (s < e) {
  Branch (7614:12): [True: 387, False: 33]
7615
        ch = *s;
7616
7617
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7618
        key = PyLong_FromLong((long)ch);
7619
        if (key == NULL)
  Branch (7619:13): [True: 0, False: 387]
7620
            goto onError;
7621
7622
        item = PyObject_GetItem(mapping, key);
7623
        Py_DECREF(key);
7624
        if (item == NULL) {
  Branch (7624:13): [True: 267, False: 120]
7625
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  Branch (7625:17): [True: 266, False: 1]
7626
                /* No mapping found means: mapping is undefined. */
7627
                PyErr_Clear();
7628
                goto Undefined;
7629
            } else
7630
                goto onError;
7631
        }
7632
7633
        /* Apply mapping */
7634
        if (item == Py_None)
  Branch (7634:13): [True: 6, False: 114]
7635
            goto Undefined;
7636
        if (PyLong_Check(item)) {
7637
            long value = PyLong_AS_LONG(item);
7638
            if (value == 0xFFFE)
  Branch (7638:17): [True: 4, False: 55]
7639
                goto Undefined;
7640
            if (value < 0 || 
value > 54
MAX_UNICODE54
) {
  Branch (7640:17): [True: 1, False: 54]
  Branch (7640:30): [True: 3, False: 51]
7641
                PyErr_Format(PyExc_TypeError,
7642
                             "character mapping must be in range(0x%x)",
7643
                             (unsigned long)MAX_UNICODE + 1);
7644
                goto onError;
7645
            }
7646
7647
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
  Branch (7647:17): [True: 0, False: 51]
7648
                goto onError;
7649
        }
7650
        else if (PyUnicode_Check(item)) {
7651
            if (PyUnicode_GET_LENGTH(item) == 1) {
  Branch (7651:17): [True: 39, False: 16]
7652
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7653
                if (value == 0xFFFE)
  Branch (7653:21): [True: 4, False: 35]
7654
                    goto Undefined;
7655
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
  Branch (7655:21): [True: 0, False: 35]
7656
                    goto onError;
7657
            }
7658
            else {
7659
                writer->overallocate = 1;
7660
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
  Branch (7660:21): [True: 0, False: 16]
7661
                    goto onError;
7662
            }
7663
        }
7664
        else {
7665
            /* wrong return value */
7666
            PyErr_SetString(PyExc_TypeError,
7667
                            "character mapping must return integer, None or str");
7668
            goto onError;
7669
        }
7670
        Py_CLEAR(item);
7671
        ++s;
7672
        continue;
7673
7674
Undefined:
7675
        /* undefined mapping */
7676
        Py_CLEAR(item);
7677
        startinpos = s-starts;
7678
        endinpos = startinpos+1;
7679
        if (unicode_decode_call_errorhandler_writer(
  Branch (7679:13): [True: 7, False: 273]
7680
                errors, &errorHandler,
7681
                "charmap", "character maps to <undefined>",
7682
                &starts, &e, &startinpos, &endinpos, &exc, &s,
7683
                writer)) {
7684
            goto onError;
7685
        }
7686
    }
7687
    Py_XDECREF(errorHandler);
7688
    Py_XDECREF(exc);
7689
    return 0;
7690
7691
onError:
7692
    Py_XDECREF(item);
7693
    Py_XDECREF(errorHandler);
7694
    Py_XDECREF(exc);
7695
    return -1;
7696
}
7697
7698
PyObject *
7699
PyUnicode_DecodeCharmap(const char *s,
7700
                        Py_ssize_t size,
7701
                        PyObject *mapping,
7702
                        const char *errors)
7703
{
7704
    _PyUnicodeWriter writer;
7705
7706
    /* Default to Latin-1 */
7707
    if (mapping == NULL)
  Branch (7707:9): [True: 67, False: 7.54k]
7708
        return PyUnicode_DecodeLatin1(s, size, errors);
7709
7710
    if (size == 0)
  Branch (7710:9): [True: 832, False: 6.71k]
7711
        _Py_RETURN_UNICODE_EMPTY();
7712
    _PyUnicodeWriter_Init(&writer);
7713
    writer.min_length = size;
7714
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  Branch (7714:9): [True: 0, False: 6.71k]
7715
        goto onError;
7716
7717
    if (PyUnicode_CheckExact(mapping)) {
7718
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
  Branch (7718:13): [True: 2, False: 6.66k]
7719
            goto onError;
7720
    }
7721
    else {
7722
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
  Branch (7722:13): [True: 12, False: 33]
7723
            goto onError;
7724
    }
7725
    return _PyUnicodeWriter_Finish(&writer);
7726
7727
  onError:
7728
    _PyUnicodeWriter_Dealloc(&writer);
7729
    return NULL;
7730
}
7731
7732
/* Charmap encoding: the lookup table */
7733
7734
/*[clinic input]
7735
class EncodingMap "struct encoding_map *" "&EncodingMapType"
7736
[clinic start generated code]*/
7737
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
7738
7739
struct encoding_map {
7740
    PyObject_HEAD
7741
    unsigned char level1[32];
7742
    int count2, count3;
7743
    unsigned char level23[1];
7744
};
7745
7746
/*[clinic input]
7747
EncodingMap.size
7748
7749
Return the size (in bytes) of this object.
7750
[clinic start generated code]*/
7751
7752
static PyObject *
7753
EncodingMap_size_impl(struct encoding_map *self)
7754
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
7755
{
7756
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
7757
                           128*self->count3);
7758
}
7759
7760
static PyMethodDef encoding_map_methods[] = {
7761
    ENCODINGMAP_SIZE_METHODDEF
7762
    {NULL, NULL}
7763
};
7764
7765
static PyTypeObject EncodingMapType = {
7766
    PyVarObject_HEAD_INIT(NULL, 0)
7767
    .tp_name = "EncodingMap",
7768
    .tp_basicsize = sizeof(struct encoding_map),
7769
    /* methods */
7770
    .tp_flags = Py_TPFLAGS_DEFAULT,
7771
    .tp_methods = encoding_map_methods,
7772
};
7773
7774
/* Build an encoding map from a decoding table `string`, in which the
 * character at index i is the decoding of byte i.  At most the first 256
 * characters are used.
 *
 * Returns an EncodingMap object (a three-level trie) when possible, and a
 * dict {code point: byte value} otherwise.  Returns NULL on error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int kind;
    const void *data;
    Py_ssize_t length;
    int i;

    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) == 0) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }
    /* First pass: count how many level-2 and level-3 tables are needed. */
    for (i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int l1, l2;

        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    /* 0xFF is the "unused" marker, so it cannot be a table index. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        PyObject *key, *value;

        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (key == NULL || value == NULL
                || PyDict_SetItem(dict, key, value) == -1)
            {
                Py_XDECREF(key);
                Py_XDECREF(value);
                Py_DECREF(dict);
                return NULL;
            }
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return dict;
    }

    /* Create a three-level trie */
    result = PyObject_Malloc(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: hand out level-3 tables again and store the byte values. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int o1, o2, o3, i2, i3;

        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        o1 = ch >> 11;
        o2 = (ch >> 7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF) {
            mlevel2[i2] = count3++;
        }
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
7885
7886
static int
7887
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7888
{
7889
    struct encoding_map *map = (struct encoding_map*)mapping;
7890
    int l1 = c>>11;
7891
    int l2 = (c>>7) & 0xF;
7892
    int l3 = c & 0x7F;
7893
    int i;
7894
7895
    if (c > 0xFFFF)
  Branch (7895:9): [True: 10, False: 220k]
7896
        return -1;
7897
    if (c == 0)
  Branch (7897:9): [True: 39, False: 220k]
7898
        return 0;
7899
    /* level 1*/
7900
    i = map->level1[l1];
7901
    if (i == 0xFF) {
  Branch (7901:9): [True: 12.0k, False: 208k]
7902
        return -1;
7903
    }
7904
    /* level 2*/
7905
    i = map->level23[16*i+l2];
7906
    if (i == 0xFF) {
  Branch (7906:9): [True: 9, False: 208k]
7907
        return -1;
7908
    }
7909
    /* level 3 */
7910
    i = map->level23[16*map->count2 + 128*i + l3];
7911
    if (i == 0) {
  Branch (7911:9): [True: 14, False: 208k]
7912
        return -1;
7913
    }
7914
    return i;
7915
}
7916
7917
/* Lookup the character ch in the mapping. If the character
7918
   can't be found, Py_None is returned (or NULL, if another
7919
   error occurred). */
7920
static PyObject *
7921
charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7922
{
7923
    PyObject *w = PyLong_FromLong((long)c);
7924
    PyObject *x;
7925
7926
    if (w == NULL)
  Branch (7926:9): [True: 0, False: 28.1k]
7927
        return NULL;
7928
    x = PyObject_GetItem(mapping, w);
7929
    Py_DECREF(w);
7930
    if (x == NULL) {
  Branch (7930:9): [True: 42, False: 28.0k]
7931
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  Branch (7931:13): [True: 36, False: 6]
7932
            /* No mapping found means: mapping is undefined. */
7933
            PyErr_Clear();
7934
            Py_RETURN_NONE;
7935
        } else
7936
            return NULL;
7937
    }
7938
    else if (x == Py_None)
  Branch (7938:14): [True: 6, False: 28.0k]
7939
        return x;
7940
    else if (PyLong_Check(x)) {
7941
        long value = PyLong_AS_LONG(x);
7942
        if (value < 0 || value > 255) {
  Branch (7942:13): [True: 0, False: 28.0k]
  Branch (7942:26): [True: 6, False: 28.0k]
7943
            PyErr_SetString(PyExc_TypeError,
7944
                            "character mapping must be in range(256)");
7945
            Py_DECREF(x);
7946
            return NULL;
7947
        }
7948
        return x;
7949
    }
7950
    else if (PyBytes_Check(x))
7951
        return x;
7952
    else {
7953
        /* wrong return value */
7954
        PyErr_Format(PyExc_TypeError,
7955
                     "character mapping must return integer, bytes or None, not %.400s",
7956
                     Py_TYPE(x)->tp_name);
7957
        Py_DECREF(x);
7958
        return NULL;
7959
    }
7960
}
7961
7962
static int
7963
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7964
{
7965
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7966
    /* exponentially overallocate to minimize reallocations */
7967
    if (requiredsize < 2*outsize)
  Branch (7967:9): [True: 74, False: 0]
7968
        requiredsize = 2*outsize;
7969
    if (_PyBytes_Resize(outobj, requiredsize))
  Branch (7969:9): [True: 0, False: 74]
7970
        return -1;
7971
    return 0;
7972
}
7973
7974
/* Outcome of an attempt to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,
    enc_FAILED = 1,
    enc_EXCEPTION = 2
} charmapencode_result;
7977
/* lookup the character, put the result in the output string and adjust
7978
   various state variables. Resize the output bytes object if not enough
7979
   space is available. Return a new reference to the object that
7980
   was put in the output buffer, or Py_None, if the mapping was undefined
7981
   (in which case no character was written) or NULL, if a
7982
   reallocation error occurred. The caller must decref the result */
7983
static charmapencode_result
7984
charmapencode_output(Py_UCS4 c, PyObject *mapping,
7985
                     PyObject **outobj, Py_ssize_t *outpos)
7986
{
7987
    PyObject *rep;
7988
    char *outstart;
7989
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7990
7991
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
7992
        int res = encoding_map_lookup(c, mapping);
7993
        Py_ssize_t requiredsize = *outpos+1;
7994
        if (res == -1)
  Branch (7994:13): [True: 98, False: 208k]
7995
            return enc_FAILED;
7996
        if (outsize<requiredsize)
  Branch (7996:13): [True: 60, False: 208k]
7997
            if (charmapencode_resize(outobj, outpos, requiredsize))
  Branch (7997:17): [True: 0, False: 60]
7998
                return enc_EXCEPTION;
7999
        outstart = PyBytes_AS_STRING(*outobj);
8000
        outstart[(*outpos)++] = (char)res;
8001
        return enc_SUCCESS;
8002
    }
8003
8004
    rep = charmapencode_lookup(c, mapping);
8005
    if (rep==NULL)
  Branch (8005:9): [True: 13, False: 28.0k]
8006
        return enc_EXCEPTION;
8007
    else if (rep==Py_None) {
  Branch (8007:14): [True: 38, False: 28.0k]
8008
        Py_DECREF(rep);
8009
        return enc_FAILED;
8010
    } else {
8011
        if (PyLong_Check(rep)) {
8012
            Py_ssize_t requiredsize = *outpos+1;
8013
            if (outsize<requiredsize)
  Branch (8013:17): [True: 8, False: 28.0k]
8014
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
  Branch (8014:21): [True: 0, False: 8]
8015
                    Py_DECREF(rep);
8016
                    return enc_EXCEPTION;
8017
                }
8018
            outstart = PyBytes_AS_STRING(*outobj);
8019
            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8020
        }
8021
        else {
8022
            const char *repchars = PyBytes_AS_STRING(rep);
8023
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8024
            Py_ssize_t requiredsize = *outpos+repsize;
8025
            if (outsize<requiredsize)
  Branch (8025:17): [True: 6, False: 14]
8026
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
  Branch (8026:21): [True: 0, False: 6]
8027
                    Py_DECREF(rep);
8028
                    return enc_EXCEPTION;
8029
                }
8030
            outstart = PyBytes_AS_STRING(*outobj);
8031
            memcpy(outstart + *outpos, repchars, repsize);
8032
            *outpos += repsize;
8033
        }
8034
    }
8035
    Py_DECREF(rep);
8036
    return enc_SUCCESS;
8037
}
8038
8039
/* handle an error in PyUnicode_EncodeCharmap
8040
   Return 0 on success, -1 on error */
8041
static int
8042
charmap_encoding_error(
8043
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8044
    PyObject **exceptionObject,
8045
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8046
    PyObject **res, Py_ssize_t *respos)
8047
{
8048
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8049
    Py_ssize_t size, repsize;
8050
    Py_ssize_t newpos;
8051
    int kind;
8052
    const void *data;
8053
    Py_ssize_t index;
8054
    /* startpos for collecting unencodable chars */
8055
    Py_ssize_t collstartpos = *inpos;
8056
    Py_ssize_t collendpos = *inpos+1;
8057
    Py_ssize_t collpos;
8058
    const char *encoding = "charmap";
8059
    const char *reason = "character maps to <undefined>";
8060
    charmapencode_result x;
8061
    Py_UCS4 ch;
8062
    int val;
8063
8064
    size = PyUnicode_GET_LENGTH(unicode);
8065
    /* find all unencodable characters */
8066
    while (collendpos < size) {
  Branch (8066:12): [True: 12.0k, False: 90]
8067
        PyObject *rep;
8068
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8069
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8070
            val = encoding_map_lookup(ch, mapping);
8071
            if (val != -1)
  Branch (8071:17): [True: 15, False: 12.0k]
8072
                break;
8073
            ++collendpos;
8074
            continue;
8075
        }
8076
8077
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8078
        rep = charmapencode_lookup(ch, mapping);
8079
        if (rep==NULL)
  Branch (8079:13): [True: 0, False: 28]
8080
            return -1;
8081
        else if (rep!=Py_None) {
  Branch (8081:18): [True: 24, False: 4]
8082
            Py_DECREF(rep);
8083
            break;
8084
        }
8085
        Py_DECREF(rep);
8086
        ++collendpos;
8087
    }
8088
    /* cache callback name lookup
8089
     * (if not done yet, i.e. it's the first error) */
8090
    if (*error_handler == _Py_ERROR_UNKNOWN)
  Branch (8090:9): [True: 62, False: 67]
8091
        *error_handler = _Py_GetErrorHandler(errors);
8092
8093
    switch (*error_handler) {
8094
    case _Py_ERROR_STRICT:
  Branch (8094:5): [True: 5, False: 124]
8095
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8096
        return -1;
8097
8098
    case _Py_ERROR_REPLACE:
  Branch (8098:5): [True: 8, False: 121]
8099
        for (collpos = collstartpos; collpos<collendpos; 
++collpos1.00k
) {
  Branch (8099:38): [True: 1.00k, False: 6]
8100
            x = charmapencode_output('?', mapping, res, respos);
8101
            if (x==enc_EXCEPTION) {
  Branch (8101:17): [True: 1, False: 1.00k]
8102
                return -1;
8103
            }
8104
            else if (x==enc_FAILED) {
  Branch (8104:22): [True: 1, False: 1.00k]
8105
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8106
                return -1;
8107
            }
8108
        }
8109
        /* fall through */
8110
    case _Py_ERROR_IGNORE:
  Branch (8110:5): [True: 5, False: 124]
8111
        *inpos = collendpos;
8112
        break;
8113
8114
    case _Py_ERROR_XMLCHARREFREPLACE:
  Branch (8114:5): [True: 9, False: 120]
8115
        /* generate replacement (temporarily (mis)uses p) */
8116
        for (collpos = collstartpos; collpos < collendpos; 
++collpos1.01k
) {
  Branch (8116:38): [True: 1.01k, False: 8]
8117
            char buffer[2+29+1+1];
8118
            char *cp;
8119
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8120
            for (cp = buffer; *cp; 
++cp8.10k
) {
  Branch (8120:31): [True: 8.10k, False: 1.01k]
8121
                x = charmapencode_output(*cp, mapping, res, respos);
8122
                if (x==enc_EXCEPTION)
  Branch (8122:21): [True: 0, False: 8.10k]
8123
                    return -1;
8124
                else if (x==enc_FAILED) {
  Branch (8124:26): [True: 1, False: 8.10k]
8125
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8126
                    return -1;
8127
                }
8128
            }
8129
        }
8130
        *inpos = collendpos;
8131
        break;
8132
8133
    default:
  Branch (8133:5): [True: 102, False: 27]
8134
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8135
                                                      encoding, reason, unicode, exceptionObject,
8136
                                                      collstartpos, collendpos, &newpos);
8137
        if (repunicode == NULL)
  Branch (8137:13): [True: 12, False: 90]
8138
            return -1;
8139
        if (PyBytes_Check(repunicode)) {
8140
            /* Directly copy bytes result to output. */
8141
            Py_ssize_t outsize = PyBytes_Size(*res);
8142
            Py_ssize_t requiredsize;
8143
            repsize = PyBytes_Size(repunicode);
8144
            requiredsize = *respos + repsize;
8145
            if (requiredsize > outsize)
  Branch (8145:17): [True: 0, False: 2]
8146
                /* Make room for all additional bytes. */
8147
                if (charmapencode_resize(res, respos, requiredsize)) {
  Branch (8147:21): [True: 0, False: 0]
8148
                    Py_DECREF(repunicode);
8149
                    return -1;
8150
                }
8151
            memcpy(PyBytes_AsString(*res) + *respos,
8152
                   PyBytes_AsString(repunicode),  repsize);
8153
            *respos += repsize;
8154
            *inpos = newpos;
8155
            Py_DECREF(repunicode);
8156
            break;
8157
        }
8158
        /* generate replacement  */
8159
        repsize = PyUnicode_GET_LENGTH(repunicode);
8160
        data = PyUnicode_DATA(repunicode);
8161
        kind = PyUnicode_KIND(repunicode);
8162
        for (index = 0; index < repsize; 
index++63.3k
) {
  Branch (8162:25): [True: 63.3k, False: 83]
8163
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8164
            x = charmapencode_output(repch, mapping, res, respos);
8165
            if (x==enc_EXCEPTION) {
  Branch (8165:17): [True: 0, False: 63.3k]
8166
                Py_DECREF(repunicode);
8167
                return -1;
8168
            }
8169
            else if (x==enc_FAILED) {
  Branch (8169:22): [True: 5, False: 63.3k]
8170
                Py_DECREF(repunicode);
8171
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8172
                return -1;
8173
            }
8174
        }
8175
        *inpos = newpos;
8176
        Py_DECREF(repunicode);
8177
    }
8178
    return 0;
8179
}
8180
8181
/* Encode the str object `unicode` to a bytes object using `mapping`.

   A NULL mapping selects Latin-1 encoding.  Characters without a mapping
   are passed to the error handler named by `errors`.
   Returns a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    const void *chars = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);
    /* output object and the read/write cursors */
    PyObject *out = NULL;
    Py_ssize_t in_idx = 0;
    Py_ssize_t out_idx = 0;
    /* error-handling state, filled in lazily on the first error */
    PyObject *handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler handler = _Py_ERROR_UNKNOWN;

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Start with one byte per character; the buffer is resized later if
       replacements need more room. */
    out = PyBytes_FromStringAndSize(NULL, length);
    if (out == NULL) {
        goto onError;
    }
    if (length == 0) {
        return out;
    }

    while (in_idx < length) {
        Py_UCS4 ch = PyUnicode_READ(kind, chars, in_idx);

        switch (charmapencode_output(ch, mapping, &out, &out_idx)) {
        case enc_EXCEPTION:
            goto onError;

        case enc_FAILED:
            /* unencodable character: the handler advances in_idx */
            if (charmap_encoding_error(unicode, &in_idx, mapping,
                                       &exc,
                                       &handler, &handler_obj, errors,
                                       &out, &out_idx)) {
                goto onError;
            }
            break;

        default:
            /* done with this character => adjust input position */
            ++in_idx;
            break;
        }
    }

    /* Shrink the result if more was allocated than was written. */
    if (out_idx < PyBytes_GET_SIZE(out)
        && _PyBytes_Resize(&out, out_idx) < 0)
    {
        goto onError;
    }

    Py_XDECREF(exc);
    Py_XDECREF(handler_obj);
    return out;

  onError:
    Py_XDECREF(out);
    Py_XDECREF(exc);
    Py_XDECREF(handler_obj);
    return NULL;
}
8249
8250
/* Encode `unicode` with `mapping` using the default error handling
   (errors == NULL).  Raises via PyErr_BadArgument() and returns NULL if
   `unicode` is not a str object or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
8260
8261
/* create or adjust a UnicodeTranslateError */
8262
static void
8263
make_translate_exception(PyObject **exceptionObject,
8264
                         PyObject *unicode,
8265
                         Py_ssize_t startpos, Py_ssize_t endpos,
8266
                         const char *reason)
8267
{
8268
    if (*exceptionObject == NULL) {
  Branch (8268:9): [True: 0, False: 0]
8269
        *exceptionObject = _PyUnicodeTranslateError_Create(
8270
            unicode, startpos, endpos, reason);
8271
    }
8272
    else {
8273
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
  Branch (8273:13): [True: 0, False: 0]
8274
            goto onError;
8275
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
  Branch (8275:13): [True: 0, False: 0]
8276
            goto onError;
8277
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
  Branch (8277:13): [True: 0, False: 0]
8278
            goto onError;
8279
        return;
8280
      onError:
8281
        Py_CLEAR(*exceptionObject);
8282
    }
8283
}
8284
8285
/* error handling callback helper:
8286
   build arguments, call the callback and check the arguments,
8287
   put the result into newpos and return the replacement string, which
8288
   has to be freed by the caller */
8289
static PyObject *
8290
unicode_translate_call_errorhandler(const char *errors,
8291
                                    PyObject **errorHandler,
8292
                                    const char *reason,
8293
                                    PyObject *unicode, PyObject **exceptionObject,
8294
                                    Py_ssize_t startpos, Py_ssize_t endpos,
8295
                                    Py_ssize_t *newpos)
8296
{
8297
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8298
8299
    Py_ssize_t i_newpos;
8300
    PyObject *restuple;
8301
    PyObject *resunicode;
8302
8303
    if (*errorHandler == NULL) {
  Branch (8303:9): [True: 0, False: 0]
8304
        *errorHandler = PyCodec_LookupError(errors);
8305
        if (*errorHandler == NULL)
  Branch (8305:13): [True: 0, False: 0]
8306
            return NULL;
8307
    }
8308
8309
    make_translate_exception(exceptionObject,
8310
                             unicode, startpos, endpos, reason);
8311
    if (*exceptionObject == NULL)
  Branch (8311:9): [True: 0, False: 0]
8312
        return NULL;
8313
8314
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8315
    if (restuple == NULL)
  Branch (8315:9): [True: 0, False: 0]
8316
        return NULL;
8317
    if (!PyTuple_Check(restuple)) {
  Branch (8317:9): [True: 0, False: 0]
8318
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
8319
        Py_DECREF(restuple);
8320
        return NULL;
8321
    }
8322
    if (!PyArg_ParseTuple(restuple, argparse,
  Branch (8322:9): [True: 0, False: 0]
8323
                          &resunicode, &i_newpos)) {
8324
        Py_DECREF(restuple);
8325
        return NULL;
8326
    }
8327
    if (i_newpos<0)
  Branch (8327:9): [True: 0, False: 0]
8328
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8329
    else
8330
        *newpos = i_newpos;
8331
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
  Branch (8331:9): [True: 0, False: 0]
  Branch (8331:22): [True: 0, False: 0]
8332
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8333
        Py_DECREF(restuple);
8334
        return NULL;
8335
    }
8336
    Py_INCREF(resunicode);
8337
    Py_DECREF(restuple);
8338
    return resunicode;
8339
}
8340
8341
/* Lookup the character ch in the mapping and put the result in result,
8342
   which must be decrefed by the caller.
8343
   Return 0 on success, -1 on error */
8344
static int
8345
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8346
{
8347
    PyObject *w = PyLong_FromLong((long)c);
8348
    PyObject *x;
8349
8350
    if (w == NULL)
  Branch (8350:9): [True: 0, False: 177k]
8351
        return -1;
8352
    x = PyObject_GetItem(mapping, w);
8353
    Py_DECREF(w);
8354
    if (x == NULL) {
  Branch (8354:9): [True: 124k, False: 53.0k]
8355
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  Branch (8355:13): [True: 124k, False: 0]
8356
            /* No mapping found means: use 1:1 mapping. */
8357
            PyErr_Clear();
8358
            *result = NULL;
8359
            return 0;
8360
        } else
8361
            return -1;
8362
    }
8363
    else if (x == Py_None) {
  Branch (8363:14): [True: 19, False: 53.0k]
8364
        *result = x;
8365
        return 0;
8366
    }
8367
    else if (PyLong_Check(x)) {
8368
        long value = PyLong_AS_LONG(x);
8369
        if (value < 0 || value > MAX_UNICODE) {
  Branch (8369:13): [True: 0, False: 2.61k]
  Branch (8369:26): [True: 5, False: 2.60k]
8370
            PyErr_Format(PyExc_ValueError,
8371
                         "character mapping must be in range(0x%x)",
8372
                         MAX_UNICODE+1);
8373
            Py_DECREF(x);
8374
            return -1;
8375
        }
8376
        *result = x;
8377
        return 0;
8378
    }
8379
    else if (PyUnicode_Check(x)) {
8380
        *result = x;
8381
        return 0;
8382
    }
8383
    else {
8384
        /* wrong return value */
8385
        PyErr_SetString(PyExc_TypeError,
8386
                        "character mapping must return integer, None or str");
8387
        Py_DECREF(x);
8388
        return -1;
8389
    }
8390
}
8391
8392
/* lookup the character, write the result into the writer.
8393
   Return 1 if the result was written into the writer, return 0 if the mapping
8394
   was undefined, raise an exception return -1 on error. */
8395
static int
8396
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8397
                        _PyUnicodeWriter *writer)
8398
{
8399
    PyObject *item;
8400
8401
    if (charmaptranslate_lookup(ch, mapping, &item))
  Branch (8401:9): [True: 5, False: 105k]
8402
        return -1;
8403
8404
    if (item == NULL) {
  Branch (8404:9): [True: 67.0k, False: 38.1k]
8405
        /* not found => default to 1:1 mapping */
8406
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
  Branch (8406:13): [True: 0, False: 67.0k]
8407
            return -1;
8408
        }
8409
        return 1;
8410
    }
8411
8412
    if (item == Py_None) {
  Branch (8412:9): [True: 5, False: 38.1k]
8413
        Py_DECREF(item);
8414
        return 0;
8415
    }
8416
8417
    if (PyLong_Check(item)) {
8418
        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8419
        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8420
           used it */
8421
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
  Branch (8421:13): [True: 0, False: 31]
8422
            Py_DECREF(item);
8423
            return -1;
8424
        }
8425
        Py_DECREF(item);
8426
        return 1;
8427
    }
8428
8429
    if (!PyUnicode_Check(item)) {
  Branch (8429:9): [True: 0, False: 38.0k]
8430
        Py_DECREF(item);
8431
        return -1;
8432
    }
8433
8434
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
  Branch (8434:9): [True: 0, False: 38.0k]
8435
        Py_DECREF(item);
8436
        return -1;
8437
    }
8438
8439
    Py_DECREF(item);
8440
    return 1;
8441
}
8442
8443
static int
8444
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8445
                              Py_UCS1 *translate)
8446
{
8447
    PyObject *item = NULL;
8448
    int ret = 0;
8449
8450
    if (charmaptranslate_lookup(ch, mapping, &item)) {
  Branch (8450:9): [True: 1, False: 72.5k]
8451
        return -1;
8452
    }
8453
8454
    if (item == Py_None) {
  Branch (8454:9): [True: 14, False: 72.4k]
8455
        /* deletion */
8456
        translate[ch] = 0xfe;
8457
    }
8458
    else if (item == NULL) {
  Branch (8458:14): [True: 57.5k, False: 14.8k]
8459
        /* not found => default to 1:1 mapping */
8460
        translate[ch] = ch;
8461
        return 1;
8462
    }
8463
    else if (PyLong_Check(item)) {
8464
        long replace = PyLong_AS_LONG(item);
8465
        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8466
           used it */
8467
        if (127 < replace) {
  Branch (8467:13): [True: 0, False: 2.57k]
8468
            /* invalid character or character outside ASCII:
8469
               skip the fast translate */
8470
            goto exit;
8471
        }
8472
        translate[ch] = (Py_UCS1)replace;
8473
    }
8474
    else if (PyUnicode_Check(item)) {
8475
        Py_UCS4 replace;
8476
8477
        if (PyUnicode_GET_LENGTH(item) != 1)
  Branch (8477:13): [True: 11.8k, False: 494]
8478
            goto exit;
8479
8480
        replace = PyUnicode_READ_CHAR(item, 0);
8481
        if (replace > 127)
  Branch (8481:13): [True: 2, False: 492]
8482
            goto exit;
8483
        translate[ch] = (Py_UCS1)replace;
8484
    }
8485
    else {
8486
        /* not None, NULL, long or unicode */
8487
        goto exit;
8488
    }
8489
    ret = 1;
8490
8491
  exit:
8492
    Py_DECREF(item);
8493
    return ret;
8494
}
8495
8496
/* Fast path for ascii => ascii translation. Return 1 if the whole string
8497
   was translated into writer, return 0 if the input string was partially
8498
   translated into writer, raise an exception and return -1 on error. */
8499
static int
8500
unicode_fast_translate(PyObject *input, PyObject *mapping,
8501
                       _PyUnicodeWriter *writer, int ignore,
8502
                       Py_ssize_t *input_pos)
8503
{
8504
    Py_UCS1 ascii_table[128], ch, ch2;
8505
    Py_ssize_t len;
8506
    const Py_UCS1 *in, *end;
8507
    Py_UCS1 *out;
8508
    int res = 0;
8509
8510
    len = PyUnicode_GET_LENGTH(input);
8511
8512
    memset(ascii_table, 0xff, 128);
8513
8514
    in = PyUnicode_1BYTE_DATA(input);
8515
    end = in + len;
8516
8517
    assert(PyUnicode_IS_ASCII(writer->buffer));
8518
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8519
    out = PyUnicode_1BYTE_DATA(writer->buffer);
8520
8521
    for (; in < end; 
in++94.1k
) {
  Branch (8521:12): [True: 105k, False: 11.8k]
8522
        ch = *in;
8523
        ch2 = ascii_table[ch];
8524
        if (ch2 == 0xff) {
  Branch (8524:13): [True: 72.5k, False: 33.4k]
8525
            int translate = unicode_fast_translate_lookup(mapping, ch,
8526
                                                          ascii_table);
8527
            if (translate < 0)
  Branch (8527:17): [True: 1, False: 72.5k]
8528
                return -1;
8529
            if (translate == 0)
  Branch (8529:17): [True: 11.8k, False: 60.6k]
8530
                goto exit;
8531
            ch2 = ascii_table[ch];
8532
        }
8533
        if (ch2 == 0xfe) {
  Branch (8533:13): [True: 25, False: 94.1k]
8534
            if (ignore)
  Branch (8534:17): [True: 25, False: 0]
8535
                continue;
8536
            goto exit;
8537
        }
8538
        assert(ch2 < 128);
8539
        *out = ch2;
8540
        out++;
8541
    }
8542
    res = 1;
8543
8544
exit:
8545
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8546
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
8547
    return res;
8548
}
8549
8550
/* Translate the str object `input` through `mapping`.

   Characters that map to None are untranslatable: they are skipped when
   errors is "ignore", otherwise the run of such characters is passed to
   the error handler named by `errors` and its replacement is written.
   Returns a new str object, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    /* input */
    const void *chars;
    Py_ssize_t length, pos;
    int kind;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* error handling state */
    PyObject *handler = NULL;
    PyObject *exc = NULL;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    chars = PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    length = PyUnicode_GET_LENGTH(input);

    if (length == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, length, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    pos = 0;
    if (PyUnicode_IS_ASCII(input)) {
        /* The fast path reports in pos how far it got. */
        int fast = unicode_fast_translate(input, mapping, &writer, ignore, &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }

    while (pos < length) {
        Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        int written = charmaptranslate_output(ch, mapping, &writer);
        if (written < 0) {
            goto onError;
        }
        if (written != 0) {
            /* it worked => adjust input pointer */
            ++pos;
            continue;
        }

        /* untranslatable character: collect the whole run
           [run_start, run_end) of characters that map to None */
        Py_ssize_t run_start = pos;
        Py_ssize_t run_end = pos + 1;
        while (run_end < length) {
            PyObject *item;
            ch = PyUnicode_READ(kind, chars, run_end);
            if (charmaptranslate_lookup(ch, mapping, &item)) {
                goto onError;
            }
            int is_none = (item == Py_None);
            Py_XDECREF(item);
            if (!is_none) {
                break;
            }
            ++run_end;
        }

        if (ignore) {
            pos = run_end;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *rep = unicode_translate_call_errorhandler(errors, &handler,
                                                            reason, input, &exc,
                                                            run_start, run_end, &newpos);
        if (rep == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, rep);
        Py_DECREF(rep);
        if (rc < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(handler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(handler);
    return NULL;
}
8665
8666
/* Translate `str` through `mapping` with the given error handling.
   Returns NULL if ensure_unicode() rejects `str`; otherwise returns the
   result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
8675
8676
/* Return an ASCII version of `unicode` in which whitespace characters
   become ' ' and decimal digit characters become '0'..'9'.

   An ASCII string is returned as-is (new reference).  Otherwise, at the
   first character that is neither below 127, whitespace, nor a decimal
   digit, a '?' is stored and the result is truncated right after it.
   Returns NULL with an exception set on error. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        Py_INCREF(unicode);
        return unicode;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *ascii = PyUnicode_New(length, 127);
    if (ascii == NULL) {
        return NULL;
    }

    Py_UCS1 *dst = PyUnicode_1BYTE_DATA(ascii);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < length; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, src, i);

        if (ch < 127) {
            dst[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            dst[i] = ' ';
            continue;
        }

        int digit = Py_UNICODE_TODECIMAL(ch);
        if (digit < 0) {
            /* not convertible: mark the spot and cut the result here */
            dst[i] = '?';
            dst[i+1] = '\0';
            _PyUnicode_LENGTH(ascii) = i + 1;
            break;
        }
        dst[i] = '0' + digit;
    }

    assert(_PyUnicode_CheckConsistency(ascii, 1));
    return ascii;
}
8722
8723
/* --- Helpers ------------------------------------------------------------ */
8724
8725
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and clamped to 0 if it is still negative.  `start`
   and `end` must be modifiable lvalues; each argument is evaluated more
   than once, so they must not have side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe under an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
8739
8740
static Py_ssize_t
8741
any_find_slice(PyObject* s1, PyObject* s2,
8742
               Py_ssize_t start,
8743
               Py_ssize_t end,
8744
               int direction)
8745
{
8746
    int kind1, kind2;
8747
    const void *buf1, *buf2;
8748
    Py_ssize_t len1, len2, result;
8749
8750
    kind1 = PyUnicode_KIND(s1);
8751
    kind2 = PyUnicode_KIND(s2);
8752
    if (kind1 < kind2)
  Branch (8752:9): [True: 24, False: 1.81M]
8753
        return -1;
8754
8755
    len1 = PyUnicode_GET_LENGTH(s1);
8756
    len2 = PyUnicode_GET_LENGTH(s2);
8757
    ADJUST_INDICES(start, end, len1);
8758
    if (end - start < len2)
  Branch (8758:9): [True: 171k, False: 1.64M]
8759
        return -1;
8760
8761
    buf1 = PyUnicode_DATA(s1);
8762
    buf2 = PyUnicode_DATA(s2);
8763
    if (len2 == 1) {
  Branch (8763:9): [True: 1.08M, False: 555k]
8764
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8765
        result = findchar((const char *)buf1 + kind1*start,
8766
                          kind1, end - start, ch, direction);
8767
        if (result == -1)
  Branch (8767:13): [True: 433k, False: 651k]
8768
            return -1;
8769
        else
8770
            return start + result;
8771
    }
8772
8773
    if (kind2 != kind1) {
  Branch (8773:9): [True: 14, False: 555k]
8774
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
8775
        if (!buf2)
  Branch (8775:13): [True: 0, False: 14]
8776
            return -2;
8777
    }
8778
8779
    if (direction > 0) {
  Branch (8779:9): [True: 358k, False: 196k]
8780
        switch (kind1) {
8781
        case PyUnicode_1BYTE_KIND:
  Branch (8781:9): [True: 358k, False: 8]
8782
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8783
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8784
            else
8785
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8786
            break;
8787
        case PyUnicode_2BYTE_KIND:
  Branch (8787:9): [True: 4, False: 358k]
8788
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8789
            break;
8790
        case PyUnicode_4BYTE_KIND:
  Branch (8790:9): [True: 4, False: 358k]
8791
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8792
            break;
8793
        default:
  Branch (8793:9): [True: 0, False: 358k]
8794
            Py_UNREACHABLE();
8795
        }
8796
    }
8797
    else {
8798
        switch (kind1) {
8799
        case PyUnicode_1BYTE_KIND:
  Branch (8799:9): [True: 196k, False: 6]
8800
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8801
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8802
            else
8803
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8804
            break;
8805
        case PyUnicode_2BYTE_KIND:
  Branch (8805:9): [True: 2, False: 196k]
8806
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8807
            break;
8808
        case PyUnicode_4BYTE_KIND:
  Branch (8808:9): [True: 4, False: 196k]
8809
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810
            break;
8811
        default:
  Branch (8811:9): [True: 0, False: 196k]
8812
            Py_UNREACHABLE();
8813
        }
8814
    }
8815
8816
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
8817
    if (kind2 != kind1)
  Branch (8817:9): [True: 14, False: 555k]
8818
        PyMem_Free((void *)buf2);
8819
8820
    return result;
8821
}
8822
8823
/* _PyUnicode_InsertThousandsGrouping() helper functions */
8824
#include "stringlib/localeutil.h"
8825
8826
/**
8827
 * InsertThousandsGrouping:
8828
 * @writer: Unicode writer.
8829
 * @n_buffer: Number of characters in @buffer.
8830
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
8831
 * @d_pos: Start of digits string.
8832
 * @n_digits: The number of digits in the string, in which we want
8833
 *            to put the grouping chars.
8834
 * @min_width: The minimum width of the digits in the output string.
8835
 *             Output will be zero-padded on the left to fill.
8836
 * @grouping: see definition in localeconv().
8837
 * @thousands_sep: see definition in localeconv().
8838
 *
8839
 * There are 2 modes: counting and filling. If @writer is NULL,
8840
 *  we are in counting mode, else filling mode.
8841
 * If counting, the required buffer size is returned.
8842
 * If filling, we know the buffer will be large enough, so we don't
8843
 *  need to pass in the buffer size.
8844
 * Inserts thousand grouping characters (as defined by grouping and
8845
 *  thousands_sep) into @writer.
8846
 *
8847
 * Return value: -1 on error, number of characters otherwise.
8848
 **/
8849
Py_ssize_t
8850
_PyUnicode_InsertThousandsGrouping(
8851
    _PyUnicodeWriter *writer,
8852
    Py_ssize_t n_buffer,
8853
    PyObject *digits,
8854
    Py_ssize_t d_pos,
8855
    Py_ssize_t n_digits,
8856
    Py_ssize_t min_width,
8857
    const char *grouping,
8858
    PyObject *thousands_sep,
8859
    Py_UCS4 *maxchar)
8860
{
8861
    min_width = Py_MAX(0, min_width);
8862
    if (writer) {
  Branch (8862:9): [True: 157k, False: 157k]
8863
        assert(digits != NULL);
8864
        assert(maxchar == NULL);
8865
    }
8866
    else {
8867
        assert(digits == NULL);
8868
        assert(maxchar != NULL);
8869
    }
8870
    assert(0 <= d_pos);
8871
    assert(0 <= n_digits);
8872
    assert(grouping != NULL);
8873
8874
    Py_ssize_t count = 0;
8875
    Py_ssize_t n_zeros;
8876
    int loop_broken = 0;
8877
    int use_separator = 0; /* First time through, don't append the
8878
                              separator. They only go between
8879
                              groups. */
8880
    Py_ssize_t buffer_pos;
8881
    Py_ssize_t digits_pos;
8882
    Py_ssize_t len;
8883
    Py_ssize_t n_chars;
8884
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
8885
                                        be looked at */
8886
    /* A generator that returns all of the grouping widths, until it
8887
       returns 0. */
8888
    GroupGenerator groupgen;
8889
    GroupGenerator_init(&groupgen, grouping);
8890
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8891
8892
    /* if digits are not grouped, thousands separator
8893
       should be an empty string */
8894
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
8895
8896
    digits_pos = d_pos + n_digits;
8897
    if (writer) {
  Branch (8897:9): [True: 157k, False: 157k]
8898
        buffer_pos = writer->pos + n_buffer;
8899
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
8900
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
8901
    }
8902
    else {
8903
        buffer_pos = n_buffer;
8904
    }
8905
8906
    if (!writer) {
  Branch (8906:9): [True: 157k, False: 157k]
8907
        *maxchar = 127;
8908
    }
8909
8910
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
  Branch (8910:12): [True: 1.33k, False: 314k]
8911
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
8912
        n_zeros = Py_MAX(0, len - remaining);
8913
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
8914
8915
        /* Use n_zero zero's and n_chars chars */
8916
8917
        /* Count only, don't do anything. */
8918
        count += (use_separator ? 
thousands_sep_len832
:
0500
) + n_zeros + n_chars;
  Branch (8918:19): [True: 832, False: 500]
8919
8920
        /* Copy into the writer. */
8921
        InsertThousandsGrouping_fill(writer, &buffer_pos,
8922
                                     digits, &digits_pos,
8923
                                     n_chars, n_zeros,
8924
                                     use_separator ? 
thousands_sep832
: NULL,
  Branch (8924:38): [True: 832, False: 500]
8925
                                     thousands_sep_len, maxchar);
8926
8927
        /* Use a separator next time. */
8928
        use_separator = 1;
8929
8930
        remaining -= n_chars;
8931
        min_width -= len;
8932
8933
        if (remaining <= 0 && 
min_width <= 0706
) {
  Branch (8933:13): [True: 706, False: 626]
  Branch (8933:31): [True: 500, False: 206]
8934
            loop_broken = 1;
8935
            break;
8936
        }
8937
        min_width -= thousands_sep_len;
8938
    }
8939
    if (!loop_broken) {
  Branch (8939:9): [True: 314k, False: 500]
8940
        /* We left the loop without using a break statement. */
8941
8942
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
8943
        n_zeros = Py_MAX(0, len - remaining);
8944
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
8945
8946
        /* Use n_zero zero's and n_chars chars */
8947
        count += (use_separator ? 
thousands_sep_len0
: 0) + n_zeros + n_chars;
  Branch (8947:19): [True: 0, False: 314k]
8948
8949
        /* Copy into the writer. */
8950
        InsertThousandsGrouping_fill(writer, &buffer_pos,
8951
                                     digits, &digits_pos,
8952
                                     n_chars, n_zeros,
8953
                                     use_separator ? 
thousands_sep0
: NULL,
  Branch (8953:38): [True: 0, False: 314k]
8954
                                     thousands_sep_len, maxchar);
8955
    }
8956
    return count;
8957
}
8958
8959
8960
Py_ssize_t
8961
PyUnicode_Count(PyObject *str,
8962
                PyObject *substr,
8963
                Py_ssize_t start,
8964
                Py_ssize_t end)
8965
{
8966
    Py_ssize_t result;
8967
    int kind1, kind2;
8968
    const void *buf1 = NULL, *buf2 = NULL;
8969
    Py_ssize_t len1, len2;
8970
8971
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  Branch (8971:9): [True: 0, False: 0]
  Branch (8971:36): [True: 0, False: 0]
8972
        return -1;
8973
8974
    kind1 = PyUnicode_KIND(str);
8975
    kind2 = PyUnicode_KIND(substr);
8976
    if (kind1 < kind2)
  Branch (8976:9): [True: 0, False: 0]
8977
        return 0;
8978
8979
    len1 = PyUnicode_GET_LENGTH(str);
8980
    len2 = PyUnicode_GET_LENGTH(substr);
8981
    ADJUST_INDICES(start, end, len1);
8982
    if (end - start < len2)
  Branch (8982:9): [True: 0, False: 0]
8983
        return 0;
8984
8985
    buf1 = PyUnicode_DATA(str);
8986
    buf2 = PyUnicode_DATA(substr);
8987
    if (kind2 != kind1) {
  Branch (8987:9): [True: 0, False: 0]
8988
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
8989
        if (!buf2)
  Branch (8989:13): [True: 0, False: 0]
8990
            goto onError;
8991
    }
8992
8993
    switch (kind1) {
8994
    case PyUnicode_1BYTE_KIND:
  Branch (8994:5): [True: 0, False: 0]
8995
        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
8996
            result = asciilib_count(
8997
                ((const Py_UCS1*)buf1) + start, end - start,
8998
                buf2, len2, PY_SSIZE_T_MAX
8999
                );
9000
        else
9001
            result = ucs1lib_count(
9002
                ((const Py_UCS1*)buf1) + start, end - start,
9003
                buf2, len2, PY_SSIZE_T_MAX
9004
                );
9005
        break;
9006
    case PyUnicode_2BYTE_KIND:
  Branch (9006:5): [True: 0, False: 0]
9007
        result = ucs2lib_count(
9008
            ((const Py_UCS2*)buf1) + start, end - start,
9009
            buf2, len2, PY_SSIZE_T_MAX
9010
            );
9011
        break;
9012
    case PyUnicode_4BYTE_KIND:
  Branch (9012:5): [True: 0, False: 0]
9013
        result = ucs4lib_count(
9014
            ((const Py_UCS4*)buf1) + start, end - start,
9015
            buf2, len2, PY_SSIZE_T_MAX
9016
            );
9017
        break;
9018
    default:
  Branch (9018:5): [True: 0, False: 0]
9019
        Py_UNREACHABLE();
9020
    }
9021
9022
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9023
    if (kind2 != kind1)
  Branch (9023:9): [True: 0, False: 0]
9024
        PyMem_Free((void *)buf2);
9025
9026
    return result;
9027
  onError:
9028
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9029
    if (kind2 != kind1)
  Branch (9029:9): [True: 0, False: 0]
9030
        PyMem_Free((void *)buf2);
9031
    return -1;
9032
}
9033
9034
Py_ssize_t
9035
PyUnicode_Find(PyObject *str,
9036
               PyObject *substr,
9037
               Py_ssize_t start,
9038
               Py_ssize_t end,
9039
               int direction)
9040
{
9041
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  Branch (9041:9): [True: 0, False: 119]
  Branch (9041:36): [True: 0, False: 119]
9042
        return -2;
9043
9044
    return any_find_slice(str, substr, start, end, direction);
9045
}
9046
9047
Py_ssize_t
9048
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9049
                   Py_ssize_t start, Py_ssize_t end,
9050
                   int direction)
9051
{
9052
    int kind;
9053
    Py_ssize_t len, result;
9054
    len = PyUnicode_GET_LENGTH(str);
9055
    ADJUST_INDICES(start, end, len);
9056
    if (end - start < 1)
  Branch (9056:9): [True: 12, False: 1.75M]
9057
        return -1;
9058
    kind = PyUnicode_KIND(str);
9059
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9060
                      kind, end-start, ch, direction);
9061
    if (result == -1)
  Branch (9061:9): [True: 951k, False: 802k]
9062
        return -1;
9063
    else
9064
        return start + result;
9065
}
9066
9067
static int
9068
tailmatch(PyObject *self,
9069
          PyObject *substring,
9070
          Py_ssize_t start,
9071
          Py_ssize_t end,
9072
          int direction)
9073
{
9074
    int kind_self;
9075
    int kind_sub;
9076
    const void *data_self;
9077
    const void *data_sub;
9078
    Py_ssize_t offset;
9079
    Py_ssize_t i;
9080
    Py_ssize_t end_sub;
9081
9082
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9083
    end -= PyUnicode_GET_LENGTH(substring);
9084
    if (end < start)
  Branch (9084:9): [True: 428k, False: 6.90M]
9085
        return 0;
9086
9087
    if (PyUnicode_GET_LENGTH(substring) == 0)
  Branch (9087:9): [True: 3.30k, False: 6.90M]
9088
        return 1;
9089
9090
    kind_self = PyUnicode_KIND(self);
9091
    data_self = PyUnicode_DATA(self);
9092
    kind_sub = PyUnicode_KIND(substring);
9093
    data_sub = PyUnicode_DATA(substring);
9094
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9095
9096
    if (direction > 0)
  Branch (9096:9): [True: 922k, False: 5.98M]
9097
        offset = end;
9098
    else
9099
        offset = start;
9100
9101
    if (PyUnicode_READ(kind_self, data_self, offset) ==
  Branch (9101:9): [True: 1.10M, False: 5.79M]
9102
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9103
        
PyUnicode_READ1.10M
(kind_self, data_self, offset + end_sub) ==
  Branch (9103:9): [True: 544k, False: 562k]
9104
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9105
        /* If both are of the same kind, memcmp is sufficient */
9106
        if (kind_self == kind_sub) {
  Branch (9106:13): [True: 543k, False: 209]
9107
            return ! memcmp((char *)data_self +
9108
                                (offset * PyUnicode_KIND(substring)),
9109
                            data_sub,
9110
                            PyUnicode_GET_LENGTH(substring) *
9111
                                PyUnicode_KIND(substring));
9112
        }
9113
        /* otherwise we have to compare each character by first accessing it */
9114
        else {
9115
            /* We do not need to compare 0 and len(substring)-1 because
9116
               the if statement above ensured already that they are equal
9117
               when we end up here. */
9118
            for (i = 1; i < end_sub; 
++i24
) {
  Branch (9118:25): [True: 24, False: 209]
9119
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
  Branch (9119:21): [True: 0, False: 24]
9120
                    PyUnicode_READ(kind_sub, data_sub, i))
9121
                    return 0;
9122
            }
9123
            return 1;
9124
        }
9125
    }
9126
9127
    return 0;
9128
}
9129
9130
Py_ssize_t
9131
PyUnicode_Tailmatch(PyObject *str,
9132
                    PyObject *substr,
9133
                    Py_ssize_t start,
9134
                    Py_ssize_t end,
9135
                    int direction)
9136
{
9137
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  Branch (9137:9): [True: 0, False: 1]
  Branch (9137:36): [True: 0, False: 1]
9138
        return -1;
9139
9140
    return tailmatch(str, substr, start, end, direction);
9141
}
9142
9143
static PyObject *
9144
ascii_upper_or_lower(PyObject *self, int lower)
9145
{
9146
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9147
    const char *data = PyUnicode_DATA(self);
9148
    char *resdata;
9149
    PyObject *res;
9150
9151
    res = PyUnicode_New(len, 127);
9152
    if (res == NULL)
  Branch (9152:9): [True: 0, False: 418k]
9153
        return NULL;
9154
    resdata = PyUnicode_DATA(res);
9155
    if (lower)
  Branch (9155:9): [True: 410k, False: 7.90k]
9156
        _Py_bytes_lower(resdata, data, len);
9157
    else
9158
        _Py_bytes_upper(resdata, data, len);
9159
    return res;
9160
}
9161
9162
static Py_UCS4
9163
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9164
{
9165
    Py_ssize_t j;
9166
    int final_sigma;
9167
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9168
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9169
9170
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9171
9172
    where ! is a negation and \p{xxx} is a character with property xxx.
9173
    */
9174
    for (j = i - 1; j >= 0; 
j--9
) {
  Branch (9174:21): [True: 20, False: 11]
9175
        c = PyUnicode_READ(kind, data, j);
9176
        if (!_PyUnicode_IsCaseIgnorable(c))
  Branch (9176:13): [True: 11, False: 9]
9177
            break;
9178
    }
9179
    final_sigma = j >= 0 && 
_PyUnicode_IsCased(c)11
;
  Branch (9179:19): [True: 11, False: 11]
  Branch (9179:29): [True: 11, False: 0]
9180
    if (final_sigma) {
  Branch (9180:9): [True: 11, False: 11]
9181
        for (j = i + 1; j < length; 
j++2
) {
  Branch (9181:25): [True: 6, False: 7]
9182
            c = PyUnicode_READ(kind, data, j);
9183
            if (!_PyUnicode_IsCaseIgnorable(c))
  Branch (9183:17): [True: 4, False: 2]
9184
                break;
9185
        }
9186
        final_sigma = j == length || 
!_PyUnicode_IsCased(c)4
;
  Branch (9186:23): [True: 7, False: 4]
  Branch (9186:38): [True: 1, False: 3]
9187
    }
9188
    return (final_sigma) ? 
0x3C28
:
0x3C314
;
  Branch (9188:12): [True: 8, False: 14]
9189
}
9190
9191
static int
9192
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9193
           Py_UCS4 c, Py_UCS4 *mapped)
9194
{
9195
    /* Obscure special case. */
9196
    if (c == 0x3A3) {
  Branch (9196:9): [True: 22, False: 1.15M]
9197
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9198
        return 1;
9199
    }
9200
    return _PyUnicode_ToLowerFull(c, mapped);
9201
}
9202
9203
static Py_ssize_t
9204
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9205
{
9206
    Py_ssize_t i, k = 0;
9207
    int n_res, j;
9208
    Py_UCS4 c, mapped[3];
9209
9210
    c = PyUnicode_READ(kind, data, 0);
9211
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9212
    for (j = 0; j < n_res; 
j++1.08k
) {
  Branch (9212:17): [True: 1.08k, False: 1.07k]
9213
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9214
        res[k++] = mapped[j];
9215
    }
9216
    for (i = 1; i < length; 
i++12.6k
) {
  Branch (9216:17): [True: 12.6k, False: 1.07k]
9217
        c = PyUnicode_READ(kind, data, i);
9218
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9219
        for (j = 0; j < n_res; 
j++12.6k
) {
  Branch (9219:21): [True: 12.6k, False: 12.6k]
9220
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9221
            res[k++] = mapped[j];
9222
        }
9223
    }
9224
    return k;
9225
}
9226
9227
static Py_ssize_t
9228
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9229
    Py_ssize_t i, k = 0;
9230
9231
    for (i = 0; i < length; 
i++7.22k
) {
  Branch (9231:17): [True: 7.22k, False: 29]
9232
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9233
        int n_res, j;
9234
        if (Py_UNICODE_ISUPPER(c)) {
9235
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9236
        }
9237
        else if (Py_UNICODE_ISLOWER(c)) {
9238
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9239
        }
9240
        else {
9241
            n_res = 1;
9242
            mapped[0] = c;
9243
        }
9244
        for (j = 0; j < n_res; 
j++7.22k
) {
  Branch (9244:21): [True: 7.22k, False: 7.22k]
9245
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9246
            res[k++] = mapped[j];
9247
        }
9248
    }
9249
    return k;
9250
}
9251
9252
static Py_ssize_t
9253
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9254
                  Py_UCS4 *maxchar, int lower)
9255
{
9256
    Py_ssize_t i, k = 0;
9257
9258
    for (i = 0; i < length; 
i++2.24M
) {
  Branch (9258:17): [True: 2.24M, False: 2.24M]
9259
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9260
        int n_res, j;
9261
        if (lower)
  Branch (9261:13): [True: 1.12M, False: 1.12M]
9262
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9263
        else
9264
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9265
        for (j = 0; j < n_res; 
j++2.25M
) {
  Branch (9265:21): [True: 2.25M, False: 2.24M]
9266
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9267
            res[k++] = mapped[j];
9268
        }
9269
    }
9270
    return k;
9271
}
9272
9273
static Py_ssize_t
9274
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9275
{
9276
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9277
}
9278
9279
static Py_ssize_t
9280
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9281
{
9282
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9283
}
9284
9285
static Py_ssize_t
9286
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9287
{
9288
    Py_ssize_t i, k = 0;
9289
9290
    for (i = 0; i < length; 
i++9
) {
  Branch (9290:17): [True: 9, False: 6]
9291
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9292
        Py_UCS4 mapped[3];
9293
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9294
        for (j = 0; j < n_res; 
j++11
) {
  Branch (9294:21): [True: 11, False: 9]
9295
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9296
            res[k++] = mapped[j];
9297
        }
9298
    }
9299
    return k;
9300
}
9301
9302
static Py_ssize_t
9303
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9304
{
9305
    Py_ssize_t i, k = 0;
9306
    int previous_is_cased;
9307
9308
    previous_is_cased = 0;
9309
    for (i = 0; i < length; 
i++1.13M
) {
  Branch (9309:17): [True: 1.13M, False: 1.11M]
9310
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9311
        Py_UCS4 mapped[3];
9312
        int n_res, j;
9313
9314
        if (previous_is_cased)
  Branch (9314:13): [True: 14.1k, False: 1.11M]
9315
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9316
        else
9317
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9318
9319
        for (j = 0; j < n_res; 
j++1.13M
) {
  Branch (9319:21): [True: 1.13M, False: 1.13M]
9320
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9321
            res[k++] = mapped[j];
9322
        }
9323
9324
        previous_is_cased = _PyUnicode_IsCased(c);
9325
    }
9326
    return k;
9327
}
9328
9329
static PyObject *
9330
case_operation(PyObject *self,
9331
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9332
{
9333
    PyObject *res = NULL;
9334
    Py_ssize_t length, newlength = 0;
9335
    int kind, outkind;
9336
    const void *data;
9337
    void *outdata;
9338
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9339
9340
    kind = PyUnicode_KIND(self);
9341
    data = PyUnicode_DATA(self);
9342
    length = PyUnicode_GET_LENGTH(self);
9343
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
  Branch (9343:9): [True: 0, False: 3.36M]
9344
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9345
        return NULL;
9346
    }
9347
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9348
    if (tmp == NULL)
  Branch (9348:9): [True: 0, False: 3.36M]
9349
        return PyErr_NoMemory();
9350
    newlength = perform(kind, data, length, tmp, &maxchar);
9351
    res = PyUnicode_New(newlength, maxchar);
9352
    if (res == NULL)
  Branch (9352:9): [True: 0, False: 3.36M]
9353
        goto leave;
9354
    tmpend = tmp + newlength;
9355
    outdata = PyUnicode_DATA(res);
9356
    outkind = PyUnicode_KIND(res);
9357
    switch (outkind) {
9358
    case PyUnicode_1BYTE_KIND:
  Branch (9358:5): [True: 4.00k, False: 3.36M]
9359
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9360
        break;
9361
    case PyUnicode_2BYTE_KIND:
  Branch (9361:5): [True: 214k, False: 3.14M]
9362
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9363
        break;
9364
    case PyUnicode_4BYTE_KIND:
  Branch (9364:5): [True: 3.14M, False: 218k]
9365
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9366
        break;
9367
    default:
  Branch (9367:5): [True: 0, False: 3.36M]
9368
        Py_UNREACHABLE();
9369
    }
9370
  leave:
9371
    PyMem_Free(tmp);
9372
    return res;
9373
}
9374
9375
PyObject *
9376
PyUnicode_Join(PyObject *separator, PyObject *seq)
9377
{
9378
    PyObject *res;
9379
    PyObject *fseq;
9380
    Py_ssize_t seqlen;
9381
    PyObject **items;
9382
9383
    fseq = PySequence_Fast(seq, "can only join an iterable");
9384
    if (fseq == NULL) {
  Branch (9384:9): [True: 6, False: 1.14M]
9385
        return NULL;
9386
    }
9387
9388
    /* NOTE: the following code can't call back into Python code,
9389
     * so we are sure that fseq won't be mutated.
9390
     */
9391
9392
    items = PySequence_Fast_ITEMS(fseq);
9393
    seqlen = PySequence_Fast_GET_SIZE(fseq);
9394
    res = _PyUnicode_JoinArray(separator, items, seqlen);
9395
    Py_DECREF(fseq);
9396
    return res;
9397
}
9398
9399
PyObject *
9400
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9401
{
9402
    PyObject *res = NULL; /* the result */
9403
    PyObject *sep = NULL;
9404
    Py_ssize_t seplen;
9405
    PyObject *item;
9406
    Py_ssize_t sz, i, res_offset;
9407
    Py_UCS4 maxchar;
9408
    Py_UCS4 item_maxchar;
9409
    int use_memcpy;
9410
    unsigned char *res_data = NULL, *sep_data = NULL;
9411
    PyObject *last_obj;
9412
    int kind = 0;
9413
9414
    /* If empty sequence, return u"". */
9415
    if (seqlen == 0) {
  Branch (9415:9): [True: 53.7k, False: 1.46M]
9416
        _Py_RETURN_UNICODE_EMPTY();
9417
    }
9418
9419
    /* If singleton sequence with an exact Unicode, return that. */
9420
    last_obj = NULL;
9421
    if (seqlen == 1) {
  Branch (9421:9): [True: 354k, False: 1.11M]
9422
        if (PyUnicode_CheckExact(items[0])) {
9423
            res = items[0];
9424
            Py_INCREF(res);
9425
            return res;
9426
        }
9427
        seplen = 0;
9428
        maxchar = 0;
9429
    }
9430
    else {
9431
        /* Set up sep and seplen */
9432
        if (separator == NULL) {
  Branch (9432:13): [True: 0, False: 1.11M]
9433
            /* fall back to a blank space separator */
9434
            sep = PyUnicode_FromOrdinal(' ');
9435
            if (!sep)
  Branch (9435:17): [True: 0, False: 0]
9436
                goto onError;
9437
            seplen = 1;
9438
            maxchar = 32;
9439
        }
9440
        else {
9441
            if (!PyUnicode_Check(separator)) {
  Branch (9441:17): [True: 0, False: 1.11M]
9442
                PyErr_Format(PyExc_TypeError,
9443
                             "separator: expected str instance,"
9444
                             " %.80s found",
9445
                             Py_TYPE(separator)->tp_name);
9446
                goto onError;
9447
            }
9448
            sep = separator;
9449
            seplen = PyUnicode_GET_LENGTH(separator);
9450
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9451
            /* inc refcount to keep this code path symmetric with the
9452
               above case of a blank separator */
9453
            Py_INCREF(sep);
9454
        }
9455
        last_obj = sep;
9456
    }
9457
9458
    /* There are at least two things to join, or else we have a subclass
9459
     * of str in the sequence.
9460
     * Do a pre-pass to figure out the total amount of space we'll
9461
     * need (sz), and see whether all argument are strings.
9462
     */
9463
    sz = 0;
9464
#ifdef Py_DEBUG
9465
    use_memcpy = 0;
9466
#else
9467
    use_memcpy = 1;
9468
#endif
9469
    for (i = 0; i < seqlen; 
i++19.6M
) {
  Branch (9469:17): [True: 19.6M, False: 1.12M]
9470
        size_t add_sz;
9471
        item = items[i];
9472
        if (!PyUnicode_Check(item)) {
  Branch (9472:13): [True: 18, False: 19.6M]
9473
            PyErr_Format(PyExc_TypeError,
9474
                         "sequence item %zd: expected str instance,"
9475
                         " %.80s found",
9476
                         i, Py_TYPE(item)->tp_name);
9477
            goto onError;
9478
        }
9479
        add_sz = PyUnicode_GET_LENGTH(item);
9480
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9481
        maxchar = Py_MAX(maxchar, item_maxchar);
9482
        if (i != 0) {
  Branch (9482:13): [True: 18.5M, False: 1.12M]
9483
            add_sz += seplen;
9484
        }
9485
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
  Branch (9485:13): [True: 0, False: 19.6M]
9486
            PyErr_SetString(PyExc_OverflowError,
9487
                            "join() result is too long for a Python string");
9488
            goto onError;
9489
        }
9490
        sz += add_sz;
9491
        if (use_memcpy && 
last_obj != NULL18.4M
) {
  Branch (9491:13): [True: 18.4M, False: 1.24M]
  Branch (9491:27): [True: 18.4M, False: 17.7k]
9492
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
  Branch (9492:17): [True: 6.11k, False: 18.4M]
9493
                use_memcpy = 0;
9494
        }
9495
        last_obj = item;
9496
    }
9497
9498
    res = PyUnicode_New(sz, maxchar);
9499
    if (res == NULL)
  Branch (9499:9): [True: 0, False: 1.12M]
9500
        goto onError;
9501
9502
    /* Catenate everything. */
9503
#ifdef Py_DEBUG
9504
    use_memcpy = 0;
9505
#else
9506
    if (use_memcpy) {
  Branch (9506:9): [True: 1.12M, False: 6.11k]
9507
        res_data = PyUnicode_1BYTE_DATA(res);
9508
        kind = PyUnicode_KIND(res);
9509
        if (seplen != 0)
  Branch (9509:13): [True: 418k, False: 704k]
9510
            sep_data = PyUnicode_1BYTE_DATA(sep);
9511
    }
9512
#endif
9513
    if (use_memcpy) {
  Branch (9513:9): [True: 1.12M, False: 6.11k]
9514
        for (i = 0; i < seqlen; 
++i18.3M
) {
  Branch (9514:21): [True: 18.3M, False: 1.12M]
9515
            Py_ssize_t itemlen;
9516
            item = items[i];
9517
9518
            /* Copy item, and maybe the separator. */
9519
            if (i && 
seplen != 017.2M
) {
  Branch (9519:17): [True: 17.2M, False: 1.12M]
  Branch (9519:22): [True: 3.47M, False: 13.7M]
9520
                memcpy(res_data,
9521
                          sep_data,
9522
                          kind * seplen);
9523
                res_data += kind * seplen;
9524
            }
9525
9526
            itemlen = PyUnicode_GET_LENGTH(item);
9527
            if (itemlen != 0) {
  Branch (9527:17): [True: 18.2M, False: 94.5k]
9528
                memcpy(res_data,
9529
                          PyUnicode_DATA(item),
9530
                          kind * itemlen);
9531
                res_data += kind * itemlen;
9532
            }
9533
        }
9534
        assert(res_data == PyUnicode_1BYTE_DATA(res)
9535
                           + kind * PyUnicode_GET_LENGTH(res));
9536
    }
9537
    else {
9538
        for (i = 0, res_offset = 0; i < seqlen; 
++i1.28M
) {
  Branch (9538:37): [True: 1.28M, False: 6.11k]
9539
            Py_ssize_t itemlen;
9540
            item = items[i];
9541
9542
            /* Copy item, and maybe the separator. */
9543
            if (i && 
seplen != 01.27M
) {
  Branch (9543:17): [True: 1.27M, False: 6.11k]
  Branch (9543:22): [True: 819, False: 1.27M]
9544
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9545
                res_offset += seplen;
9546
            }
9547
9548
            itemlen = PyUnicode_GET_LENGTH(item);
9549
            if (itemlen != 0) {
  Branch (9549:17): [True: 1.28M, False: 261]
9550
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9551
                res_offset += itemlen;
9552
            }
9553
        }
9554
        assert(res_offset == PyUnicode_GET_LENGTH(res));
9555
    }
9556
9557
    Py_XDECREF(sep);
9558
    assert(_PyUnicode_CheckConsistency(res, 1));
9559
    return res;
9560
9561
  onError:
9562
    Py_XDECREF(sep);
9563
    Py_XDECREF(res);
9564
    return NULL;
9565
}
9566
9567
void
9568
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9569
                    Py_UCS4 fill_char)
9570
{
9571
    const int kind = PyUnicode_KIND(unicode);
9572
    void *data = PyUnicode_DATA(unicode);
9573
    assert(unicode_modifiable(unicode));
9574
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9575
    assert(start >= 0);
9576
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9577
    unicode_fill(kind, data, fill_char, start, length);
9578
}
9579
9580
Py_ssize_t
9581
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9582
               Py_UCS4 fill_char)
9583
{
9584
    Py_ssize_t maxlen;
9585
9586
    if (!PyUnicode_Check(unicode)) {
  Branch (9586:9): [True: 0, False: 7.40k]
9587
        PyErr_BadInternalCall();
9588
        return -1;
9589
    }
9590
    if (unicode_check_modifiable(unicode))
  Branch (9590:9): [True: 0, False: 7.40k]
9591
        return -1;
9592
9593
    if (start < 0) {
  Branch (9593:9): [True: 0, False: 7.40k]
9594
        PyErr_SetString(PyExc_IndexError, "string index out of range");
9595
        return -1;
9596
    }
9597
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
  Branch (9597:9): [True: 0, False: 7.40k]
9598
        PyErr_SetString(PyExc_ValueError,
9599
                         "fill character is bigger than "
9600
                         "the string maximum character");
9601
        return -1;
9602
    }
9603
9604
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9605
    length = Py_MIN(maxlen, length);
9606
    if (length <= 0)
  Branch (9606:9): [True: 0, False: 7.40k]
9607
        return 0;
9608
9609
    _PyUnicode_FastFill(unicode, start, length, fill_char);
9610
    return length;
9611
}
9612
9613
static PyObject *
9614
pad(PyObject *self,
9615
    Py_ssize_t left,
9616
    Py_ssize_t right,
9617
    Py_UCS4 fill)
9618
{
9619
    PyObject *u;
9620
    Py_UCS4 maxchar;
9621
    int kind;
9622
    void *data;
9623
9624
    if (left < 0)
  Branch (9624:9): [True: 0, False: 66.4k]
9625
        left = 0;
9626
    if (right < 0)
  Branch (9626:9): [True: 0, False: 66.4k]
9627
        right = 0;
9628
9629
    if (left == 0 && 
right == 025.4k
)
  Branch (9629:9): [True: 25.4k, False: 40.9k]
  Branch (9629:22): [True: 0, False: 25.4k]
9630
        return unicode_result_unchanged(self);
9631
9632
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
  Branch (9632:9): [True: 0, False: 66.4k]
9633
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
  Branch (9633:9): [True: 0, False: 66.4k]
9634
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9635
        return NULL;
9636
    }
9637
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9638
    maxchar = Py_MAX(maxchar, fill);
9639
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9640
    if (!u)
  Branch (9640:9): [True: 0, False: 66.4k]
9641
        return NULL;
9642
9643
    kind = PyUnicode_KIND(u);
9644
    data = PyUnicode_DATA(u);
9645
    if (left)
  Branch (9645:9): [True: 40.9k, False: 25.4k]
9646
        unicode_fill(kind, data, fill, 0, left);
9647
    if (right)
  Branch (9647:9): [True: 25.9k, False: 40.4k]
9648
        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9649
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9650
    assert(_PyUnicode_CheckConsistency(u, 1));
9651
    return u;
9652
}
9653
9654
/* Split 'string' at line boundaries by dispatching to the splitlines
   routine specialised for its storage kind (ASCII, UCS1, UCS2 or UCS4).
   'keepends' is forwarded unchanged to that routine.

   Returns whatever the specialised routine returns, or NULL when
   ensure_unicode() rejects 'string'. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    Py_ssize_t len;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    len = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        /* 1-byte storage has a dedicated ASCII-only implementation. */
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), len, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), len, keepends);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), len, keepends);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), len, keepends);
    default:
        Py_UNREACHABLE();
    }
}
9688
9689
static PyObject *
9690
split(PyObject *self,
9691
      PyObject *substring,
9692
      Py_ssize_t maxcount)
9693
{
9694
    int kind1, kind2;
9695
    const void *buf1, *buf2;
9696
    Py_ssize_t len1, len2;
9697
    PyObject* out;
9698
9699
    if (maxcount < 0)
  Branch (9699:9): [True: 1.46M, False: 302k]
9700
        maxcount = PY_SSIZE_T_MAX;
9701
9702
    if (substring == NULL)
  Branch (9702:9): [True: 55.2k, False: 1.71M]
9703
        switch (PyUnicode_KIND(self)) {
9704
        case PyUnicode_1BYTE_KIND:
  Branch (9704:9): [True: 55.2k, False: 1]
9705
            if (PyUnicode_IS_ASCII(self))
9706
                return asciilib_split_whitespace(
9707
                    self,  PyUnicode_1BYTE_DATA(self),
9708
                    PyUnicode_GET_LENGTH(self), maxcount
9709
                    );
9710
            else
9711
                return ucs1lib_split_whitespace(
9712
                    self,  PyUnicode_1BYTE_DATA(self),
9713
                    PyUnicode_GET_LENGTH(self), maxcount
9714
                    );
9715
        case PyUnicode_2BYTE_KIND:
  Branch (9715:9): [True: 1, False: 55.2k]
9716
            return ucs2lib_split_whitespace(
9717
                self,  PyUnicode_2BYTE_DATA(self),
9718
                PyUnicode_GET_LENGTH(self), maxcount
9719
                );
9720
        case PyUnicode_4BYTE_KIND:
  Branch (9720:9): [True: 0, False: 55.2k]
9721
            return ucs4lib_split_whitespace(
9722
                self,  PyUnicode_4BYTE_DATA(self),
9723
                PyUnicode_GET_LENGTH(self), maxcount
9724
                );
9725
        default:
  Branch (9725:9): [True: 0, False: 55.2k]
9726
            Py_UNREACHABLE();
9727
        }
9728
9729
    kind1 = PyUnicode_KIND(self);
9730
    kind2 = PyUnicode_KIND(substring);
9731
    len1 = PyUnicode_GET_LENGTH(self);
9732
    len2 = PyUnicode_GET_LENGTH(substring);
9733
    if (kind1 < kind2 || 
len1 < len21.71M
) {
  Branch (9733:9): [True: 6, False: 1.71M]
  Branch (9733:26): [True: 17.8k, False: 1.69M]
9734
        out = PyList_New(1);
9735
        if (out == NULL)
  Branch (9735:13): [True: 0, False: 17.9k]
9736
            return NULL;
9737
        Py_INCREF(self);
9738
        PyList_SET_ITEM(out, 0, self);
9739
        return out;
9740
    }
9741
    buf1 = PyUnicode_DATA(self);
9742
    buf2 = PyUnicode_DATA(substring);
9743
    if (kind2 != kind1) {
  Branch (9743:9): [True: 1.43k, False: 1.69M]
9744
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9745
        if (!buf2)
  Branch (9745:13): [True: 0, False: 1.43k]
9746
            return NULL;
9747
    }
9748
9749
    switch (kind1) {
9750
    case PyUnicode_1BYTE_KIND:
  Branch (9750:5): [True: 1.69M, False: 1.44k]
9751
        if (PyUnicode_IS_ASCII(self) && 
PyUnicode_IS_ASCII1.69M
(substring))
9752
            out = asciilib_split(
9753
                self,  buf1, len1, buf2, len2, maxcount);
9754
        else
9755
            out = ucs1lib_split(
9756
                self,  buf1, len1, buf2, len2, maxcount);
9757
        break;
9758
    case PyUnicode_2BYTE_KIND:
  Branch (9758:5): [True: 1.42k, False: 1.69M]
9759
        out = ucs2lib_split(
9760
            self,  buf1, len1, buf2, len2, maxcount);
9761
        break;
9762
    case PyUnicode_4BYTE_KIND:
  Branch (9762:5): [True: 24, False: 1.69M]
9763
        out = ucs4lib_split(
9764
            self,  buf1, len1, buf2, len2, maxcount);
9765
        break;
9766
    default:
  Branch (9766:5): [True: 0, False: 1.69M]
9767
        out = NULL;
9768
    }
9769
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
9770
    if (kind2 != kind1)
  Branch (9770:9): [True: 1.43k, False: 1.69M]
9771
        PyMem_Free((void *)buf2);
9772
    return out;
9773
}
9774
9775
static PyObject *
9776
rsplit(PyObject *self,
9777
       PyObject *substring,
9778
       Py_ssize_t maxcount)
9779
{
9780
    int kind1, kind2;
9781
    const void *buf1, *buf2;
9782
    Py_ssize_t len1, len2;
9783
    PyObject* out;
9784
9785
    if (maxcount < 0)
  Branch (9785:9): [True: 112, False: 3.17k]
9786
        maxcount = PY_SSIZE_T_MAX;
9787
9788
    if (substring == NULL)
  Branch (9788:9): [True: 76, False: 3.20k]
9789
        switch (PyUnicode_KIND(self)) {
9790
        case PyUnicode_1BYTE_KIND:
  Branch (9790:9): [True: 76, False: 0]
9791
            if (PyUnicode_IS_ASCII(self))
9792
                return asciilib_rsplit_whitespace(
9793
                    self,  PyUnicode_1BYTE_DATA(self),
9794
                    PyUnicode_GET_LENGTH(self), maxcount
9795
                    );
9796
            else
9797
                return ucs1lib_rsplit_whitespace(
9798
                    self,  PyUnicode_1BYTE_DATA(self),
9799
                    PyUnicode_GET_LENGTH(self), maxcount
9800
                    );
9801
        case PyUnicode_2BYTE_KIND:
  Branch (9801:9): [True: 0, False: 76]
9802
            return ucs2lib_rsplit_whitespace(
9803
                self,  PyUnicode_2BYTE_DATA(self),
9804
                PyUnicode_GET_LENGTH(self), maxcount
9805
                );
9806
        case PyUnicode_4BYTE_KIND:
  Branch (9806:9): [True: 0, False: 76]
9807
            return ucs4lib_rsplit_whitespace(
9808
                self,  PyUnicode_4BYTE_DATA(self),
9809
                PyUnicode_GET_LENGTH(self), maxcount
9810
                );
9811
        default:
  Branch (9811:9): [True: 0, False: 76]
9812
            Py_UNREACHABLE();
9813
        }
9814
9815
    kind1 = PyUnicode_KIND(self);
9816
    kind2 = PyUnicode_KIND(substring);
9817
    len1 = PyUnicode_GET_LENGTH(self);
9818
    len2 = PyUnicode_GET_LENGTH(substring);
9819
    if (kind1 < kind2 || 
len1 < len23.20k
) {
  Branch (9819:9): [True: 6, False: 3.20k]
  Branch (9819:26): [True: 7, False: 3.19k]
9820
        out = PyList_New(1);
9821
        if (out == NULL)
  Branch (9821:13): [True: 0, False: 13]
9822
            return NULL;
9823
        Py_INCREF(self);
9824
        PyList_SET_ITEM(out, 0, self);
9825
        return out;
9826
    }
9827
    buf1 = PyUnicode_DATA(self);
9828
    buf2 = PyUnicode_DATA(substring);
9829
    if (kind2 != kind1) {
  Branch (9829:9): [True: 12, False: 3.18k]
9830
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9831
        if (!buf2)
  Branch (9831:13): [True: 0, False: 12]
9832
            return NULL;
9833
    }
9834
9835
    switch (kind1) {
9836
    case PyUnicode_1BYTE_KIND:
  Branch (9836:5): [True: 3.16k, False: 26]
9837
        if (PyUnicode_IS_ASCII(self) && 
PyUnicode_IS_ASCII3.16k
(substring))
9838
            out = asciilib_rsplit(
9839
                self,  buf1, len1, buf2, len2, maxcount);
9840
        else
9841
            out = ucs1lib_rsplit(
9842
                self,  buf1, len1, buf2, len2, maxcount);
9843
        break;
9844
    case PyUnicode_2BYTE_KIND:
  Branch (9844:5): [True: 10, False: 3.18k]
9845
        out = ucs2lib_rsplit(
9846
            self,  buf1, len1, buf2, len2, maxcount);
9847
        break;
9848
    case PyUnicode_4BYTE_KIND:
  Branch (9848:5): [True: 16, False: 3.17k]
9849
        out = ucs4lib_rsplit(
9850
            self,  buf1, len1, buf2, len2, maxcount);
9851
        break;
9852
    default:
  Branch (9852:5): [True: 0, False: 3.19k]
9853
        out = NULL;
9854
    }
9855
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
9856
    if (kind2 != kind1)
  Branch (9856:9): [True: 12, False: 3.18k]
9857
        PyMem_Free((void *)buf2);
9858
    return out;
9859
}
9860
9861
static Py_ssize_t
9862
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
9863
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9864
{
9865
    switch (kind) {
  Branch (9865:13): [True: 0, False: 487k]
9866
    case PyUnicode_1BYTE_KIND:
  Branch (9866:5): [True: 486k, False: 350]
9867
        if (PyUnicode_IS_ASCII(str1) && 
PyUnicode_IS_ASCII474k
(str2))
9868
            return asciilib_find(buf1, len1, buf2, len2, offset);
9869
        else
9870
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9871
    case PyUnicode_2BYTE_KIND:
  Branch (9871:5): [True: 329, False: 486k]
9872
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9873
    case PyUnicode_4BYTE_KIND:
  Branch (9873:5): [True: 21, False: 487k]
9874
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9875
    }
9876
    
Py_UNREACHABLE0
();
9877
}
9878
9879
static Py_ssize_t
9880
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
9881
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9882
{
9883
    switch (kind) {
  Branch (9883:13): [True: 0, False: 775k]
9884
    case PyUnicode_1BYTE_KIND:
  Branch (9884:5): [True: 772k, False: 3.15k]
9885
        if (PyUnicode_IS_ASCII(sstr) && 
PyUnicode_IS_ASCII769k
(str1))
9886
            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9887
        else
9888
            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9889
    case PyUnicode_2BYTE_KIND:
  Branch (9889:5): [True: 2.33k, False: 773k]
9890
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9891
    case PyUnicode_4BYTE_KIND:
  Branch (9891:5): [True: 821, False: 775k]
9892
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9893
    }
9894
    
Py_UNREACHABLE0
();
9895
}
9896
9897
static void
9898
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9899
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9900
{
9901
    int kind = PyUnicode_KIND(u);
9902
    void *data = PyUnicode_DATA(u);
9903
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9904
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (9904:9): [True: 23.5k, False: 33]
9905
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9906
                                      (Py_UCS1 *)data + len,
9907
                                      u1, u2, maxcount);
9908
    }
9909
    else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (9909:14): [True: 14, False: 19]
9910
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9911
                                      (Py_UCS2 *)data + len,
9912
                                      u1, u2, maxcount);
9913
    }
9914
    else {
9915
        assert(kind == PyUnicode_4BYTE_KIND);
9916
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9917
                                      (Py_UCS4 *)data + len,
9918
                                      u1, u2, maxcount);
9919
    }
9920
}
9921
9922
static PyObject *
9923
replace(PyObject *self, PyObject *str1,
9924
        PyObject *str2, Py_ssize_t maxcount)
9925
{
9926
    PyObject *u;
9927
    const char *sbuf = PyUnicode_DATA(self);
9928
    const void *buf1 = PyUnicode_DATA(str1);
9929
    const void *buf2 = PyUnicode_DATA(str2);
9930
    int srelease = 0, release1 = 0, release2 = 0;
9931
    int skind = PyUnicode_KIND(self);
9932
    int kind1 = PyUnicode_KIND(str1);
9933
    int kind2 = PyUnicode_KIND(str2);
9934
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9935
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9936
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9937
    int mayshrink;
9938
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
9939
9940
    if (slen < len1)
  Branch (9940:9): [True: 134k, False: 828k]
9941
        goto nothing;
9942
9943
    if (maxcount < 0)
  Branch (9943:9): [True: 828k, False: 141]
9944
        maxcount = PY_SSIZE_T_MAX;
9945
    else if (maxcount == 0)
  Branch (9945:14): [True: 33, False: 108]
9946
        goto nothing;
9947
9948
    if (str1 == str2)
  Branch (9948:9): [True: 652, False: 828k]
9949
        goto nothing;
9950
9951
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9952
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9953
    if (maxchar < maxchar_str1)
  Branch (9953:9): [True: 40, False: 828k]
9954
        /* substring too wide to be present */
9955
        goto nothing;
9956
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9957
    /* Replacing str1 with str2 may cause a maxchar reduction in the
9958
       result string. */
9959
    mayshrink = (maxchar_str2 < maxchar_str1) && 
(maxchar == maxchar_str1)347
;
  Branch (9959:17): [True: 347, False: 827k]
  Branch (9959:50): [True: 341, False: 6]
9960
    maxchar = Py_MAX(maxchar, maxchar_str2);
9961
9962
    if (len1 == len2) {
  Branch (9962:9): [True: 52.1k, False: 775k]
9963
        /* same length */
9964
        if (len1 == 0)
  Branch (9964:13): [True: 0, False: 52.1k]
9965
            goto nothing;
9966
        if (len1 == 1) {
  Branch (9966:13): [True: 50.9k, False: 1.18k]
9967
            /* replace characters */
9968
            Py_UCS4 u1, u2;
9969
            Py_ssize_t pos;
9970
9971
            u1 = PyUnicode_READ(kind1, buf1, 0);
9972
            pos = findchar(sbuf, skind, slen, u1, 1);
9973
            if (pos < 0)
  Branch (9973:17): [True: 27.3k, False: 23.5k]
9974
                goto nothing;
9975
            u2 = PyUnicode_READ(kind2, buf2, 0);
9976
            u = PyUnicode_New(slen, maxchar);
9977
            if (!u)
  Branch (9977:17): [True: 0, False: 23.5k]
9978
                goto error;
9979
9980
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9981
            replace_1char_inplace(u, pos, u1, u2, maxcount);
9982
        }
9983
        else {
9984
            int rkind = skind;
9985
            char *res;
9986
            Py_ssize_t i;
9987
9988
            if (kind1 < rkind) {
  Branch (9988:17): [True: 0, False: 1.18k]
9989
                /* widen substring */
9990
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
9991
                if (!buf1) goto error;
  Branch (9991:21): [True: 0, False: 0]
9992
                release1 = 1;
9993
            }
9994
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9995
            if (i < 0)
  Branch (9995:17): [True: 801, False: 388]
9996
                goto nothing;
9997
            if (rkind > kind2) {
  Branch (9997:17): [True: 0, False: 388]
9998
                /* widen replacement */
9999
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10000
                if (!buf2) goto error;
  Branch (10000:21): [True: 0, False: 0]
10001
                release2 = 1;
10002
            }
10003
            else if (rkind < kind2) {
  Branch (10003:22): [True: 0, False: 388]
10004
                /* widen self and buf1 */
10005
                rkind = kind2;
10006
                if (release1) {
  Branch (10006:21): [True: 0, False: 0]
10007
                    assert(buf1 != PyUnicode_DATA(str1));
10008
                    PyMem_Free((void *)buf1);
10009
                    buf1 = PyUnicode_DATA(str1);
10010
                    release1 = 0;
10011
                }
10012
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10013
                if (!sbuf) goto error;
  Branch (10013:21): [True: 0, False: 0]
10014
                srelease = 1;
10015
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10016
                if (!buf1) goto error;
  Branch (10016:21): [True: 0, False: 0]
10017
                release1 = 1;
10018
            }
10019
            u = PyUnicode_New(slen, maxchar);
10020
            if (!u)
  Branch (10020:17): [True: 0, False: 388]
10021
                goto error;
10022
            assert(PyUnicode_KIND(u) == rkind);
10023
            res = PyUnicode_DATA(u);
10024
10025
            memcpy(res, sbuf, rkind * slen);
10026
            /* change everything in-place, starting with this one */
10027
            memcpy(res + rkind * i,
10028
                   buf2,
10029
                   rkind * len2);
10030
            i += len1;
10031
10032
            while ( --maxcount > 0) {
  Branch (10032:21): [True: 409, False: 6]
10033
                i = anylib_find(rkind, self,
10034
                                sbuf+rkind*i, slen-i,
10035
                                str1, buf1, len1, i);
10036
                if (i == -1)
  Branch (10036:21): [True: 382, False: 27]
10037
                    break;
10038
                memcpy(res + rkind * i,
10039
                       buf2,
10040
                       rkind * len2);
10041
                i += len1;
10042
            }
10043
        }
10044
    }
10045
    else {
10046
        Py_ssize_t n, i, j, ires;
10047
        Py_ssize_t new_size;
10048
        int rkind = skind;
10049
        char *res;
10050
10051
        if (kind1 < rkind) {
  Branch (10051:13): [True: 2.81k, False: 773k]
10052
            /* widen substring */
10053
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10054
            if (!buf1) 
goto error0
;
  Branch (10054:17): [True: 0, False: 2.81k]
10055
            release1 = 1;
10056
        }
10057
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10058
        if (n == 0)
  Branch (10058:13): [True: 558k, False: 217k]
10059
            goto nothing;
10060
        if (kind2 < rkind) {
  Branch (10060:13): [True: 410, False: 216k]
10061
            /* widen replacement */
10062
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10063
            if (!buf2) 
goto error0
;
  Branch (10063:17): [True: 0, False: 410]
10064
            release2 = 1;
10065
        }
10066
        else if (kind2 > rkind) {
  Branch (10066:18): [True: 5, False: 216k]
10067
            /* widen self and buf1 */
10068
            rkind = kind2;
10069
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10070
            if (!sbuf) 
goto error0
;
  Branch (10070:17): [True: 0, False: 5]
10071
            srelease = 1;
10072
            if (release1) {
  Branch (10072:17): [True: 1, False: 4]
10073
                assert(buf1 != PyUnicode_DATA(str1));
10074
                PyMem_Free((void *)buf1);
10075
                buf1 = PyUnicode_DATA(str1);
10076
                release1 = 0;
10077
            }
10078
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10079
            if (!buf1) 
goto error0
;
  Branch (10079:17): [True: 0, False: 5]
10080
            release1 = 1;
10081
        }
10082
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10083
           PyUnicode_GET_LENGTH(str1)); */
10084
        if (len1 < len2 && 
len2 - len1 > (20.4k
PY_SSIZE_T_MAX20.4k
- slen) / n) {
  Branch (10084:13): [True: 20.4k, False: 196k]
  Branch (10084:28): [True: 0, False: 20.4k]
10085
                PyErr_SetString(PyExc_OverflowError,
10086
                                "replace string is too long");
10087
                goto error;
10088
        }
10089
        new_size = slen + n * (len2 - len1);
10090
        if (new_size == 0) {
  Branch (10090:13): [True: 10.4k, False: 206k]
10091
            u = unicode_new_empty();
10092
            goto done;
10093
        }
10094
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
  Branch (10094:13): [True: 0, False: 206k]
10095
            PyErr_SetString(PyExc_OverflowError,
10096
                            "replace string is too long");
10097
            goto error;
10098
        }
10099
        u = PyUnicode_New(new_size, maxchar);
10100
        if (!u)
  Branch (10100:13): [True: 0, False: 206k]
10101
            goto error;
10102
        assert(PyUnicode_KIND(u) == rkind);
10103
        res = PyUnicode_DATA(u);
10104
        ires = i = 0;
10105
        if (len1 > 0) {
  Branch (10105:13): [True: 206k, False: 28]
10106
            while (n-- > 0) {
  Branch (10106:20): [True: 485k, False: 206k]
10107
                /* look for next match */
10108
                j = anylib_find(rkind, self,
10109
                                sbuf + rkind * i, slen-i,
10110
                                str1, buf1, len1, i);
10111
                if (j == -1)
  Branch (10111:21): [True: 0, False: 485k]
10112
                    break;
10113
                else if (j > i) {
  Branch (10113:26): [True: 472k, False: 12.8k]
10114
                    /* copy unchanged part [i:j] */
10115
                    memcpy(res + rkind * ires,
10116
                           sbuf + rkind * i,
10117
                           rkind * (j-i));
10118
                    ires += j - i;
10119
                }
10120
                /* copy substitution string */
10121
                if (len2 > 0) {
  Branch (10121:21): [True: 284k, False: 200k]
10122
                    memcpy(res + rkind * ires,
10123
                           buf2,
10124
                           rkind * len2);
10125
                    ires += len2;
10126
                }
10127
                i = j + len1;
10128
            }
10129
            if (i < slen)
  Branch (10129:17): [True: 46.9k, False: 159k]
10130
                /* copy tail [i:] */
10131
                memcpy(res + rkind * ires,
10132
                       sbuf + rkind * i,
10133
                       rkind * (slen-i));
10134
        }
10135
        else {
10136
            /* interleave */
10137
            while (n > 0) {
  Branch (10137:20): [True: 66, False: 0]
10138
                memcpy(res + rkind * ires,
10139
                       buf2,
10140
                       rkind * len2);
10141
                ires += len2;
10142
                if (--n <= 0)
  Branch (10142:21): [True: 28, False: 38]
10143
                    break;
10144
                memcpy(res + rkind * ires,
10145
                       sbuf + rkind * i,
10146
                       rkind);
10147
                ires++;
10148
                i++;
10149
            }
10150
            memcpy(res + rkind * ires,
10151
                   sbuf + rkind * i,
10152
                   rkind * (slen-i));
10153
        }
10154
    }
10155
10156
    if (mayshrink) {
  Branch (10156:9): [True: 141, False: 230k]
10157
        unicode_adjust_maxchar(&u);
10158
        if (u == NULL)
  Branch (10158:13): [True: 0, False: 141]
10159
            goto error;
10160
    }
10161
10162
  done:
10163
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10164
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10165
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10166
    if (srelease)
  Branch (10166:9): [True: 5, False: 241k]
10167
        PyMem_Free((void *)sbuf);
10168
    if (release1)
  Branch (10168:9): [True: 101, False: 241k]
10169
        PyMem_Free((void *)buf1);
10170
    if (release2)
  Branch (10170:9): [True: 410, False: 240k]
10171
        PyMem_Free((void *)buf2);
10172
    assert(_PyUnicode_CheckConsistency(u, 1));
10173
    return u;
10174
10175
  nothing:
10176
    /* nothing to replace; return original string (when possible) */
10177
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10178
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10179
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10180
    if (srelease)
  Branch (10180:9): [True: 0, False: 722k]
10181
        PyMem_Free((void *)sbuf);
10182
    if (release1)
  Branch (10182:9): [True: 2.72k, False: 719k]
10183
        PyMem_Free((void *)buf1);
10184
    if (release2)
  Branch (10184:9): [True: 0, False: 722k]
10185
        PyMem_Free((void *)buf2);
10186
    return unicode_result_unchanged(self);
10187
10188
  error:
10189
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10190
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10191
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10192
    if (srelease)
  Branch (10192:9): [True: 0, False: 0]
10193
        PyMem_Free((void *)sbuf);
10194
    if (release1)
  Branch (10194:9): [True: 0, False: 0]
10195
        PyMem_Free((void *)buf1);
10196
    if (release2)
  Branch (10196:9): [True: 0, False: 0]
10197
        PyMem_Free((void *)buf2);
10198
    return NULL;
10199
}
10200
10201
/* --- Unicode Object Methods --------------------------------------------- */
10202
10203
/*[clinic input]
10204
str.title as unicode_title
10205
10206
Return a version of the string where each word is titlecased.
10207
10208
More specifically, words start with uppercased characters and all remaining
10209
cased characters have lower case.
10210
[clinic start generated code]*/
10211
10212
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
    /* All the work is done by case_operation() using the do_title
       callback; its result is returned as is. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10218
10219
/*[clinic input]
10220
str.capitalize as unicode_capitalize
10221
10222
Return a capitalized version of the string.
10223
10224
More specifically, make the first character have upper case and the rest lower
10225
case.
10226
[clinic start generated code]*/
10227
10228
static PyObject *
10229
unicode_capitalize_impl(PyObject *self)
10230
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10231
{
10232
    if (PyUnicode_GET_LENGTH(self) == 0)
  Branch (10232:9): [True: 3, False: 1.07k]
10233
        return unicode_result_unchanged(self);
10234
    return case_operation(self, do_capitalize);
10235
}
10236
10237
/*[clinic input]
10238
str.casefold as unicode_casefold
10239
10240
Return a version of the string suitable for caseless comparisons.
10241
[clinic start generated code]*/
10242
10243
static PyObject *
10244
unicode_casefold_impl(PyObject *self)
10245
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10246
{
10247
    if (PyUnicode_IS_ASCII(self))
10248
        return ascii_upper_or_lower(self, 1);
10249
    return case_operation(self, do_casefold);
10250
}
10251
10252
10253
/* Argument converter. Accepts a single Unicode character. */
10254
10255
static int
10256
convert_uc(PyObject *obj, void *addr)
10257
{
10258
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10259
10260
    if (!PyUnicode_Check(obj)) {
  Branch (10260:9): [True: 0, False: 63]
10261
        PyErr_Format(PyExc_TypeError,
10262
                     "The fill character must be a unicode character, "
10263
                     "not %.100s", Py_TYPE(obj)->tp_name);
10264
        return 0;
10265
    }
10266
    if (PyUnicode_GET_LENGTH(obj) != 1) {
  Branch (10266:9): [True: 0, False: 63]
10267
        PyErr_SetString(PyExc_TypeError,
10268
                        "The fill character must be exactly one character long");
10269
        return 0;
10270
    }
10271
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10272
    return 1;
10273
}
10274
10275
/*[clinic input]
10276
str.center as unicode_center
10277
10278
    width: Py_ssize_t
10279
    fillchar: Py_UCS4 = ' '
10280
    /
10281
10282
Return a centered string of length width.
10283
10284
Padding is done using the specified fill character (default is a space).
10285
[clinic start generated code]*/
10286
10287
static PyObject *
10288
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10289
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10290
{
10291
    Py_ssize_t marg, left;
10292
10293
    if (PyUnicode_GET_LENGTH(self) >= width)
  Branch (10293:9): [True: 2.43k, False: 483]
10294
        return unicode_result_unchanged(self);
10295
10296
    marg = width - PyUnicode_GET_LENGTH(self);
10297
    left = marg / 2 + (marg & width & 1);
10298
10299
    return pad(self, left, marg - left, fillchar);
10300
}
10301
10302
/* This function assumes that str1 and str2 are readied by the caller. */
10303
10304
static int
10305
unicode_compare(PyObject *str1, PyObject *str2)
10306
{
10307
#define COMPARE(TYPE1, TYPE2) \
10308
    
do 4.33k
{ \
10309
        TYPE1* p1 = (TYPE1 *)data1; \
10310
        TYPE2* p2 = (TYPE2 *)data2; \
10311
        TYPE1* end = p1 + len; \
10312
        Py_UCS4 c1, c2; \
10313
        for (; p1 != end; 
p1++, p2++138
) { \
10314
            c1 = *p1; \
10315
            c2 = *p2; \
10316
            if (c1 != c2) \
10317
                
return 4.19k
(c1 < c2)4.19k
?
-12.22k
:
11.97k
; \
10318
        } \
10319
    } \
10320
    while (
0138
)
10321
10322
    int kind1, kind2;
10323
    const void *data1, *data2;
10324
    Py_ssize_t len1, len2, len;
10325
10326
    kind1 = PyUnicode_KIND(str1);
10327
    kind2 = PyUnicode_KIND(str2);
10328
    data1 = PyUnicode_DATA(str1);
10329
    data2 = PyUnicode_DATA(str2);
10330
    len1 = PyUnicode_GET_LENGTH(str1);
10331
    len2 = PyUnicode_GET_LENGTH(str2);
10332
    len = Py_MIN(len1, len2);
10333
10334
    switch(kind1) {
10335
    case PyUnicode_1BYTE_KIND:
  Branch (10335:5): [True: 1.78M, False: 3.84k]
10336
    {
10337
        switch(kind2) {
10338
        case PyUnicode_1BYTE_KIND:
  Branch (10338:9): [True: 1.78M, False: 626]
10339
        {
10340
            int cmp = memcmp(data1, data2, len);
10341
            /* normalize result of memcmp() into the range [-1; 1] */
10342
            if (cmp < 0)
  Branch (10342:17): [True: 907k, False: 879k]
10343
                return -1;
10344
            if (cmp > 0)
  Branch (10344:17): [True: 847k, False: 31.4k]
10345
                return 1;
10346
            break;
10347
        }
10348
        case PyUnicode_2BYTE_KIND:
  Branch (10348:9): [True: 622, False: 1.78M]
10349
            COMPARE(Py_UCS1, Py_UCS2);
10350
            break;
10351
        case PyUnicode_4BYTE_KIND:
  Branch (10351:9): [True: 4, False: 1.78M]
10352
            COMPARE(Py_UCS1, Py_UCS4);
10353
            break;
10354
        default:
  Branch (10354:9): [True: 0, False: 1.78M]
10355
            Py_UNREACHABLE();
10356
        }
10357
        break;
10358
    }
10359
    case PyUnicode_2BYTE_KIND:
  Branch (10359:5): [True: 3.70k, False: 1.78M]
10360
    {
10361
        switch(kind2) {
10362
        case PyUnicode_1BYTE_KIND:
  Branch (10362:9): [True: 343, False: 3.36k]
10363
            COMPARE(Py_UCS2, Py_UCS1);
10364
            break;
10365
        case PyUnicode_2BYTE_KIND:
  Branch (10365:9): [True: 3.35k, False: 345]
10366
        {
10367
            COMPARE(Py_UCS2, Py_UCS2);
10368
            break;
10369
        }
10370
        case PyUnicode_4BYTE_KIND:
  Branch (10370:9): [True: 2, False: 3.70k]
10371
            COMPARE(Py_UCS2, Py_UCS4);
10372
            break;
10373
        default:
  Branch (10373:9): [True: 0, False: 3.70k]
10374
            Py_UNREACHABLE();
10375
        }
10376
        break;
10377
    }
10378
    case PyUnicode_4BYTE_KIND:
  Branch (10378:5): [True: 137, False: 1.79M]
10379
    {
10380
        switch(kind2) {
10381
        case PyUnicode_1BYTE_KIND:
  Branch (10381:9): [True: 4, False: 133]
10382
            COMPARE(Py_UCS4, Py_UCS1);
10383
            break;
10384
        case PyUnicode_2BYTE_KIND:
  Branch (10384:9): [True: 2, False: 135]
10385
            COMPARE(Py_UCS4, Py_UCS2);
10386
            break;
10387
        case PyUnicode_4BYTE_KIND:
  Branch (10387:9): [True: 131, False: 6]
10388
        {
10389
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10390
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10391
            /* normalize result of wmemcmp() into the range [-1; 1] */
10392
            if (cmp < 0)
  Branch (10392:17): [True: 2, False: 129]
10393
                return -1;
10394
            if (cmp > 0)
  Branch (10394:17): [True: 0, False: 129]
10395
                return 1;
10396
#else
10397
            COMPARE(Py_UCS4, Py_UCS4);
10398
#endif
10399
            break;
10400
        }
10401
        default:
  Branch (10401:9): [True: 0, False: 137]
10402
            Py_UNREACHABLE();
10403
        }
10404
        break;
10405
    }
10406
    default:
  Branch (10406:5): [True: 0, False: 1.79M]
10407
        Py_UNREACHABLE();
10408
    }
10409
10410
    if (len1 == len2)
  Branch (10410:9): [True: 20.4k, False: 11.2k]
10411
        return 0;
10412
    if (len1 < len2)
  Branch (10412:9): [True: 1.25k, False: 9.96k]
10413
        return -1;
10414
    else
10415
        return 1;
10416
10417
#undef COMPARE
10418
}
10419
10420
static int
10421
unicode_compare_eq(PyObject *str1, PyObject *str2)
10422
{
10423
    int kind;
10424
    const void *data1, *data2;
10425
    Py_ssize_t len;
10426
    int cmp;
10427
10428
    len = PyUnicode_GET_LENGTH(str1);
10429
    if (PyUnicode_GET_LENGTH(str2) != len)
  Branch (10429:9): [True: 11.2M, False: 13.4M]
10430
        return 0;
10431
    kind = PyUnicode_KIND(str1);
10432
    if (PyUnicode_KIND(str2) != kind)
  Branch (10432:9): [True: 2.52k, False: 13.4M]
10433
        return 0;
10434
    data1 = PyUnicode_DATA(str1);
10435
    data2 = PyUnicode_DATA(str2);
10436
10437
    cmp = memcmp(data1, data2, len * kind);
10438
    return (cmp == 0);
10439
}
10440
10441
int
10442
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
10443
{
10444
    assert(PyUnicode_CheckExact(str1));
10445
    assert(PyUnicode_CheckExact(str2));
10446
    if (str1 == str2) {
  Branch (10446:9): [True: 4.75M, False: 14.6M]
10447
        return 1;
10448
    }
10449
    return unicode_compare_eq(str1, str2);
10450
}
10451
10452
10453
int
10454
PyUnicode_Compare(PyObject *left, PyObject *right)
10455
{
10456
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10457
        /* a string is equal to itself */
10458
        if (left == right)
  Branch (10458:13): [True: 2.86M, False: 159k]
10459
            return 0;
10460
10461
        return unicode_compare(left, right);
10462
    }
10463
    PyErr_Format(PyExc_TypeError,
10464
                 "Can't compare %.100s and %.100s",
10465
                 Py_TYPE(left)->tp_name,
10466
                 Py_TYPE(right)->tp_name);
10467
    return -1;
10468
}
10469
10470
int
10471
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10472
{
10473
    Py_ssize_t i;
10474
    int kind;
10475
    Py_UCS4 chr;
10476
10477
    assert(_PyUnicode_CHECK(uni));
10478
    kind = PyUnicode_KIND(uni);
10479
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (10479:9): [True: 411k, False: 0]
10480
        const void *data = PyUnicode_1BYTE_DATA(uni);
10481
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10482
        size_t len, len2 = strlen(str);
10483
        int cmp;
10484
10485
        len = Py_MIN(len1, len2);
10486
        cmp = memcmp(data, str, len);
10487
        if (cmp != 0) {
  Branch (10487:13): [True: 244k, False: 167k]
10488
            if (cmp < 0)
  Branch (10488:17): [True: 205k, False: 39.1k]
10489
                return -1;
10490
            else
10491
                return 1;
10492
        }
10493
        if (len1 > len2)
  Branch (10493:13): [True: 5, False: 167k]
10494
            return 1; /* uni is longer */
10495
        if (len1 < len2)
  Branch (10495:13): [True: 24, False: 167k]
10496
            return -1; /* str is longer */
10497
        return 0;
10498
    }
10499
    else {
10500
        const void *data = PyUnicode_DATA(uni);
10501
        /* Compare Unicode string and source character set string */
10502
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
  Branch (10502:21): [True: 0, False: 0]
  Branch (10502:62): [True: 0, False: 0]
10503
            if (chr != (unsigned char)str[i])
  Branch (10503:17): [True: 0, False: 0]
10504
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
  Branch (10504:24): [True: 0, False: 0]
10505
        /* This check keeps Python strings that end in '\0' from comparing equal
10506
         to C strings identical up to that point. */
10507
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
  Branch (10507:13): [True: 0, False: 0]
  Branch (10507:47): [True: 0, False: 0]
10508
            return 1; /* uni is longer */
10509
        if (str[i])
  Branch (10509:13): [True: 0, False: 0]
10510
            return -1; /* str is longer */
10511
        return 0;
10512
    }
10513
}
10514
10515
int
10516
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10517
{
10518
    size_t len;
10519
    assert(_PyUnicode_CHECK(unicode));
10520
    assert(str);
10521
#ifndef NDEBUG
10522
    for (const char *p = str; *p; p++) {
10523
        assert((unsigned char)*p < 128);
10524
    }
10525
#endif
10526
    if (!PyUnicode_IS_ASCII(unicode))
  Branch (10526:9): [True: 1.58k, False: 2.30M]
10527
        return 0;
10528
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
10529
    return strlen(str) == len &&
  Branch (10529:12): [True: 210k, False: 2.09M]
10530
           
memcmp(210k
PyUnicode_1BYTE_DATA210k
(unicode), str, len) == 0;
  Branch (10530:12): [True: 41.5k, False: 168k]
10531
}
10532
10533
int
10534
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
10535
{
10536
    PyObject *right_uni;
10537
10538
    assert(_PyUnicode_CHECK(left));
10539
    assert(right->string);
10540
#ifndef NDEBUG
10541
    for (const char *p = right->string; *p; p++) {
10542
        assert((unsigned char)*p < 128);
10543
    }
10544
#endif
10545
10546
    if (!PyUnicode_IS_ASCII(left))
  Branch (10546:9): [True: 0, False: 0]
10547
        return 0;
10548
10549
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
10550
    if (right_uni == NULL) {
  Branch (10550:9): [True: 0, False: 0]
10551
        /* memory error or bad data */
10552
        PyErr_Clear();
10553
        return _PyUnicode_EqualToASCIIString(left, right->string);
10554
    }
10555
10556
    if (left == right_uni)
  Branch (10556:9): [True: 0, False: 0]
10557
        return 1;
10558
10559
    if (PyUnicode_CHECK_INTERNED(left))
10560
        return 0;
10561
10562
    assert(_PyUnicode_HASH(right_uni) != -1);
10563
    Py_hash_t hash = _PyUnicode_HASH(left);
10564
    if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
  Branch (10564:9): [True: 0, False: 0]
  Branch (10564:23): [True: 0, False: 0]
10565
        return 0;
10566
    }
10567
10568
    return unicode_compare_eq(left, right_uni);
10569
}
10570
10571
PyObject *
10572
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10573
{
10574
    int result;
10575
10576
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
  Branch (10576:9): [True: 0, False: 12.9M]
  Branch (10576:35): [True: 454k, False: 12.5M]
10577
        Py_RETURN_NOTIMPLEMENTED;
10578
10579
    if (left == right) {
  Branch (10579:9): [True: 850k, False: 11.6M]
10580
        switch (op) {
10581
        case Py_EQ:
  Branch (10581:9): [True: 718k, False: 131k]
10582
        case Py_LE:
  Branch (10582:9): [True: 73, False: 850k]
10583
        case Py_GE:
  Branch (10583:9): [True: 1, False: 850k]
10584
            /* a string is equal to itself */
10585
            Py_RETURN_TRUE;
10586
        case Py_NE:
  Branch (10586:9): [True: 121k, False: 729k]
10587
        case Py_LT:
  Branch (10587:9): [True: 8.64k, False: 841k]
10588
        case Py_GT:
  Branch (10588:9): [True: 2.00k, False: 848k]
10589
            Py_RETURN_FALSE;
10590
        default:
  Branch (10590:9): [True: 0, False: 850k]
10591
            PyErr_BadArgument();
10592
            return NULL;
10593
        }
10594
    }
10595
    else if (op == Py_EQ || 
op == 2.49M
Py_NE2.49M
) {
  Branch (10595:14): [True: 9.17M, False: 2.49M]
  Branch (10595:29): [True: 860k, False: 1.63M]
10596
        result = unicode_compare_eq(left, right);
10597
        result ^= (op == Py_NE);
10598
        return PyBool_FromLong(result);
10599
    }
10600
    else {
10601
        result = unicode_compare(left, right);
10602
        Py_RETURN_RICHCOMPARE(result, 0, op);
10603
    }
10604
}
10605
10606
int
10607
_PyUnicode_EQ(PyObject *aa, PyObject *bb)
10608
{
10609
    return unicode_eq(aa, bb);
10610
}
10611
10612
int
10613
PyUnicode_Contains(PyObject *str, PyObject *substr)
10614
{
10615
    int kind1, kind2;
10616
    const void *buf1, *buf2;
10617
    Py_ssize_t len1, len2;
10618
    int result;
10619
10620
    if (!PyUnicode_Check(substr)) {
  Branch (10620:9): [True: 2, False: 5.80M]
10621
        PyErr_Format(PyExc_TypeError,
10622
                     "'in <string>' requires string as left operand, not %.100s",
10623
                     Py_TYPE(substr)->tp_name);
10624
        return -1;
10625
    }
10626
    if (ensure_unicode(str) < 0)
  Branch (10626:9): [True: 0, False: 5.80M]
10627
        return -1;
10628
10629
    kind1 = PyUnicode_KIND(str);
10630
    kind2 = PyUnicode_KIND(substr);
10631
    if (kind1 < kind2)
  Branch (10631:9): [True: 494, False: 5.80M]
10632
        return 0;
10633
    len1 = PyUnicode_GET_LENGTH(str);
10634
    len2 = PyUnicode_GET_LENGTH(substr);
10635
    if (len1 < len2)
  Branch (10635:9): [True: 191k, False: 5.61M]
10636
        return 0;
10637
    buf1 = PyUnicode_DATA(str);
10638
    buf2 = PyUnicode_DATA(substr);
10639
    if (len2 == 1) {
  Branch (10639:9): [True: 4.21M, False: 1.39M]
10640
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10641
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10642
        return result;
10643
    }
10644
    if (kind2 != kind1) {
  Branch (10644:9): [True: 63, False: 1.39M]
10645
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10646
        if (!buf2)
  Branch (10646:13): [True: 0, False: 63]
10647
            return -1;
10648
    }
10649
10650
    switch (kind1) {
10651
    case PyUnicode_1BYTE_KIND:
  Branch (10651:5): [True: 1.39M, False: 78]
10652
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10653
        break;
10654
    case PyUnicode_2BYTE_KIND:
  Branch (10654:5): [True: 57, False: 1.39M]
10655
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10656
        break;
10657
    case PyUnicode_4BYTE_KIND:
  Branch (10657:5): [True: 21, False: 1.39M]
10658
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10659
        break;
10660
    default:
  Branch (10660:5): [True: 0, False: 1.39M]
10661
        Py_UNREACHABLE();
10662
    }
10663
10664
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
10665
    if (kind2 != kind1)
  Branch (10665:9): [True: 63, False: 1.39M]
10666
        PyMem_Free((void *)buf2);
10667
10668
    return result;
10669
}
10670
10671
/* Concat to string or Unicode object giving a new Unicode object. */
10672
10673
PyObject *
10674
PyUnicode_Concat(PyObject *left, PyObject *right)
10675
{
10676
    PyObject *result;
10677
    Py_UCS4 maxchar, maxchar2;
10678
    Py_ssize_t left_len, right_len, new_len;
10679
10680
    if (ensure_unicode(left) < 0)
  Branch (10680:9): [True: 0, False: 6.16M]
10681
        return NULL;
10682
10683
    if (!PyUnicode_Check(right)) {
  Branch (10683:9): [True: 22, False: 6.16M]
10684
        PyErr_Format(PyExc_TypeError,
10685
                     "can only concatenate str (not \"%.200s\") to str",
10686
                     Py_TYPE(right)->tp_name);
10687
        return NULL;
10688
    }
10689
10690
    /* Shortcuts */
10691
    PyObject *empty = unicode_get_empty();  // Borrowed reference
10692
    if (left == empty) {
  Branch (10692:9): [True: 321k, False: 5.84M]
10693
        return PyUnicode_FromObject(right);
10694
    }
10695
    if (right == empty) {
  Branch (10695:9): [True: 221k, False: 5.62M]
10696
        return PyUnicode_FromObject(left);
10697
    }
10698
10699
    left_len = PyUnicode_GET_LENGTH(left);
10700
    right_len = PyUnicode_GET_LENGTH(right);
10701
    if (left_len > PY_SSIZE_T_MAX - right_len) {
  Branch (10701:9): [True: 0, False: 5.62M]
10702
        PyErr_SetString(PyExc_OverflowError,
10703
                        "strings are too large to concat");
10704
        return NULL;
10705
    }
10706
    new_len = left_len + right_len;
10707
10708
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10709
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10710
    maxchar = Py_MAX(maxchar, maxchar2);
10711
10712
    /* Concat the two Unicode strings */
10713
    result = PyUnicode_New(new_len, maxchar);
10714
    if (result == NULL)
  Branch (10714:9): [True: 0, False: 5.62M]
10715
        return NULL;
10716
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
10717
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
10718
    assert(_PyUnicode_CheckConsistency(result, 1));
10719
    return result;
10720
}
10721
10722
void
10723
PyUnicode_Append(PyObject **p_left, PyObject *right)
10724
{
10725
    PyObject *left, *res;
10726
    Py_UCS4 maxchar, maxchar2;
10727
    Py_ssize_t left_len, right_len, new_len;
10728
10729
    if (p_left == NULL) {
  Branch (10729:9): [True: 0, False: 2.06M]
10730
        if (!PyErr_Occurred())
  Branch (10730:13): [True: 0, False: 0]
10731
            PyErr_BadInternalCall();
10732
        return;
10733
    }
10734
    left = *p_left;
10735
    if (right == NULL || left == NULL
  Branch (10735:9): [True: 0, False: 2.06M]
  Branch (10735:26): [True: 0, False: 2.06M]
10736
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
  Branch (10736:12): [True: 0, False: 2.06M]
  Branch (10736:38): [True: 0, False: 2.06M]
10737
        if (!PyErr_Occurred())
  Branch (10737:13): [True: 0, False: 0]
10738
            PyErr_BadInternalCall();
10739
        goto error;
10740
    }
10741
10742
    /* Shortcuts */
10743
    PyObject *empty = unicode_get_empty();  // Borrowed reference
10744
    if (left == empty) {
  Branch (10744:9): [True: 237k, False: 1.82M]
10745
        Py_DECREF(left);
10746
        Py_INCREF(right);
10747
        *p_left = right;
10748
        return;
10749
    }
10750
    if (right == empty) {
  Branch (10750:9): [True: 29.0k, False: 1.79M]
10751
        return;
10752
    }
10753
10754
    left_len = PyUnicode_GET_LENGTH(left);
10755
    right_len = PyUnicode_GET_LENGTH(right);
10756
    if (left_len > PY_SSIZE_T_MAX - right_len) {
  Branch (10756:9): [True: 0, False: 1.79M]
10757
        PyErr_SetString(PyExc_OverflowError,
10758
                        "strings are too large to concat");
10759
        goto error;
10760
    }
10761
    new_len = left_len + right_len;
10762
10763
    if (unicode_modifiable(left)
  Branch (10763:9): [True: 868k, False: 926k]
10764
        && PyUnicode_CheckExact(right)
10765
        && 
PyUnicode_KIND868k
(right) <= 868k
PyUnicode_KIND868k
(left)
  Branch (10765:12): [True: 868k, False: 143]
10766
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10767
           to change the structure size, but characters are stored just after
10768
           the structure, and so it requires to move all characters which is
10769
           not so different than duplicating the string. */
10770
        && 
!(868k
PyUnicode_IS_ASCII868k
(left) &&
!841k
PyUnicode_IS_ASCII841k
(right)))
  Branch (10770:42): [True: 364, False: 841k]
10771
    {
10772
        /* append inplace */
10773
        if (unicode_resize(p_left, new_len) != 0)
  Branch (10773:13): [True: 0, False: 867k]
10774
            goto error;
10775
10776
        /* copy 'right' into the newly allocated area of 'left' */
10777
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10778
    }
10779
    else {
10780
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10781
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10782
        maxchar = Py_MAX(maxchar, maxchar2);
10783
10784
        /* Concat the two Unicode strings */
10785
        res = PyUnicode_New(new_len, maxchar);
10786
        if (res == NULL)
  Branch (10786:13): [True: 0, False: 927k]
10787
            goto error;
10788
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10789
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10790
        Py_DECREF(left);
10791
        *p_left = res;
10792
    }
10793
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10794
    return;
10795
10796
error:
10797
    Py_CLEAR(*p_left);
10798
}
10799
10800
void
10801
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10802
{
10803
    PyUnicode_Append(pleft, right);
10804
    Py_XDECREF(right);
10805
}
10806
10807
/*
10808
Wraps asciilib_parse_args_finds() and additionally ensures that the
10809
first argument is a unicode object.
10810
*/
10811
10812
static inline int
10813
parse_args_finds_unicode(const char * function_name, PyObject *args,
10814
                         PyObject **substring,
10815
                         Py_ssize_t *start, Py_ssize_t *end)
10816
{
10817
    if (asciilib_parse_args_finds(function_name, args, substring, start, end)) {
  Branch (10817:9): [True: 2.13M, False: 16]
10818
        if (ensure_unicode(*substring) < 0)
  Branch (10818:13): [True: 11, False: 2.13M]
10819
            return 0;
10820
        return 1;
10821
    }
10822
    return 0;
10823
}
10824
10825
PyDoc_STRVAR(count__doc__,
10826
             "S.count(sub[, start[, end]]) -> int\n\
10827
\n\
10828
Return the number of non-overlapping occurrences of substring sub in\n\
10829
string S[start:end].  Optional arguments start and end are\n\
10830
interpreted as in slice notation.");
10831
10832
static PyObject *
10833
unicode_count(PyObject *self, PyObject *args)
10834
{
10835
    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
10836
    Py_ssize_t start = 0;
10837
    Py_ssize_t end = PY_SSIZE_T_MAX;
10838
    PyObject *result;
10839
    int kind1, kind2;
10840
    const void *buf1, *buf2;
10841
    Py_ssize_t len1, len2, iresult;
10842
10843
    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
  Branch (10843:9): [True: 5, False: 318k]
10844
        return NULL;
10845
10846
    kind1 = PyUnicode_KIND(self);
10847
    kind2 = PyUnicode_KIND(substring);
10848
    if (kind1 < kind2)
  Branch (10848:9): [True: 6, False: 318k]
10849
        return PyLong_FromLong(0);
10850
10851
    len1 = PyUnicode_GET_LENGTH(self);
10852
    len2 = PyUnicode_GET_LENGTH(substring);
10853
    ADJUST_INDICES(start, end, len1);
10854
    if (end - start < len2)
  Branch (10854:9): [True: 104k, False: 214k]
10855
        return PyLong_FromLong(0);
10856
10857
    buf1 = PyUnicode_DATA(self);
10858
    buf2 = PyUnicode_DATA(substring);
10859
    if (kind2 != kind1) {
  Branch (10859:9): [True: 11.8k, False: 202k]
10860
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10861
        if (!buf2)
  Branch (10861:13): [True: 0, False: 11.8k]
10862
            return NULL;
10863
    }
10864
    switch (kind1) {
10865
    case PyUnicode_1BYTE_KIND:
  Branch (10865:5): [True: 202k, False: 11.8k]
10866
        iresult = ucs1lib_count(
10867
            ((const Py_UCS1*)buf1) + start, end - start,
10868
            buf2, len2, PY_SSIZE_T_MAX
10869
            );
10870
        break;
10871
    case PyUnicode_2BYTE_KIND:
  Branch (10871:5): [True: 11.8k, False: 202k]
10872
        iresult = ucs2lib_count(
10873
            ((const Py_UCS2*)buf1) + start, end - start,
10874
            buf2, len2, PY_SSIZE_T_MAX
10875
            );
10876
        break;
10877
    case PyUnicode_4BYTE_KIND:
  Branch (10877:5): [True: 4, False: 214k]
10878
        iresult = ucs4lib_count(
10879
            ((const Py_UCS4*)buf1) + start, end - start,
10880
            buf2, len2, PY_SSIZE_T_MAX
10881
            );
10882
        break;
10883
    default:
  Branch (10883:5): [True: 0, False: 214k]
10884
        Py_UNREACHABLE();
10885
    }
10886
10887
    result = PyLong_FromSsize_t(iresult);
10888
10889
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
10890
    if (kind2 != kind1)
  Branch (10890:9): [True: 11.8k, False: 202k]
10891
        PyMem_Free((void *)buf2);
10892
10893
    return result;
10894
}
10895
10896
/*[clinic input]
10897
str.encode as unicode_encode
10898
10899
    encoding: str(c_default="NULL") = 'utf-8'
10900
        The encoding in which to encode the string.
10901
    errors: str(c_default="NULL") = 'strict'
10902
        The error handling scheme to use for encoding errors.
10903
        The default is 'strict' meaning that encoding errors raise a
10904
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
10905
        'xmlcharrefreplace' as well as any other name registered with
10906
        codecs.register_error that can handle UnicodeEncodeErrors.
10907
10908
Encode the string using the codec registered for encoding.
10909
[clinic start generated code]*/
10910
10911
static PyObject *
10912
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
10913
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
10914
{
10915
    return PyUnicode_AsEncodedString(self, encoding, errors);
10916
}
10917
10918
/*[clinic input]
10919
str.expandtabs as unicode_expandtabs
10920
10921
    tabsize: int = 8
10922
10923
Return a copy where all tab characters are expanded using spaces.
10924
10925
If tabsize is not given, a tab size of 8 characters is assumed.
10926
[clinic start generated code]*/
10927
10928
static PyObject *
10929
unicode_expandtabs_impl(PyObject *self, int tabsize)
10930
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
10931
{
10932
    Py_ssize_t i, j, line_pos, src_len, incr;
10933
    Py_UCS4 ch;
10934
    PyObject *u;
10935
    const void *src_data;
10936
    void *dest_data;
10937
    int kind;
10938
    int found;
10939
10940
    /* First pass: determine size of output string */
10941
    src_len = PyUnicode_GET_LENGTH(self);
10942
    i = j = line_pos = 0;
10943
    kind = PyUnicode_KIND(self);
10944
    src_data = PyUnicode_DATA(self);
10945
    found = 0;
10946
    for (; i < src_len; 
i++2.67M
) {
  Branch (10946:12): [True: 2.67M, False: 39.4k]
10947
        ch = PyUnicode_READ(kind, src_data, i);
10948
        if (ch == '\t') {
  Branch (10948:13): [True: 781, False: 2.67M]
10949
            found = 1;
10950
            if (tabsize > 0) {
  Branch (10950:17): [True: 781, False: 0]
10951
                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10952
                if (j > PY_SSIZE_T_MAX - incr)
  Branch (10952:21): [True: 0, False: 781]
10953
                    goto overflow;
10954
                line_pos += incr;
10955
                j += incr;
10956
            }
10957
        }
10958
        else {
10959
            if (j > PY_SSIZE_T_MAX - 1)
  Branch (10959:17): [True: 0, False: 2.67M]
10960
                goto overflow;
10961
            line_pos++;
10962
            j++;
10963
            if (ch == '\n' || 
ch == '\r'2.60M
)
  Branch (10963:17): [True: 74.2k, False: 2.60M]
  Branch (10963:31): [True: 25, False: 2.60M]
10964
                line_pos = 0;
10965
        }
10966
    }
10967
    if (!found)
  Branch (10967:9): [True: 39.3k, False: 59]
10968
        return unicode_result_unchanged(self);
10969
10970
    /* Second pass: create output string and fill it */
10971
    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10972
    if (!u)
  Branch (10972:9): [True: 0, False: 59]
10973
        return NULL;
10974
    dest_data = PyUnicode_DATA(u);
10975
10976
    i = j = line_pos = 0;
10977
10978
    for (; i < src_len; 
i++6.88k
) {
  Branch (10978:12): [True: 6.88k, False: 59]
10979
        ch = PyUnicode_READ(kind, src_data, i);
10980
        if (ch == '\t') {
  Branch (10980:13): [True: 781, False: 6.09k]
10981
            if (tabsize > 0) {
  Branch (10981:17): [True: 781, False: 0]
10982
                incr = tabsize - (line_pos % tabsize);
10983
                line_pos += incr;
10984
                unicode_fill(kind, dest_data, ' ', j, incr);
10985
                j += incr;
10986
            }
10987
        }
10988
        else {
10989
            line_pos++;
10990
            PyUnicode_WRITE(kind, dest_data, j, ch);
10991
            j++;
10992
            if (ch == '\n' || 
ch == '\r'6.00k
)
  Branch (10992:17): [True: 94, False: 6.00k]
  Branch (10992:31): [True: 16, False: 5.98k]
10993
                line_pos = 0;
10994
        }
10995
    }
10996
    assert (j == PyUnicode_GET_LENGTH(u));
10997
    return unicode_result(u);
10998
10999
  overflow:
11000
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11001
    return NULL;
11002
}
11003
11004
PyDoc_STRVAR(find__doc__,
11005
             "S.find(sub[, start[, end]]) -> int\n\
11006
\n\
11007
Return the lowest index in S where substring sub is found,\n\
11008
such that sub is contained within S[start:end].  Optional\n\
11009
arguments start and end are interpreted as in slice notation.\n\
11010
\n\
11011
Return -1 on failure.");
11012
11013
static PyObject *
11014
unicode_find(PyObject *self, PyObject *args)
11015
{
11016
    /* initialize variables to prevent gcc warning */
11017
    PyObject *substring = NULL;
11018
    Py_ssize_t start = 0;
11019
    Py_ssize_t end = 0;
11020
    Py_ssize_t result;
11021
11022
    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
  Branch (11022:9): [True: 7, False: 819k]
11023
        return NULL;
11024
11025
    result = any_find_slice(self, substring, start, end, 1);
11026
11027
    if (result == -2)
  Branch (11027:9): [True: 0, False: 819k]
11028
        return NULL;
11029
11030
    return PyLong_FromSsize_t(result);
11031
}
11032
11033
static PyObject *
11034
unicode_getitem(PyObject *self, Py_ssize_t index)
11035
{
11036
    const void *data;
11037
    int kind;
11038
    Py_UCS4 ch;
11039
11040
    if (!PyUnicode_Check(self)) {
  Branch (11040:9): [True: 0, False: 11.5M]
11041
        PyErr_BadArgument();
11042
        return NULL;
11043
    }
11044
    if (index < 0 || 
index >= 11.5M
PyUnicode_GET_LENGTH11.5M
(self)) {
  Branch (11044:9): [True: 1, False: 11.5M]
  Branch (11044:22): [True: 10.6k, False: 11.5M]
11045
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11046
        return NULL;
11047
    }
11048
    kind = PyUnicode_KIND(self);
11049
    data = PyUnicode_DATA(self);
11050
    ch = PyUnicode_READ(kind, data, index);
11051
    return unicode_char(ch);
11052
}
11053
11054
/* Believe it or not, this produces the same value for ASCII strings
11055
   as bytes_hash(). */
11056
static Py_hash_t
11057
unicode_hash(PyObject *self)
11058
{
11059
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11060
11061
#ifdef Py_DEBUG
11062
    assert(_Py_HashSecret_Initialized);
11063
#endif
11064
    if (_PyUnicode_HASH(self) != -1)
  Branch (11064:9): [True: 13.2M, False: 15.3M]
11065
        return _PyUnicode_HASH(self);
11066
11067
    x = _Py_HashBytes(PyUnicode_DATA(self),
11068
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11069
    _PyUnicode_HASH(self) = x;
11070
    return x;
11071
}
11072
11073
PyDoc_STRVAR(index__doc__,
11074
             "S.index(sub[, start[, end]]) -> int\n\
11075
\n\
11076
Return the lowest index in S where substring sub is found,\n\
11077
such that sub is contained within S[start:end].  Optional\n\
11078
arguments start and end are interpreted as in slice notation.\n\
11079
\n\
11080
Raises ValueError when the substring is not found.");
11081
11082
static PyObject *
11083
unicode_index(PyObject *self, PyObject *args)
11084
{
11085
    /* initialize variables to prevent gcc warning */
11086
    Py_ssize_t result;
11087
    PyObject *substring = NULL;
11088
    Py_ssize_t start = 0;
11089
    Py_ssize_t end = 0;
11090
11091
    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
  Branch (11091:9): [True: 5, False: 21.0k]
11092
        return NULL;
11093
11094
    result = any_find_slice(self, substring, start, end, 1);
11095
11096
    if (result == -2)
  Branch (11096:9): [True: 0, False: 21.0k]
11097
        return NULL;
11098
11099
    if (result < 0) {
  Branch (11099:9): [True: 1.42k, False: 19.6k]
11100
        PyErr_SetString(PyExc_ValueError, "substring not found");
11101
        return NULL;
11102
    }
11103
11104
    return PyLong_FromSsize_t(result);
11105
}
11106
11107
/*[clinic input]
11108
str.isascii as unicode_isascii
11109
11110
Return True if all characters in the string are ASCII, False otherwise.
11111
11112
ASCII characters have code points in the range U+0000-U+007F.
11113
Empty string is ASCII too.
11114
[clinic start generated code]*/
11115
11116
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan is needed: the answer is read straight from the object via
       PyUnicode_IS_ASCII() and converted to a Python bool. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11122
11123
/*[clinic input]
11124
str.islower as unicode_islower
11125
11126
Return True if the string is a lowercase string, False otherwise.
11127
11128
A string is lowercase if all cased characters in the string are lowercase and
11129
there is at least one cased character in the string.
11130
[clinic start generated code]*/
11131
11132
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
{
    /* True when no character is upper- or title-case and at least one
       character is lowercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* An empty string has no cased character. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path: a lone character is lowercase or it is not. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one lowercase character was found, stop re-testing. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11165
11166
/*[clinic input]
11167
str.isupper as unicode_isupper
11168
11169
Return True if the string is an uppercase string, False otherwise.
11170
11171
A string is uppercase if all cased characters in the string are uppercase and
11172
there is at least one cased character in the string.
11173
[clinic start generated code]*/
11174
11175
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{
    /* True when no character is lower- or title-case and at least one
       character is uppercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* An empty string has no cased character. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path: a lone character is uppercase or it is not. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one uppercase character was found, stop re-testing. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
        pos++;
    }
    return PyBool_FromLong(seen_upper);
}
11208
11209
/*[clinic input]
11210
str.istitle as unicode_istitle
11211
11212
Return True if the string is a title-cased string, False otherwise.
11213
11214
In a title-cased string, upper- and title-case characters may only
11215
follow uncased characters and lowercase characters only cased ones.
11216
[clinic start generated code]*/
11217
11218
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Walk the string tracking whether the previous character was cased:
       an upper/title-case character must not follow a cased one, and a
       lowercase character must follow a cased one. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* An empty string has no cased character. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path: a lone character qualifies if it is title- or uppercase. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;      /* at least one cased character was seen */
    int after_cased = 0;    /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A capital in the middle of a word breaks title case. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            any_cased = 1;
            continue;
        }
        if (Py_UNICODE_ISLOWER(ch)) {
            /* A lowercase letter may not start a word. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            any_cased = 1;
            continue;
        }
        /* Uncased character: the next cased one starts a new word. */
        after_cased = 0;
    }
    return PyBool_FromLong(any_cased);
}
11264
11265
/*[clinic input]
11266
str.isspace as unicode_isspace
11267
11268
Return True if the string is a whitespace string, False otherwise.
11269
11270
A string is whitespace if all characters in the string are whitespace and there
11271
is at least one character in the string.
11272
[clinic start generated code]*/
11273
11274
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11302
11303
/*[clinic input]
11304
str.isalpha as unicode_isalpha
11305
11306
Return True if the string is an alphabetic string, False otherwise.
11307
11308
A string is alphabetic if all characters in the string are alphabetic and there
11309
is at least one character in the string.
11310
[clinic start generated code]*/
11311
11312
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISALPHA(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11339
11340
/*[clinic input]
11341
str.isalnum as unicode_isalnum
11342
11343
Return True if the string is an alpha-numeric string, False otherwise.
11344
11345
A string is alpha-numeric if all characters in the string are alpha-numeric and
11346
there is at least one character in the string.
11347
[clinic start generated code]*/
11348
11349
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11378
11379
/*[clinic input]
11380
str.isdecimal as unicode_isdecimal
11381
11382
Return True if the string is a decimal string, False otherwise.
11383
11384
A string is a decimal string if all characters in the string are decimal and
11385
there is at least one character in the string.
11386
[clinic start generated code]*/
11387
11388
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11415
11416
/*[clinic input]
11417
str.isdigit as unicode_isdigit
11418
11419
Return True if the string is a digit string, False otherwise.
11420
11421
A string is a digit string if all characters in the string are digits and there
11422
is at least one character in the string.
11423
[clinic start generated code]*/
11424
11425
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11453
11454
/*[clinic input]
11455
str.isnumeric as unicode_isnumeric
11456
11457
Return True if the string is a numeric string, False otherwise.
11458
11459
A string is numeric if all characters in the string are numeric and there is at
11460
least one character in the string.
11461
[clinic start generated code]*/
11462
11463
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
{
    /* True only for a non-empty string whose every character satisfies
       Py_UNICODE_ISNUMERIC(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Empty strings never qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* Fast path for a single character. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11490
11491
Py_ssize_t
11492
_PyUnicode_ScanIdentifier(PyObject *self)
11493
{
11494
    Py_ssize_t i;
11495
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11496
    if (len == 0) {
  Branch (11496:9): [True: 13, False: 518k]
11497
        /* an empty string is not a valid identifier */
11498
        return 0;
11499
    }
11500
11501
    int kind = PyUnicode_KIND(self);
11502
    const void *data = PyUnicode_DATA(self);
11503
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11504
    /* PEP 3131 says that the first character must be in
11505
       XID_Start and subsequent characters in XID_Continue,
11506
       and for the ASCII range, the 2.x rules apply (i.e
11507
       start with letters and underscore, continue with
11508
       letters, digits, underscore). However, given the current
11509
       definition of XID_Start and XID_Continue, it is sufficient
11510
       to check just for these, except that _ must be allowed
11511
       as starting an identifier.  */
11512
    if (!_PyUnicode_IsXidStart(ch) && 
ch != 0x5F282k
/* LOW LINE */) {
  Branch (11512:9): [True: 282k, False: 236k]
  Branch (11512:39): [True: 273k, False: 8.37k]
11513
        return 0;
11514
    }
11515
11516
    
for (i = 1; 245k
i < len;
i++120k
) {
  Branch (11516:17): [True: 120k, False: 245k]
11517
        ch = PyUnicode_READ(kind, data, i);
11518
        if (!_PyUnicode_IsXidContinue(ch)) {
  Branch (11518:13): [True: 23, False: 120k]
11519
            return i;
11520
        }
11521
    }
11522
    return i;
11523
}
11524
11525
int
11526
PyUnicode_IsIdentifier(PyObject *self)
11527
{
11528
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
11529
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11530
    /* an empty string is not a valid identifier */
11531
    return len && 
i == len518k
;
  Branch (11531:12): [True: 518k, False: 13]
  Branch (11531:19): [True: 244k, False: 273k]
11532
}
11533
11534
/*[clinic input]
11535
str.isidentifier as unicode_isidentifier
11536
11537
Return True if the string is a valid Python identifier, False otherwise.
11538
11539
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
11540
such as "def" or "class".
11541
[clinic start generated code]*/
11542
11543
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
{
    /* Thin wrapper: convert the 0/1 answer of PyUnicode_IsIdentifier()
       into a Python bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
11549
11550
/*[clinic input]
11551
str.isprintable as unicode_isprintable
11552
11553
Return True if the string is printable, False otherwise.
11554
11555
A string is printable if all of its characters are considered printable in
11556
repr() or if it is empty.
11557
[clinic start generated code]*/
11558
11559
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
{
    /* True when every character satisfies Py_UNICODE_ISPRINTABLE().
       Unlike the other is*() predicates there is no empty-string
       rejection: with zero characters the loop never runs and the
       result is True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path for a single character. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11583
11584
/*[clinic input]
11585
str.join as unicode_join
11586
11587
    iterable: object
11588
    /
11589
11590
Concatenate any number of strings.
11591
11592
The string whose method is called is inserted in between each given string.
11593
The result is returned as a new string.
11594
11595
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
11596
[clinic start generated code]*/
11597
11598
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{
    /* Method-table entry point for str.join(): all the work, including
       error reporting, is delegated to PyUnicode_Join() with `self` as
       the separator. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
11604
11605
static Py_ssize_t
11606
unicode_length(PyObject *self)
11607
{
11608
    return PyUnicode_GET_LENGTH(self);
11609
}
11610
11611
/*[clinic input]
11612
str.ljust as unicode_ljust
11613
11614
    width: Py_ssize_t
11615
    fillchar: Py_UCS4 = ' '
11616
    /
11617
11618
Return a left-justified string of length width.
11619
11620
Padding is done using the specified fill character (default is a space).
11621
[clinic start generated code]*/
11622
11623
static PyObject *
11624
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11625
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
11626
{
11627
    if (PyUnicode_GET_LENGTH(self) >= width)
  Branch (11627:9): [True: 110, False: 25.4k]
11628
        return unicode_result_unchanged(self);
11629
11630
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11631
}
11632
11633
/*[clinic input]
11634
str.lower as unicode_lower
11635
11636
Return a copy of the string converted to lowercase.
11637
[clinic start generated code]*/
11638
11639
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* ASCII-only strings go through ascii_upper_or_lower() (second
       argument 1 selects lowering); everything else goes through
       case_operation() with the do_lower callback. */
    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_lower);
}
11647
11648
/* Selectors for the strip family.  Each value doubles as an index into
   stripfuncnames[] below, so the two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the characters and
   the table slots are const, so the table cannot be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for selector `i` (used in error messages).  The
   argument is parenthesised so that any expression may be passed. */
#define STRIPNAME(i) (stripfuncnames[(i)])
11656
11657
/* externally visible for str.strip(unicode) */
11658
PyObject *
11659
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11660
{
11661
    const void *data;
11662
    int kind;
11663
    Py_ssize_t i, j, len;
11664
    BLOOM_MASK sepmask;
11665
    Py_ssize_t seplen;
11666
11667
    kind = PyUnicode_KIND(self);
11668
    data = PyUnicode_DATA(self);
11669
    len = PyUnicode_GET_LENGTH(self);
11670
    seplen = PyUnicode_GET_LENGTH(sepobj);
11671
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11672
                              PyUnicode_DATA(sepobj),
11673
                              seplen);
11674
11675
    i = 0;
11676
    if (striptype != RIGHTSTRIP) {
  Branch (11676:9): [True: 205k, False: 1.02M]
11677
        while (i < len) {
  Branch (11677:16): [True: 604k, False: 6.01k]
11678
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11679
            if (!BLOOM(sepmask, ch))
  Branch (11679:17): [True: 197k, False: 407k]
11680
                break;
11681
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
  Branch (11681:17): [True: 1.92k, False: 405k]
11682
                break;
11683
            i++;
11684
        }
11685
    }
11686
11687
    j = len;
11688
    if (striptype != LEFTSTRIP) {
  Branch (11688:9): [True: 1.06M, False: 167k]
11689
        j--;
11690
        while (j >= i) {
  Branch (11690:16): [True: 1.40M, False: 13.3k]
11691
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11692
            if (!BLOOM(sepmask, ch))
  Branch (11692:17): [True: 852k, False: 555k]
11693
                break;
11694
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
  Branch (11694:17): [True: 196k, False: 358k]
11695
                break;
11696
            j--;
11697
        }
11698
11699
        j++;
11700
    }
11701
11702
    return PyUnicode_Substring(self, i, j);
11703
}
11704
11705
PyObject*
11706
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11707
{
11708
    const unsigned char *data;
11709
    int kind;
11710
    Py_ssize_t length;
11711
11712
    length = PyUnicode_GET_LENGTH(self);
11713
    end = Py_MIN(end, length);
11714
11715
    if (start == 0 && 
end == length3.79M
)
  Branch (11715:9): [True: 3.79M, False: 4.72M]
  Branch (11715:23): [True: 1.25M, False: 2.53M]
11716
        return unicode_result_unchanged(self);
11717
11718
    if (start < 0 || end < 0) {
  Branch (11718:9): [True: 0, False: 7.26M]
  Branch (11718:22): [True: 0, False: 7.26M]
11719
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11720
        return NULL;
11721
    }
11722
    if (start >= length || 
end < start7.24M
)
  Branch (11722:9): [True: 20.5k, False: 7.24M]
  Branch (11722:28): [True: 0, False: 7.24M]
11723
        _Py_RETURN_UNICODE_EMPTY();
11724
11725
    length = end - start;
11726
    if (PyUnicode_IS_ASCII(self)) {
11727
        data = PyUnicode_1BYTE_DATA(self);
11728
        return _PyUnicode_FromASCII((const char*)(data + start), length);
11729
    }
11730
    else {
11731
        kind = PyUnicode_KIND(self);
11732
        data = PyUnicode_1BYTE_DATA(self);
11733
        return PyUnicode_FromKindAndData(kind,
11734
                                         data + kind * start,
11735
                                         length);
11736
    }
11737
}
11738
11739
/* Strip whitespace from the left and/or right end of `self`, as selected
   by `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).  ASCII strings are
   classified with the _Py_ascii_whitespace[] table, all others with
   Py_UNICODE_ISSPACE().  The result is whatever PyUnicode_Substring()
   returns for the surviving [left, right) range. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);
    Py_ssize_t left = 0;      /* index of the first character to keep */
    Py_ssize_t right = len;   /* one past the last character to keep */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (left < len && _Py_ascii_whitespace[bytes[left]]) {
                left++;
            }
        }
        if (trim_right) {
            /* Never scan below `left`, so an all-whitespace string
               collapses to an empty range. */
            while (right > left && _Py_ascii_whitespace[bytes[right - 1]]) {
                right--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; left < len; left++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; right > left; right--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
11800
11801
11802
/* Shared back end of str.strip(), str.lstrip() and str.rstrip().

   `sep` == None strips whitespace via do_strip(); a str strips the
   characters it contains via _PyUnicode_XStrip(); anything else raises
   TypeError naming the calling method and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }
    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }
    return _PyUnicode_XStrip(self, striptype, sep);
}
11818
11819
11820
/*[clinic input]
11821
str.strip as unicode_strip
11822
11823
    chars: object = None
11824
    /
11825
11826
Return a copy of the string with leading and trailing whitespace removed.
11827
11828
If chars is given and not None, remove characters in chars instead.
11829
[clinic start generated code]*/
11830
11831
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
{
    /* Trim both ends; argument checking lives in do_argstrip(). */
    const int which_ends = BOTHSTRIP;
    return do_argstrip(self, which_ends, chars);
}
11837
11838
11839
/*[clinic input]
11840
str.lstrip as unicode_lstrip
11841
11842
    chars: object = None
11843
    /
11844
11845
Return a copy of the string with leading whitespace removed.
11846
11847
If chars is given and not None, remove characters in chars instead.
11848
[clinic start generated code]*/
11849
11850
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Trim the left end only; argument checking lives in do_argstrip(). */
    const int which_ends = LEFTSTRIP;
    return do_argstrip(self, which_ends, chars);
}
11856
11857
11858
/*[clinic input]
11859
str.rstrip as unicode_rstrip
11860
11861
    chars: object = None
11862
    /
11863
11864
Return a copy of the string with trailing whitespace removed.
11865
11866
If chars is given and not None, remove characters in chars instead.
11867
[clinic start generated code]*/
11868
11869
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Trim the right end only; argument checking lives in do_argstrip(). */
    const int which_ends = RIGHTSTRIP;
    return do_argstrip(self, which_ends, chars);
}
11875
11876
11877
static PyObject*
11878
unicode_repeat(PyObject *str, Py_ssize_t len)
11879
{
11880
    PyObject *u;
11881
    Py_ssize_t nchars, n;
11882
11883
    if (len < 1)
  Branch (11883:9): [True: 51.6k, False: 624k]
11884
        _Py_RETURN_UNICODE_EMPTY();
11885
11886
    /* no repeat, return original string */
11887
    if (len == 1)
  Branch (11887:9): [True: 62.4k, False: 562k]
11888
        return unicode_result_unchanged(str);
11889
11890
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
  Branch (11890:9): [True: 0, False: 562k]
11891
        PyErr_SetString(PyExc_OverflowError,
11892
                        "repeated string is too long");
11893
        return NULL;
11894
    }
11895
    nchars = len * PyUnicode_GET_LENGTH(str);
11896
11897
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11898
    if (!u)
  Branch (11898:9): [True: 8, False: 562k]
11899
        return NULL;
11900
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11901
11902
    if (PyUnicode_GET_LENGTH(str) == 1) {
  Branch (11902:9): [True: 510k, False: 52.4k]
11903
        int kind = PyUnicode_KIND(str);
11904
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11905
        if (kind == PyUnicode_1BYTE_KIND) {
  Branch (11905:13): [True: 509k, False: 320]
11906
            void *to = PyUnicode_DATA(u);
11907
            memset(to, (unsigned char)fill_char, len);
11908
        }
11909
        else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (11909:18): [True: 224, False: 96]
11910
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11911
            for (n = 0; n < len; 
++n50.5k
)
  Branch (11911:25): [True: 50.5k, False: 224]
11912
                ucs2[n] = fill_char;
11913
        } else {
11914
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11915
            assert(kind == PyUnicode_4BYTE_KIND);
11916
            for (n = 0; n < len; 
++n5.47k
)
  Branch (11916:25): [True: 5.47k, False: 96]
11917
                ucs4[n] = fill_char;
11918
        }
11919
    }
11920
    else {
11921
        Py_ssize_t char_size = PyUnicode_KIND(str);
11922
        char *to = (char *) PyUnicode_DATA(u);
11923
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
11924
            PyUnicode_GET_LENGTH(str) * char_size);
11925
    }
11926
11927
    assert(_PyUnicode_CheckConsistency(u, 1));
11928
    return u;
11929
}
11930
11931
PyObject *
11932
PyUnicode_Replace(PyObject *str,
11933
                  PyObject *substr,
11934
                  PyObject *replstr,
11935
                  Py_ssize_t maxcount)
11936
{
11937
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
  Branch (11937:9): [True: 0, False: 329]
  Branch (11937:36): [True: 0, False: 329]
11938
            ensure_unicode(replstr) < 0)
  Branch (11938:13): [True: 0, False: 329]
11939
        return NULL;
11940
    return replace(str, substr, replstr, maxcount);
11941
}
11942
11943
/*[clinic input]
11944
str.replace as unicode_replace
11945
11946
    old: unicode
11947
    new: unicode
11948
    count: Py_ssize_t = -1
11949
        Maximum number of occurrences to replace.
11950
        -1 (the default value) means replace all occurrences.
11951
    /
11952
11953
Return a copy with all occurrences of substring old replaced by new.
11954
11955
If the optional argument count is given, only the first count occurrences are
11956
replaced.
11957
[clinic start generated code]*/
11958
11959
static PyObject *
11960
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
11961
                     Py_ssize_t count)
11962
/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
11963
{
11964
    return replace(self, old, new, count);
11965
}
11966
11967
/*[clinic input]
11968
str.removeprefix as unicode_removeprefix
11969
11970
    prefix: unicode
11971
    /
11972
11973
Return a str with the given prefix string removed if present.
11974
11975
If the string starts with the prefix string, return string[len(prefix):].
11976
Otherwise, return a copy of the original string.
11977
[clinic start generated code]*/
11978
11979
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
{
    /* tailmatch() is called with direction -1 over the whole string; a
       result of -1 is treated as an error and propagated as NULL. */
    int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);

    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        /* No prefix match: hand back self via the unchanged-result path. */
        return unicode_result_unchanged(self);
    }

    /* Prefix matched: keep everything after it. */
    Py_ssize_t first_kept = PyUnicode_GET_LENGTH(prefix);
    Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, first_kept, self_len);
}
11993
11994
/*[clinic input]
11995
str.removesuffix as unicode_removesuffix
11996
11997
    suffix: unicode
11998
    /
11999
12000
Return a str with the given suffix string removed if present.
12001
12002
If the string ends with the suffix string and that suffix is not empty,
12003
return string[:-len(suffix)]. Otherwise, return a copy of the original
12004
string.
12005
[clinic start generated code]*/
12006
12007
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* tailmatch() is called with direction +1 over the whole string; a
       result of -1 is treated as an error and propagated as NULL. */
    int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);

    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        /* No suffix match: hand back self via the unchanged-result path. */
        return unicode_result_unchanged(self);
    }

    /* Suffix matched: keep everything before it. */
    Py_ssize_t kept_len = PyUnicode_GET_LENGTH(self)
                          - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, kept_len);
}
12021
12022
/* Build the repr() of a str object.

   Works in two passes.  The first pass computes the exact output length,
   counts single and double quotes, and tracks the largest character that
   will be copied verbatim (used as the maxchar for PyUnicode_New()).  The
   second pass writes the quoted, escaped text.

   Quote selection: single quotes are used unless the string contains
   single quotes and no double quotes, in which case double quotes are
   used.  When both kinds are present, single quotes are used and every
   embedded single quote is backslash-escaped.

   Returns a new reference, or NULL with an exception set.  OverflowError
   is raised when the escaped length would exceed PY_SSIZE_T_MAX. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    const void *idata;
    void *odata;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            goto too_long;
        }
        osize += incr;
    }

    quote = '\'';
    unchanged = (osize == isize);
    if (squote) {
        unchanged = 0;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them.  Each escaped quote costs one extra
               character; guard the addition like the loop above does. */
            if (osize > PY_SSIZE_T_MAX - squote) {
                goto too_long;
            }
            osize += squote;
        }
        else
            quote = '"';
    }
    /* Room for the opening and closing quotes, overflow-checked too. */
    if (osize > PY_SSIZE_T_MAX - 2) {
        goto too_long;
    }
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        /* Nothing needs escaping: copy the input between the quotes. */
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;

  too_long:
    /* Shared exit for every output-length overflow check above. */
    PyErr_SetString(PyExc_OverflowError,
                    "string is too long to generate repr");
    return NULL;
}
12183
12184
PyDoc_STRVAR(rfind__doc__,
12185
             "S.rfind(sub[, start[, end]]) -> int\n\
12186
\n\
12187
Return the highest index in S where substring sub is found,\n\
12188
such that sub is contained within S[start:end].  Optional\n\
12189
arguments start and end are interpreted as in slice notation.\n\
12190
\n\
12191
Return -1 on failure.");
12192
12193
static PyObject *
12194
unicode_rfind(PyObject *self, PyObject *args)
12195
{
12196
    /* initialize variables to prevent gcc warning */
12197
    PyObject *substring = NULL;
12198
    Py_ssize_t start = 0;
12199
    Py_ssize_t end = 0;
12200
    Py_ssize_t result;
12201
12202
    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
  Branch (12202:9): [True: 5, False: 971k]
12203
        return NULL;
12204
12205
    result = any_find_slice(self, substring, start, end, -1);
12206
12207
    if (result == -2)
  Branch (12207:9): [True: 0, False: 971k]
12208
        return NULL;
12209
12210
    return PyLong_FromSsize_t(result);
12211
}
12212
12213
PyDoc_STRVAR(rindex__doc__,
12214
             "S.rindex(sub[, start[, end]]) -> int\n\
12215
\n\
12216
Return the highest index in S where substring sub is found,\n\
12217
such that sub is contained within S[start:end].  Optional\n\
12218
arguments start and end are interpreted as in slice notation.\n\
12219
\n\
12220
Raises ValueError when the substring is not found.");
12221
12222
static PyObject *
12223
unicode_rindex(PyObject *self, PyObject *args)
12224
{
12225
    /* initialize variables to prevent gcc warning */
12226
    PyObject *substring = NULL;
12227
    Py_ssize_t start = 0;
12228
    Py_ssize_t end = 0;
12229
    Py_ssize_t result;
12230
12231
    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
  Branch (12231:9): [True: 5, False: 193]
12232
        return NULL;
12233
12234
    result = any_find_slice(self, substring, start, end, -1);
12235
12236
    if (result == -2)
  Branch (12236:9): [True: 0, False: 193]
12237
        return NULL;
12238
12239
    if (result < 0) {
  Branch (12239:9): [True: 25, False: 168]
12240
        PyErr_SetString(PyExc_ValueError, "substring not found");
12241
        return NULL;
12242
    }
12243
12244
    return PyLong_FromSsize_t(result);
12245
}
12246
12247
/*[clinic input]
12248
str.rjust as unicode_rjust
12249
12250
    width: Py_ssize_t
12251
    fillchar: Py_UCS4 = ' '
12252
    /
12253
12254
Return a right-justified string of length width.
12255
12256
Padding is done using the specified fill character (default is a space).
12257
[clinic start generated code]*/
12258
12259
static PyObject *
12260
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12261
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12262
{
12263
    if (PyUnicode_GET_LENGTH(self) >= width)
  Branch (12263:9): [True: 7.22k, False: 40.4k]
12264
        return unicode_result_unchanged(self);
12265
12266
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12267
}
12268
12269
PyObject *
12270
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12271
{
12272
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
  Branch (12272:9): [True: 0, False: 189k]
  Branch (12272:35): [True: 189k, False: 0]
  Branch (12272:50): [True: 0, False: 189k]
12273
        return NULL;
12274
12275
    return split(s, sep, maxsplit);
12276
}
12277
12278
/*[clinic input]
12279
str.split as unicode_split
12280
12281
    sep: object = None
12282
        The separator used to split the string.
12283
12284
        When set to None (the default value), will split on any whitespace
12285
        character (including \\n \\r \\t \\f and spaces) and will discard
12286
        empty strings from the result.
12287
    maxsplit: Py_ssize_t = -1
12288
        Maximum number of splits (starting from the left).
12289
        -1 (the default value) means no limit.
12290
12291
Return a list of the substrings in the string, using sep as the separator string.
12292
12293
Note, str.split() is mainly useful for data that has been intentionally
12294
delimited.  With natural text that includes punctuation, consider using
12295
the regular expression module.
12296
12297
[clinic start generated code]*/
12298
12299
static PyObject *
12300
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12301
/*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
12302
{
12303
    if (sep == Py_None)
  Branch (12303:9): [True: 55.2k, False: 1.52M]
12304
        return split(self, NULL, maxsplit);
12305
    if (PyUnicode_Check(sep))
12306
        return split(self, sep, maxsplit);
12307
12308
    PyErr_Format(PyExc_TypeError,
12309
                 "must be str or None, not %.100s",
12310
                 Py_TYPE(sep)->tp_name);
12311
    return NULL;
12312
}
12313
12314
/* Public C API entry point for partition.

   Both arguments must pass ensure_unicode().  When the separator has a
   larger kind than the string, or is longer than it, the function
   returns the tuple (str_obj, "", "") without searching.  Otherwise the
   separator data is converted to the string's kind if the kinds differ
   and the kind-specific *_partition() routine is called.

   Returns the routine's result, or NULL on error. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    PyObject *result;
    int str_kind, sep_kind;
    const void *str_data, *sep_data;
    Py_ssize_t str_len, sep_len;

    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    str_kind = PyUnicode_KIND(str_obj);
    sep_kind = PyUnicode_KIND(sep_obj);
    str_len = PyUnicode_GET_LENGTH(str_obj);
    sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    str_data = PyUnicode_DATA(str_obj);
    sep_data = PyUnicode_DATA(sep_obj);
    if (sep_kind != str_kind) {
        /* Kinds differ: get a copy of the separator data in the string's
           kind.  The copy is released with PyMem_Free() below. */
        sep_data = unicode_askind(sep_kind, sep_data, sep_len, str_kind);
        if (sep_data == NULL) {
            return NULL;
        }
    }

    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_data, str_len,
                                        sep_obj, sep_data, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_data, str_len,
                                       sep_obj, sep_data, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_partition(str_obj, str_data, str_len,
                                   sep_obj, sep_data, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_partition(str_obj, str_data, str_len,
                                   sep_obj, sep_data, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_data == PyUnicode_DATA(sep_obj)));
    if (sep_kind != str_kind) {
        PyMem_Free((void *)sep_data);
    }

    return result;
}
12364
12365
12366
/* Public C API entry point for rpartition.

   Both arguments must pass ensure_unicode().  When the separator has a
   larger kind than the string, or is longer than it, the function
   returns the tuple ("", "", str_obj) without searching.  Otherwise the
   separator data is converted to the string's kind if the kinds differ
   and the kind-specific *_rpartition() routine is called.

   Returns the routine's result, or NULL on error. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    PyObject *result;
    int str_kind, sep_kind;
    const void *str_data, *sep_data;
    Py_ssize_t str_len, sep_len;

    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    str_kind = PyUnicode_KIND(str_obj);
    sep_kind = PyUnicode_KIND(sep_obj);
    str_len = PyUnicode_GET_LENGTH(str_obj);
    sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    str_data = PyUnicode_DATA(str_obj);
    sep_data = PyUnicode_DATA(sep_obj);
    if (sep_kind != str_kind) {
        /* Kinds differ: get a copy of the separator data in the string's
           kind.  The copy is released with PyMem_Free() below. */
        sep_data = unicode_askind(sep_kind, sep_data, sep_len, str_kind);
        if (sep_data == NULL) {
            return NULL;
        }
    }

    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_data, str_len,
                                         sep_obj, sep_data, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_data, str_len,
                                        sep_obj, sep_data, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_rpartition(str_obj, str_data, str_len,
                                    sep_obj, sep_data, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_rpartition(str_obj, str_data, str_len,
                                    sep_obj, sep_data, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_data == PyUnicode_DATA(sep_obj)));
    if (sep_kind != str_kind) {
        PyMem_Free((void *)sep_data);
    }

    return result;
}
12416
12417
/*[clinic input]
12418
str.partition as unicode_partition
12419
12420
    sep: object
12421
    /
12422
12423
Partition the string into three parts using the given separator.
12424
12425
This will search for the separator in the string.  If the separator is found,
12426
returns a 3-tuple containing the part before the separator, the separator
12427
itself, and the part after it.
12428
12429
If the separator is not found, returns a 3-tuple containing the original string
12430
and two empty strings.
12431
[clinic start generated code]*/
12432
12433
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
{
    /* str.partition(): `sep` arrives as a plain object; the public
       PyUnicode_Partition() entry point does the argument checking. */
    PyObject *parts = PyUnicode_Partition(self, sep);

    return parts;
}
12439
12440
/*[clinic input]
12441
str.rpartition as unicode_rpartition = str.partition
12442
12443
Partition the string into three parts using the given separator.
12444
12445
This will search for the separator in the string, starting at the end. If
12446
the separator is found, returns a 3-tuple containing the part before the
12447
separator, the separator itself, and the part after it.
12448
12449
If the separator is not found, returns a 3-tuple containing two empty strings
12450
and the original string.
12451
[clinic start generated code]*/
12452
12453
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
{
    /* str.rpartition(): `sep` arrives as a plain object; the public
       PyUnicode_RPartition() entry point does the argument checking. */
    PyObject *parts = PyUnicode_RPartition(self, sep);

    return parts;
}
12459
12460
PyObject *
12461
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12462
{
12463
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
  Branch (12463:9): [True: 0, False: 0]
  Branch (12463:35): [True: 0, False: 0]
  Branch (12463:50): [True: 0, False: 0]
12464
        return NULL;
12465
12466
    return rsplit(s, sep, maxsplit);
12467
}
12468
12469
/*[clinic input]
12470
str.rsplit as unicode_rsplit = str.split
12471
12472
Return a list of the substrings in the string, using sep as the separator string.
12473
12474
Splitting starts at the end of the string and works to the front.
12475
[clinic start generated code]*/
12476
12477
static PyObject *
12478
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12479
/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
12480
{
12481
    if (sep == Py_None)
  Branch (12481:9): [True: 76, False: 3.20k]
12482
        return rsplit(self, NULL, maxsplit);
12483
    if (PyUnicode_Check(sep))
12484
        return rsplit(self, sep, maxsplit);
12485
12486
    PyErr_Format(PyExc_TypeError,
12487
                 "must be str or None, not %.100s",
12488
                 Py_TYPE(sep)->tp_name);
12489
    return NULL;
12490
}
12491
12492
/*[clinic input]
12493
str.splitlines as unicode_splitlines
12494
12495
    keepends: bool(accept={int}) = False
12496
12497
Return a list of the lines in the string, breaking at line boundaries.
12498
12499
Line breaks are not included in the resulting list unless keepends is given and
12500
true.
12501
[clinic start generated code]*/
12502
12503
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
{
    /* str.splitlines(): forward to the public PyUnicode_Splitlines()
       entry point with the already-converted keepends flag. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);

    return lines;
}
12509
12510
static
12511
PyObject *unicode_str(PyObject *self)
12512
{
12513
    return unicode_result_unchanged(self);
12514
}
12515
12516
/*[clinic input]
12517
str.swapcase as unicode_swapcase
12518
12519
Convert uppercase characters to lowercase and lowercase characters to uppercase.
12520
[clinic start generated code]*/
12521
12522
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
{
    /* str.swapcase(): run the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);

    return swapped;
}
12528
12529
/*[clinic input]
12530
12531
@staticmethod
12532
str.maketrans as unicode_maketrans
12533
12534
  x: object
12535
12536
  y: unicode=NULL
12537
12538
  z: unicode=NULL
12539
12540
  /
12541
12542
Return a translation table usable for str.translate().
12543
12544
If there is only one argument, it must be a dictionary mapping Unicode
12545
ordinals (integers) or characters to Unicode ordinals, strings or None.
12546
Character keys will be then converted to ordinals.
12547
If there are two arguments, they must be strings of equal length, and
12548
in the resulting dictionary, each character in x will be mapped to the
12549
character at the same position in y. If there is a third argument, it
12550
must be a string, whose characters will be mapped to None in the result.
12551
[clinic start generated code]*/
12552
12553
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build the translation dict for str.maketrans().

       Without `y`, `x` must be an exact dict: its entries are copied and
       length-1 string keys are replaced by their ordinal.  With `y`, `x`
       must be a str of the same length as `y`: ordinals of x map to
       ordinals of y, and every character of the optional `z` maps to
       None.  Any failure releases the partially built dict and returns
       NULL with an exception set. */
    PyObject *table = NULL, *key, *value;
    Py_ssize_t pos = 0;
    int status;

    table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        int kind;
        const void *data;

        /* x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }

        /* Copy entries into the new dict, converting string keys to
           integer keys.  `key` and `value` are owned by `x`, so they are
           not released here. */
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                PyObject *ordinal_key;

                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto err;
                }
                kind = PyUnicode_KIND(key);
                data = PyUnicode_DATA(key);
                ordinal_key = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
                if (ordinal_key == NULL) {
                    goto err;
                }
                status = PyDict_SetItem(table, ordinal_key, value);
                Py_DECREF(ordinal_key);
                if (status < 0) {
                    goto err;
                }
            }
            else if (PyLong_Check(key)) {
                /* Integer keys are stored as they are. */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto err;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto err;
            }
        }
    }
    else {
        int x_kind, y_kind, z_kind;
        const void *x_data, *y_data, *z_data;
        Py_ssize_t x_len;

        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* Map each character of x to the character of y at the same
           index. */
        x_kind = PyUnicode_KIND(x);
        y_kind = PyUnicode_KIND(y);
        x_data = PyUnicode_DATA(x);
        y_data = PyUnicode_DATA(y);
        for (pos = 0; pos < x_len; pos++) {
            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, pos));
            if (key == NULL) {
                goto err;
            }
            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, pos));
            if (value == NULL) {
                Py_DECREF(key);
                goto err;
            }
            status = PyDict_SetItem(table, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (status < 0) {
                goto err;
            }
        }

        /* Map each character of z to None (deletion on translate). */
        if (z != NULL) {
            Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);

            z_kind = PyUnicode_KIND(z);
            z_data = PyUnicode_DATA(z);
            for (pos = 0; pos < z_len; pos++) {
                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, pos));
                if (key == NULL) {
                    goto err;
                }
                status = PyDict_SetItem(table, key, Py_None);
                Py_DECREF(key);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
12658
12659
/*[clinic input]
12660
str.translate as unicode_translate
12661
12662
    table: object
12663
        Translation table, which must be a mapping of Unicode ordinals to
12664
        Unicode ordinals, strings, or None.
12665
    /
12666
12667
Replace each character in the string using the given translation table.
12668
12669
The table must implement lookup/indexing via __getitem__, for instance a
12670
dictionary or list.  If this operation raises LookupError, the character is
12671
left untouched.  Characters mapped to None are deleted.
12672
[clinic start generated code]*/
12673
12674
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* str.translate(): hand the string and the mapping to the charmap
       translator, always using the "ignore" error policy.  The result
       (or NULL) of that call is returned unchanged. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");

    return translated;
}
12680
12681
/*[clinic input]
12682
str.upper as unicode_upper
12683
12684
Return a copy of the string converted to uppercase.
12685
[clinic start generated code]*/
12686
12687
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* str.upper(): non-ASCII strings take the generic case-mapping path
       with do_upper; ASCII strings use the dedicated ASCII helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
12695
12696
/*[clinic input]
12697
str.zfill as unicode_zfill
12698
12699
    width: Py_ssize_t
12700
    /
12701
12702
Pad a numeric string with zeros on the left, to fill a field of the given width.
12703
12704
The string is never truncated.
12705
[clinic start generated code]*/
12706
12707
static PyObject *
12708
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
12709
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
12710
{
12711
    Py_ssize_t fill;
12712
    PyObject *u;
12713
    int kind;
12714
    const void *data;
12715
    Py_UCS4 chr;
12716
12717
    if (PyUnicode_GET_LENGTH(self) >= width)
  Branch (12717:9): [True: 21, False: 11]
12718
        return unicode_result_unchanged(self);
12719
12720
    fill = width - PyUnicode_GET_LENGTH(self);
12721
12722
    u = pad(self, fill, 0, '0');
12723
12724
    if (u == NULL)
  Branch (12724:9): [True: 0, False: 11]
12725
        return NULL;
12726
12727
    kind = PyUnicode_KIND(u);
12728
    data = PyUnicode_DATA(u);
12729
    chr = PyUnicode_READ(kind, data, fill);
12730
12731
    if (chr == '+' || 
chr == '-'9
) {
  Branch (12731:9): [True: 2, False: 9]
  Branch (12731:23): [True: 3, False: 6]
12732
        /* move sign to beginning of string */
12733
        PyUnicode_WRITE(kind, data, 0, chr);
12734
        PyUnicode_WRITE(kind, data, fill, '0');
12735
    }
12736
12737
    assert(_PyUnicode_CheckConsistency(u, 1));
12738
    return u;
12739
}
12740
12741
PyDoc_STRVAR(startswith__doc__,
12742
             "S.startswith(prefix[, start[, end]]) -> bool\n\
12743
\n\
12744
Return True if S starts with the specified prefix, False otherwise.\n\
12745
With optional start, test S beginning at that position.\n\
12746
With optional end, stop comparing S at that position.\n\
12747
prefix can also be a tuple of strings to try.");
12748
12749
/* Implementation of str.startswith(prefix[, start[, end]]).

   `args` is parsed into the prefix object and the optional start/end
   bounds.  The prefix may be a str or a tuple of str; for a tuple the
   result is True as soon as one element matches.  Returns a bool object,
   or NULL with an exception set on a parse error, a wrong prefix type,
   or a tailmatch() failure. */
static PyObject *
unicode_startswith(PyObject *self,
                   PyObject *args)
{
    PyObject *subobj;
    PyObject *candidate;
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    Py_ssize_t idx;
    int matched;

    if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end)) {
        return NULL;
    }

    if (PyTuple_Check(subobj)) {
        /* Try each tuple element in order; the first match wins. */
        idx = 0;
        while (idx < PyTuple_GET_SIZE(subobj)) {
            candidate = PyTuple_GET_ITEM(subobj, idx);
            if (!PyUnicode_Check(candidate)) {
                PyErr_Format(PyExc_TypeError,
                             "tuple for startswith must only contain str, "
                             "not %.100s",
                             Py_TYPE(candidate)->tp_name);
                return NULL;
            }
            matched = tailmatch(self, candidate, start, end, -1);
            if (matched == -1) {
                return NULL;
            }
            if (matched) {
                Py_RETURN_TRUE;
            }
            idx++;
        }
        /* nothing matched */
        Py_RETURN_FALSE;
    }

    if (!PyUnicode_Check(subobj)) {
        PyErr_Format(PyExc_TypeError,
                     "startswith first arg must be str or "
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
        return NULL;
    }

    matched = tailmatch(self, subobj, start, end, -1);
    if (matched == -1) {
        return NULL;
    }
    return PyBool_FromLong(matched);
}
12793
12794
12795
PyDoc_STRVAR(endswith__doc__,
12796
             "S.endswith(suffix[, start[, end]]) -> bool\n\
12797
\n\
12798
Return True if S ends with the specified suffix, False otherwise.\n\
12799
With optional start, test S beginning at that position.\n\
12800
With optional end, stop comparing S at that position.\n\
12801
suffix can also be a tuple of strings to try.");
12802
12803
/* Implementation of str.endswith(suffix[, start[, end]]).

   `args` is parsed into the suffix object and the optional start/end
   bounds.  The suffix may be a str or a tuple of str; for a tuple the
   result is True as soon as one element matches.  Returns a bool object,
   or NULL with an exception set on a parse error, a wrong suffix type,
   or a tailmatch() failure. */
static PyObject *
unicode_endswith(PyObject *self,
                 PyObject *args)
{
    PyObject *subobj;
    PyObject *candidate;
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    Py_ssize_t idx;
    int matched;

    if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end)) {
        return NULL;
    }

    if (PyTuple_Check(subobj)) {
        /* Try each tuple element in order; the first match wins. */
        idx = 0;
        while (idx < PyTuple_GET_SIZE(subobj)) {
            candidate = PyTuple_GET_ITEM(subobj, idx);
            if (!PyUnicode_Check(candidate)) {
                PyErr_Format(PyExc_TypeError,
                             "tuple for endswith must only contain str, "
                             "not %.100s",
                             Py_TYPE(candidate)->tp_name);
                return NULL;
            }
            matched = tailmatch(self, candidate, start, end, +1);
            if (matched == -1) {
                return NULL;
            }
            if (matched) {
                Py_RETURN_TRUE;
            }
            idx++;
        }
        Py_RETURN_FALSE;
    }

    if (!PyUnicode_Check(subobj)) {
        PyErr_Format(PyExc_TypeError,
                     "endswith first arg must be str or "
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
        return NULL;
    }

    matched = tailmatch(self, subobj, start, end, +1);
    if (matched == -1) {
        return NULL;
    }
    return PyBool_FromLong(matched);
}
12846
12847
static inline void
12848
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12849
{
12850
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12851
    writer->data = PyUnicode_DATA(writer->buffer);
12852
12853
    if (!writer->readonly) {
  Branch (12853:9): [True: 13.0M, False: 294k]
12854
        writer->kind = PyUnicode_KIND(writer->buffer);
12855
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12856
    }
12857
    else {
12858
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
12859
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
12860
        writer->kind = 0;
12861
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
12862
12863
        /* Copy-on-write mode: set buffer size to 0 so
12864
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
12865
         * next write. */
12866
        writer->size = 0;
12867
    }
12868
}
12869
12870
void
12871
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
12872
{
12873
    memset(writer, 0, sizeof(*writer));
12874
12875
    /* ASCII is the bare minimum */
12876
    writer->min_char = 127;
12877
12878
    /* use a value smaller than PyUnicode_1BYTE_KIND() so
12879
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
12880
    writer->kind = 0;
12881
    assert(writer->kind <= PyUnicode_1BYTE_KIND);
12882
}
12883
12884
// Initialize _PyUnicodeWriter with initial buffer
12885
static inline void
12886
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
12887
{
12888
    memset(writer, 0, sizeof(*writer));
12889
    writer->buffer = buffer;
12890
    _PyUnicodeWriter_Update(writer);
12891
    writer->min_length = writer->size;
12892
}
12893
12894
int
12895
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12896
                                 Py_ssize_t length, Py_UCS4 maxchar)
12897
{
12898
    Py_ssize_t newlen;
12899
    PyObject *newbuffer;
12900
12901
    assert(maxchar <= MAX_UNICODE);
12902
12903
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
12904
    assert((maxchar > writer->maxchar && length >= 0)
12905
           || length > 0);
12906
12907
    if (length > PY_SSIZE_T_MAX - writer->pos) {
  Branch (12907:9): [True: 0, False: 12.8M]
12908
        PyErr_NoMemory();
12909
        return -1;
12910
    }
12911
    newlen = writer->pos + length;
12912
12913
    maxchar = Py_MAX(maxchar, writer->min_char);
12914
12915
    if (writer->buffer == NULL) {
  Branch (12915:9): [True: 12.3M, False: 511k]
12916
        assert(!writer->readonly);
12917
        if (writer->overallocate
  Branch (12917:13): [True: 11.5M, False: 758k]
12918
            && 
newlen <= (11.5M
PY_SSIZE_T_MAX11.5M
- newlen /
OVERALLOCATE_FACTOR11.5M
)) {
  Branch (12918:16): [True: 11.5M, False: 0]
12919
            /* overallocate to limit the number of realloc() */
12920
            newlen += newlen / OVERALLOCATE_FACTOR;
12921
        }
12922
        if (newlen < writer->min_length)
  Branch (12922:13): [True: 11.9M, False: 377k]
12923
            newlen = writer->min_length;
12924
12925
        writer->buffer = PyUnicode_New(newlen, maxchar);
12926
        if (writer->buffer == NULL)
  Branch (12926:13): [True: 0, False: 12.3M]
12927
            return -1;
12928
    }
12929
    else if (newlen > writer->size) {
  Branch (12929:14): [True: 235k, False: 275k]
12930
        if (writer->overallocate
  Branch (12930:13): [True: 151k, False: 83.7k]
12931
            && 
newlen <= (151k
PY_SSIZE_T_MAX151k
- newlen /
OVERALLOCATE_FACTOR151k
)) {
  Branch (12931:16): [True: 151k, False: 0]
12932
            /* overallocate to limit the number of realloc() */
12933
            newlen += newlen / OVERALLOCATE_FACTOR;
12934
        }
12935
        if (newlen < writer->min_length)
  Branch (12935:13): [True: 0, False: 235k]
12936
            newlen = writer->min_length;
12937
12938
        if (maxchar > writer->maxchar || 
writer->readonly235k
) {
  Branch (12938:13): [True: 406, False: 235k]
  Branch (12938:42): [True: 185, False: 234k]
12939
            /* resize + widen */
12940
            maxchar = Py_MAX(maxchar, writer->maxchar);
12941
            newbuffer = PyUnicode_New(newlen, maxchar);
12942
            if (newbuffer == NULL)
  Branch (12942:17): [True: 0, False: 591]
12943
                return -1;
12944
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
12945
                                          writer->buffer, 0, writer->pos);
12946
            Py_DECREF(writer->buffer);
12947
            writer->readonly = 0;
12948
        }
12949
        else {
12950
            newbuffer = resize_compact(writer->buffer, newlen);
12951
            if (newbuffer == NULL)
  Branch (12951:17): [True: 0, False: 234k]
12952
                return -1;
12953
        }
12954
        writer->buffer = newbuffer;
12955
    }
12956
    else if (maxchar > writer->maxchar) {
  Branch (12956:14): [True: 275k, False: 0]
12957
        assert(!writer->readonly);
12958
        newbuffer = PyUnicode_New(writer->size, maxchar);
12959
        if (newbuffer == NULL)
  Branch (12959:13): [True: 0, False: 275k]
12960
            return -1;
12961
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
12962
                                      writer->buffer, 0, writer->pos);
12963
        Py_SETREF(writer->buffer, newbuffer);
12964
    }
12965
    _PyUnicodeWriter_Update(writer);
12966
    return 0;
12967
12968
#undef OVERALLOCATE_FACTOR
12969
}
12970
12971
int
12972
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
12973
                                     int kind)
12974
{
12975
    Py_UCS4 maxchar;
12976
12977
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
12978
    assert(writer->kind < kind);
12979
12980
    switch (kind)
12981
    {
12982
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
  Branch (12982:5): [True: 0, False: 4.51k]
12983
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
  Branch (12983:5): [True: 4.51k, False: 0]
12984
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
  Branch (12984:5): [True: 0, False: 4.51k]
12985
    default:
  Branch (12985:5): [True: 0, False: 4.51k]
12986
        Py_UNREACHABLE();
12987
    }
12988
12989
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
12990
}
12991
12992
static inline int
12993
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
12994
{
12995
    assert(ch <= MAX_UNICODE);
12996
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
  Branch (12996:9): [True: 0, False: 1.95M]
12997
        return -1;
12998
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
12999
    writer->pos++;
13000
    return 0;
13001
}
13002
13003
int
13004
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13005
{
13006
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13007
}
13008
13009
int
13010
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13011
{
13012
    Py_UCS4 maxchar;
13013
    Py_ssize_t len;
13014
13015
    len = PyUnicode_GET_LENGTH(str);
13016
    if (len == 0)
  Branch (13016:9): [True: 2.86M, False: 21.5M]
13017
        return 0;
13018
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13019
    if (maxchar > writer->maxchar || 
len > writer->size - writer->pos20.8M
) {
  Branch (13019:9): [True: 683k, False: 20.8M]
  Branch (13019:38): [True: 132k, False: 20.7M]
13020
        if (writer->buffer == NULL && 
!writer->overallocate669k
) {
  Branch (13020:13): [True: 669k, False: 147k]
  Branch (13020:39): [True: 284k, False: 384k]
13021
            assert(_PyUnicode_CheckConsistency(str, 1));
13022
            writer->readonly = 1;
13023
            Py_INCREF(str);
13024
            writer->buffer = str;
13025
            _PyUnicodeWriter_Update(writer);
13026
            writer->pos += len;
13027
            return 0;
13028
        }
13029
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
  Branch (13029:13): [True: 0, False: 531k]
13030
            return -1;
13031
    }
13032
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13033
                                  str, 0, len);
13034
    writer->pos += len;
13035
    return 0;
13036
}
13037
13038
int
13039
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13040
                                Py_ssize_t start, Py_ssize_t end)
13041
{
13042
    Py_UCS4 maxchar;
13043
    Py_ssize_t len;
13044
13045
    assert(0 <= start);
13046
    assert(end <= PyUnicode_GET_LENGTH(str));
13047
    assert(start <= end);
13048
13049
    if (end == 0)
  Branch (13049:9): [True: 0, False: 16.1M]
13050
        return 0;
13051
13052
    if (start == 0 && 
end == 3.50M
PyUnicode_GET_LENGTH3.50M
(str))
  Branch (13052:9): [True: 3.50M, False: 12.6M]
  Branch (13052:23): [True: 920, False: 3.50M]
13053
        return _PyUnicodeWriter_WriteStr(writer, str);
13054
13055
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
  Branch (13055:9): [True: 3.53M, False: 12.6M]
13056
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13057
    else
13058
        maxchar = writer->maxchar;
13059
    len = end - start;
13060
13061
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
  Branch (13061:9): [True: 0, False: 16.1M]
13062
        return -1;
13063
13064
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13065
                                  str, start, len);
13066
    writer->pos += len;
13067
    return 0;
13068
}
13069
13070
int
13071
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13072
                                  const char *ascii, Py_ssize_t len)
13073
{
13074
    if (len == -1)
  Branch (13074:9): [True: 6.45k, False: 25.0M]
13075
        len = strlen(ascii);
13076
13077
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13078
13079
    if (writer->buffer == NULL && 
!writer->overallocate7.52M
) {
  Branch (13079:9): [True: 7.52M, False: 17.4M]
  Branch (13079:35): [True: 10.0k, False: 7.51M]
13080
        PyObject *str;
13081
13082
        str = _PyUnicode_FromASCII(ascii, len);
13083
        if (str == NULL)
  Branch (13083:13): [True: 0, False: 10.0k]
13084
            return -1;
13085
13086
        writer->readonly = 1;
13087
        writer->buffer = str;
13088
        _PyUnicodeWriter_Update(writer);
13089
        writer->pos += len;
13090
        return 0;
13091
    }
13092
13093
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
  Branch (13093:9): [True: 0, False: 24.9M]
13094
        return -1;
13095
13096
    switch (writer->kind)
13097
    {
13098
    case PyUnicode_1BYTE_KIND:
  Branch (13098:5): [True: 24.9M, False: 14.5k]
13099
    {
13100
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13101
        Py_UCS1 *data = writer->data;
13102
13103
        memcpy(data + writer->pos, str, len);
13104
        break;
13105
    }
13106
    case PyUnicode_2BYTE_KIND:
  Branch (13106:5): [True: 14.4k, False: 24.9M]
13107
    {
13108
        _PyUnicode_CONVERT_BYTES(
13109
            Py_UCS1, Py_UCS2,
13110
            ascii, ascii + len,
13111
            (Py_UCS2 *)writer->data + writer->pos);
13112
        break;
13113
    }
13114
    case PyUnicode_4BYTE_KIND:
  Branch (13114:5): [True: 86, False: 24.9M]
13115
    {
13116
        _PyUnicode_CONVERT_BYTES(
13117
            Py_UCS1, Py_UCS4,
13118
            ascii, ascii + len,
13119
            (Py_UCS4 *)writer->data + writer->pos);
13120
        break;
13121
    }
13122
    default:
  Branch (13122:5): [True: 0, False: 24.9M]
13123
        Py_UNREACHABLE();
13124
    }
13125
13126
    writer->pos += len;
13127
    return 0;
13128
}
13129
13130
int
13131
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13132
                                   const char *str, Py_ssize_t len)
13133
{
13134
    Py_UCS4 maxchar;
13135
13136
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13137
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
  Branch (13137:9): [True: 0, False: 3]
13138
        return -1;
13139
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13140
    writer->pos += len;
13141
    return 0;
13142
}
13143
13144
PyObject *
13145
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13146
{
13147
    PyObject *str;
13148
13149
    if (writer->pos == 0) {
  Branch (13149:9): [True: 57.1k, False: 12.8M]
13150
        Py_CLEAR(writer->buffer);
13151
        _Py_RETURN_UNICODE_EMPTY();
13152
    }
13153
13154
    str = writer->buffer;
13155
    writer->buffer = NULL;
13156
13157
    if (writer->readonly) {
  Branch (13157:9): [True: 294k, False: 12.5M]
13158
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13159
        return str;
13160
    }
13161
13162
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
  Branch (13162:9): [True: 12.0M, False: 442k]
13163
        PyObject *str2;
13164
        str2 = resize_compact(str, writer->pos);
13165
        if (str2 == NULL) {
  Branch (13165:13): [True: 0, False: 12.0M]
13166
            Py_DECREF(str);
13167
            return NULL;
13168
        }
13169
        str = str2;
13170
    }
13171
13172
    assert(_PyUnicode_CheckConsistency(str, 1));
13173
    return unicode_result(str);
13174
}
13175
13176
void
13177
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13178
{
13179
    Py_CLEAR(writer->buffer);
13180
}
13181
13182
#include "stringlib/unicode_format.h"
13183
13184
PyDoc_STRVAR(format__doc__,
13185
             "S.format(*args, **kwargs) -> str\n\
13186
\n\
13187
Return a formatted version of S, using substitutions from args and kwargs.\n\
13188
The substitutions are identified by braces ('{' and '}').");
13189
13190
PyDoc_STRVAR(format_map__doc__,
13191
             "S.format_map(mapping) -> str\n\
13192
\n\
13193
Return a formatted version of S, using substitutions from mapping.\n\
13194
The substitutions are identified by braces ('{' and '}').");
13195
13196
/*[clinic input]
13197
str.__format__ as unicode___format__
13198
13199
    format_spec: unicode
13200
    /
13201
13202
Return a formatted version of the string as described by format_spec.
13203
[clinic start generated code]*/
13204
13205
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* str.__format__(format_spec): format `self` into a fresh writer using
       the whole of `format_spec`, then return the finished string.  On a
       formatting error the writer is released and NULL is returned. */
    _PyUnicodeWriter writer;
    Py_ssize_t spec_end;

    _PyUnicodeWriter_Init(&writer);

    spec_end = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer,
                                        self, format_spec, 0,
                                        spec_end) == -1) {
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }

    return _PyUnicodeWriter_Finish(&writer);
}
13222
13223
/*[clinic input]
13224
str.__sizeof__ as unicode_sizeof
13225
13226
Return the size of the string in memory, in bytes.
13227
[clinic start generated code]*/
13228
13229
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* str.__sizeof__(): add up the object header, the character data
       (length + 1 code units) and, when separately allocated, the cached
       UTF-8 representation. */
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII object: base structure + one byte per character. */
        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII object: base structure + character data. */
        size = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }
    else {
        /* If it is a two-block object, account for base object, and
           for character block if present. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
13257
13258
/* __getnewargs__(): return a 1-tuple holding a copy of the string `v`,
   or NULL if the copy could not be made. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *copy = _PyUnicode_Copy(v);

    if (copy == NULL) {
        return NULL;
    }
    /* The "N" format hands the reference to `copy` over to the tuple. */
    return Py_BuildValue("(N)", copy);
}
13266
13267
static PyMethodDef unicode_methods[] = {
13268
    UNICODE_ENCODE_METHODDEF
13269
    UNICODE_REPLACE_METHODDEF
13270
    UNICODE_SPLIT_METHODDEF
13271
    UNICODE_RSPLIT_METHODDEF
13272
    UNICODE_JOIN_METHODDEF
13273
    UNICODE_CAPITALIZE_METHODDEF
13274
    UNICODE_CASEFOLD_METHODDEF
13275
    UNICODE_TITLE_METHODDEF
13276
    UNICODE_CENTER_METHODDEF
13277
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13278
    UNICODE_EXPANDTABS_METHODDEF
13279
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13280
    UNICODE_PARTITION_METHODDEF
13281
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13282
    UNICODE_LJUST_METHODDEF
13283
    UNICODE_LOWER_METHODDEF
13284
    UNICODE_LSTRIP_METHODDEF
13285
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13286
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13287
    UNICODE_RJUST_METHODDEF
13288
    UNICODE_RSTRIP_METHODDEF
13289
    UNICODE_RPARTITION_METHODDEF
13290
    UNICODE_SPLITLINES_METHODDEF
13291
    UNICODE_STRIP_METHODDEF
13292
    UNICODE_SWAPCASE_METHODDEF
13293
    UNICODE_TRANSLATE_METHODDEF
13294
    UNICODE_UPPER_METHODDEF
13295
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13296
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13297
    UNICODE_REMOVEPREFIX_METHODDEF
13298
    UNICODE_REMOVESUFFIX_METHODDEF
13299
    UNICODE_ISASCII_METHODDEF
13300
    UNICODE_ISLOWER_METHODDEF
13301
    UNICODE_ISUPPER_METHODDEF
13302
    UNICODE_ISTITLE_METHODDEF
13303
    UNICODE_ISSPACE_METHODDEF
13304
    UNICODE_ISDECIMAL_METHODDEF
13305
    UNICODE_ISDIGIT_METHODDEF
13306
    UNICODE_ISNUMERIC_METHODDEF
13307
    UNICODE_ISALPHA_METHODDEF
13308
    UNICODE_ISALNUM_METHODDEF
13309
    UNICODE_ISIDENTIFIER_METHODDEF
13310
    UNICODE_ISPRINTABLE_METHODDEF
13311
    UNICODE_ZFILL_METHODDEF
13312
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13313
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13314
    UNICODE___FORMAT___METHODDEF
13315
    UNICODE_MAKETRANS_METHODDEF
13316
    UNICODE_SIZEOF_METHODDEF
13317
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13318
    {NULL, NULL}
13319
};
13320
13321
/* nb_remainder slot: `v % w` string formatting.  Formats only when the left
   operand is a str; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13328
13329
static PyNumberMethods unicode_as_number = {
13330
    0,              /*nb_add*/
13331
    0,              /*nb_subtract*/
13332
    0,              /*nb_multiply*/
13333
    unicode_mod,            /*nb_remainder*/
13334
};
13335
13336
static PySequenceMethods unicode_as_sequence = {
13337
    (lenfunc) unicode_length,       /* sq_length */
13338
    PyUnicode_Concat,           /* sq_concat */
13339
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13340
    (ssizeargfunc) unicode_getitem,     /* sq_item */
13341
    0,                  /* sq_slice */
13342
    0,                  /* sq_ass_item */
13343
    0,                  /* sq_ass_slice */
13344
    PyUnicode_Contains,         /* sq_contains */
13345
};
13346
13347
/* mp_subscript slot: implement self[item].

   `item` may be an index-like object or a slice.  Negative indices are
   offset by the string length before the lookup.  Slices with step 1 use
   the substring fast paths; other steps copy character by character.

   Returns a new string, or NULL with an exception set (TypeError when
   `item` is neither an index nor a slice). */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    Py_ssize_t start, stop, step, slicelength, n;
    size_t pos;
    PyObject *result;
    const void *src_data;
    void *dest_data;
    int src_kind, dest_kind;
    Py_UCS4 ch, max_char, kind_limit;

    /* --- integer-like index --- */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);

        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    /* --- anything that is not a slice is rejected --- */
    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    /* --- slice --- */
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                        &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* Contiguous slice: the whole string, or a plain substring. */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self,
                                   start, start + slicelength);
    }

    /* General case */
    src_kind = PyUnicode_KIND(self);
    src_data = PyUnicode_DATA(self);

    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Find the largest selected character, stopping early once it
           reaches the limit returned by kind_maxchar_limit(). */
        kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (pos = start, n = 0; n < slicelength; pos += step, n++) {
            ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    dest_kind = PyUnicode_KIND(result);
    dest_data = PyUnicode_DATA(result);

    /* Copy every selected character into the result. */
    for (pos = start, n = 0; n < slicelength; pos += step, n++) {
        ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13416
13417
static PyMappingMethods unicode_as_mapping = {
13418
    (lenfunc)unicode_length,        /* mp_length */
13419
    (binaryfunc)unicode_subscript,  /* mp_subscript */
13420
    (objobjargproc)0,           /* mp_ass_subscript */
13421
};
13422
13423
13424
/* Helpers for PyUnicode_Format() */
13425
13426
struct unicode_formatter_t {
13427
    PyObject *args;
13428
    int args_owned;
13429
    Py_ssize_t arglen, argidx;
13430
    PyObject *dict;
13431
13432
    int fmtkind;
13433
    Py_ssize_t fmtcnt, fmtpos;
13434
    const void *fmtdata;
13435
    PyObject *fmtstr;
13436
13437
    _PyUnicodeWriter writer;
13438
};
13439
13440
struct unicode_format_arg_t {
13441
    Py_UCS4 ch;
13442
    int flags;
13443
    Py_ssize_t width;
13444
    int prec;
13445
    int sign;
13446
};
13447
13448
static PyObject *
13449
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13450
{
13451
    Py_ssize_t argidx = ctx->argidx;
13452
13453
    if (argidx < ctx->arglen) {
  Branch (13453:9): [True: 15.4M, False: 20]
13454
        ctx->argidx++;
13455
        if (ctx->arglen < 0)
  Branch (13455:13): [True: 876k, False: 14.5M]
13456
            return ctx->args;
13457
        else
13458
            return PyTuple_GetItem(ctx->args, argidx);
13459
    }
13460
    PyErr_SetString(PyExc_TypeError,
13461
                    "not enough arguments for format string");
13462
    return NULL;
13463
}
13464
13465
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13466
13467
/* Format a float into the writer if the writer is not NULL, or into *p_output
13468
   otherwise.
13469
13470
   Return 0 on success, raise an exception and return -1 on error. */
13471
static int
13472
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13473
            PyObject **p_output,
13474
            _PyUnicodeWriter *writer)
13475
{
13476
    char *p;
13477
    double x;
13478
    Py_ssize_t len;
13479
    int prec;
13480
    int dtoa_flags = 0;
13481
13482
    x = PyFloat_AsDouble(v);
13483
    if (x == -1.0 && 
PyErr_Occurred()13
)
  Branch (13483:9): [True: 13, False: 41.6k]
  Branch (13483:22): [True: 2, False: 11]
13484
        return -1;
13485
13486
    prec = arg->prec;
13487
    if (prec < 0)
  Branch (13487:9): [True: 8.97k, False: 32.6k]
13488
        prec = 6;
13489
13490
    if (arg->flags & F_ALT)
  Branch (13490:9): [True: 10.9k, False: 30.6k]
13491
        dtoa_flags |= Py_DTSF_ALT;
13492
    if (arg->flags & F_NO_NEG_0)
  Branch (13492:9): [True: 0, False: 41.6k]
13493
        dtoa_flags |= Py_DTSF_NO_NEG_0;
13494
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13495
    if (p == NULL)
  Branch (13495:9): [True: 0, False: 41.6k]
13496
        return -1;
13497
    len = strlen(p);
13498
    if (writer) {
  Branch (13498:9): [True: 4.70k, False: 36.9k]
13499
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
  Branch (13499:13): [True: 0, False: 4.70k]
13500
            PyMem_Free(p);
13501
            return -1;
13502
        }
13503
    }
13504
    else
13505
        *p_output = _PyUnicode_FromASCII(p, len);
13506
    PyMem_Free(p);
13507
    return 0;
13508
}
13509
13510
/* formatlong() emulates the format codes d, u, o, x and X, and
13511
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13512
 * Python's regular ints.
13513
 * Return value:  a new PyUnicodeObject*, or NULL if error.
13514
 *     The output string is of the form
13515
 *         "-"? ("0x" | "0X")? digit+
13516
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13517
 *         set in flags.  The case of hex digits will be correct,
13518
 *     There will be at least prec digits, zero-filled on the left if
13519
 *         necessary to get that many.
13520
 * val          object to be converted
13521
 * flags        bitmask of format flags; only F_ALT is looked at
13522
 * prec         minimum number of digits; 0-fill on left if needed
13523
 * type         a character in [duoxX]; u acts the same as d
13524
 *
13525
 * CAUTION:  o, x and X conversions on regular ints can never
13526
 * produce a '-' sign, but can for Python's unbounded ints.
13527
 */
13528
PyObject *
13529
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
13530
{
13531
    PyObject *result = NULL;
13532
    char *buf;
13533
    Py_ssize_t i;
13534
    int sign;           /* 1 if '-', else 0 */
13535
    int len;            /* number of characters */
13536
    Py_ssize_t llen;
13537
    int numdigits;      /* len == numnondigits + numdigits */
13538
    int numnondigits = 0;
13539
13540
    /* Avoid exceeding SSIZE_T_MAX */
13541
    if (prec > INT_MAX-3) {
  Branch (13541:9): [True: 0, False: 511k]
13542
        PyErr_SetString(PyExc_OverflowError,
13543
                        "precision too large");
13544
        return NULL;
13545
    }
13546
13547
    assert(PyLong_Check(val));
13548
13549
    switch (type) {
13550
    default:
  Branch (13550:5): [True: 0, False: 511k]
13551
        Py_UNREACHABLE();
13552
    case 'd':
  Branch (13552:5): [True: 51.1k, False: 460k]
13553
    case 'i':
  Branch (13553:5): [True: 5.48k, False: 505k]
13554
    case 'u':
  Branch (13554:5): [True: 3.55k, False: 507k]
13555
        /* int and int subclasses should print numerically when a numeric */
13556
        /* format code is used (see issue18780) */
13557
        result = PyNumber_ToBase(val, 10);
13558
        break;
13559
    case 'o':
  Branch (13559:5): [True: 9.33k, False: 501k]
13560
        numnondigits = 2;
13561
        result = PyNumber_ToBase(val, 8);
13562
        break;
13563
    case 'x':
  Branch (13563:5): [True: 439k, False: 71.7k]
13564
    case 'X':
  Branch (13564:5): [True: 2.19k, False: 509k]
13565
        numnondigits = 2;
13566
        result = PyNumber_ToBase(val, 16);
13567
        break;
13568
    }
13569
    if (!result)
  Branch (13569:9): [True: 0, False: 511k]
13570
        return NULL;
13571
13572
    assert(unicode_modifiable(result));
13573
    assert(PyUnicode_IS_ASCII(result));
13574
13575
    /* To modify the string in-place, there can only be one reference. */
13576
    if (Py_REFCNT(result) != 1) {
  Branch (13576:9): [True: 0, False: 511k]
13577
        Py_DECREF(result);
13578
        PyErr_BadInternalCall();
13579
        return NULL;
13580
    }
13581
    buf = PyUnicode_DATA(result);
13582
    llen = PyUnicode_GET_LENGTH(result);
13583
    if (llen > INT_MAX) {
  Branch (13583:9): [True: 0, False: 511k]
13584
        Py_DECREF(result);
13585
        PyErr_SetString(PyExc_ValueError,
13586
                        "string too large in _PyUnicode_FormatLong");
13587
        return NULL;
13588
    }
13589
    len = (int)llen;
13590
    sign = buf[0] == '-';
13591
    numnondigits += sign;
13592
    numdigits = len - numnondigits;
13593
    assert(numdigits > 0);
13594
13595
    /* Get rid of base marker unless F_ALT */
13596
    if (((alt) == 0 &&
  Branch (13596:10): [True: 503k, False: 8.12k]
13597
        
(503k
type == 'o'503k
||
type == 'x'494k
||
type == 'X'56.1k
))) {
  Branch (13597:10): [True: 8.40k, False: 494k]
  Branch (13597:25): [True: 438k, False: 56.1k]
  Branch (13597:40): [True: 1.27k, False: 54.9k]
13598
        assert(buf[sign] == '0');
13599
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13600
               buf[sign+1] == 'o');
13601
        numnondigits -= 2;
13602
        buf += 2;
13603
        len -= 2;
13604
        if (sign)
  Branch (13604:13): [True: 1.37k, False: 446k]
13605
            buf[0] = '-';
13606
        assert(len == numnondigits + numdigits);
13607
        assert(numdigits > 0);
13608
    }
13609
13610
    /* Fill with leading zeroes to meet minimum width. */
13611
    if (prec > numdigits) {
  Branch (13611:9): [True: 6.52k, False: 504k]
13612
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13613
                                numnondigits + prec);
13614
        char *b1;
13615
        if (!r1) {
  Branch (13615:13): [True: 0, False: 6.52k]
13616
            Py_DECREF(result);
13617
            return NULL;
13618
        }
13619
        b1 = PyBytes_AS_STRING(r1);
13620
        for (i = 0; i < numnondigits; 
++i2.10k
)
  Branch (13620:21): [True: 2.10k, False: 6.52k]
13621
            *b1++ = *buf++;
13622
        for (i = 0; i < prec - numdigits; 
i++89.0k
)
  Branch (13622:21): [True: 89.0k, False: 6.52k]
13623
            *b1++ = '0';
13624
        for (i = 0; i < numdigits; 
i++23.7k
)
  Branch (13624:21): [True: 23.7k, False: 6.52k]
13625
            *b1++ = *buf++;
13626
        *b1 = '\0';
13627
        Py_DECREF(result);
13628
        result = r1;
13629
        buf = PyBytes_AS_STRING(result);
13630
        len = numnondigits + prec;
13631
    }
13632
13633
    /* Fix up case for hex conversions. */
13634
    if (type == 'X') {
  Branch (13634:9): [True: 2.19k, False: 509k]
13635
        /* Need to convert all lower case letters to upper case.
13636
           and need to convert 0x to 0X (and -0x to -0X). */
13637
        for (i = 0; i < len; 
i++18.8k
)
  Branch (13637:21): [True: 18.8k, False: 2.19k]
13638
            if (buf[i] >= 'a' && 
buf[i] <= 'x'3.20k
)
  Branch (13638:17): [True: 3.20k, False: 15.6k]
  Branch (13638:34): [True: 3.20k, False: 0]
13639
                buf[i] -= 'a'-'A';
13640
    }
13641
    if (!PyUnicode_Check(result)
  Branch (13641:9): [True: 6.52k, False: 504k]
13642
        || 
buf != 504k
PyUnicode_DATA504k
(result)) {
  Branch (13642:12): [True: 443k, False: 61.0k]
13643
        PyObject *unicode;
13644
        unicode = _PyUnicode_FromASCII(buf, len);
13645
        Py_DECREF(result);
13646
        result = unicode;
13647
    }
13648
    else if (len != PyUnicode_GET_LENGTH(result)) {
  Branch (13648:14): [True: 0, False: 61.0k]
13649
        if (PyUnicode_Resize(&result, len) < 0)
  Branch (13649:13): [True: 0, False: 0]
13650
            Py_CLEAR(result);
13651
    }
13652
    return result;
13653
}
13654
13655
/* Format an integer or a float as an integer.
13656
 * Return 1 if the number has been formatted into the writer,
13657
 *        0 if the number has been formatted into *p_output
13658
 *       -1 and raise an exception on error */
13659
static int
13660
mainformatlong(PyObject *v,
13661
               struct unicode_format_arg_t *arg,
13662
               PyObject **p_output,
13663
               _PyUnicodeWriter *writer)
13664
{
13665
    PyObject *iobj, *res;
13666
    char type = (char)arg->ch;
13667
13668
    if (!PyNumber_Check(v))
  Branch (13668:9): [True: 112, False: 9.28M]
13669
        goto wrongtype;
13670
13671
    /* make sure number is a type of integer for o, x, and X */
13672
    if (!PyLong_Check(v)) {
  Branch (13672:9): [True: 6.51k, False: 9.27M]
13673
        if (type == 'o' || 
type == 'x'6.51k
||
type == 'X'6.51k
) {
  Branch (13673:13): [True: 2, False: 6.51k]
  Branch (13673:28): [True: 5, False: 6.51k]
  Branch (13673:43): [True: 2, False: 6.51k]
13674
            iobj = _PyNumber_Index(v);
13675
        }
13676
        else {
13677
            iobj = PyNumber_Long(v);
13678
        }
13679
        if (iobj == NULL ) {
  Branch (13679:13): [True: 8, False: 6.51k]
13680
            if (PyErr_ExceptionMatches(PyExc_TypeError))
  Branch (13680:17): [True: 8, False: 0]
13681
                goto wrongtype;
13682
            return -1;
13683
        }
13684
        assert(PyLong_Check(iobj));
13685
    }
13686
    else {
13687
        iobj = v;
13688
        Py_INCREF(iobj);
13689
    }
13690
13691
    if (PyLong_CheckExact(v)
13692
        && 
arg->width == -19.27M
&&
arg->prec == -18.84M
  Branch (13692:12): [True: 8.84M, False: 433k]
  Branch (13692:32): [True: 8.77M, False: 68.6k]
13693
        && 
!(arg->flags & (8.77M
F_SIGN8.77M
|
F_BLANK8.77M
))
  Branch (13693:12): [True: 8.77M, False: 1.88k]
13694
        && 
type != 'X'8.77M
)
  Branch (13694:12): [True: 8.77M, False: 87]
13695
    {
13696
        /* Fast path */
13697
        int alternate = arg->flags & F_ALT;
13698
        int base;
13699
13700
        switch(type)
13701
        {
13702
            default:
  Branch (13702:13): [True: 0, False: 8.77M]
13703
                Py_UNREACHABLE();
13704
            case 'd':
  Branch (13704:13): [True: 8.73M, False: 32.4k]
13705
            case 'i':
  Branch (13705:13): [True: 1.45k, False: 8.76M]
13706
            case 'u':
  Branch (13706:13): [True: 32, False: 8.77M]
13707
                base = 10;
13708
                break;
13709
            case 'o':
  Branch (13709:13): [True: 628, False: 8.77M]
13710
                base = 8;
13711
                break;
13712
            case 'x':
  Branch (13712:13): [True: 30.3k, False: 8.74M]
13713
            case 'X':
  Branch (13713:13): [True: 0, False: 8.77M]
13714
                base = 16;
13715
                break;
13716
        }
13717
13718
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
  Branch (13718:13): [True: 0, False: 8.77M]
13719
            Py_DECREF(iobj);
13720
            return -1;
13721
        }
13722
        Py_DECREF(iobj);
13723
        return 1;
13724
    }
13725
13726
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
13727
    Py_DECREF(iobj);
13728
    if (res == NULL)
  Branch (13728:9): [True: 0, False: 511k]
13729
        return -1;
13730
    *p_output = res;
13731
    return 0;
13732
13733
wrongtype:
13734
    switch(type)
13735
    {
13736
        case 'o':
  Branch (13736:9): [True: 1, False: 119]
13737
        case 'x':
  Branch (13737:9): [True: 6, False: 114]
13738
        case 'X':
  Branch (13738:9): [True: 1, False: 119]
13739
            PyErr_Format(PyExc_TypeError,
13740
                    "%%%c format: an integer is required, "
13741
                    "not %.200s",
13742
                    type, Py_TYPE(v)->tp_name);
13743
            break;
13744
        default:
  Branch (13744:9): [True: 112, False: 8]
13745
            PyErr_Format(PyExc_TypeError,
13746
                    "%%%c format: a real number is required, "
13747
                    "not %.200s",
13748
                    type, Py_TYPE(v)->tp_name);
13749
            break;
13750
    }
13751
    return -1;
13752
}
13753
13754
static Py_UCS4
13755
formatchar(PyObject *v)
13756
{
13757
    /* presume that the buffer is at least 3 characters long */
13758
    if (PyUnicode_Check(v)) {
13759
        if (PyUnicode_GET_LENGTH(v) == 1) {
  Branch (13759:13): [True: 1.51k, False: 2]
13760
            return PyUnicode_READ_CHAR(v, 0);
13761
        }
13762
        goto onError;
13763
    }
13764
    else {
13765
        int overflow;
13766
        long x = PyLong_AsLongAndOverflow(v, &overflow);
13767
        if (x == -1 && 
PyErr_Occurred()6
) {
  Branch (13767:13): [True: 6, False: 728]
  Branch (13767:24): [True: 5, False: 1]
13768
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
  Branch (13768:17): [True: 5, False: 0]
13769
                goto onError;
13770
            }
13771
            return (Py_UCS4) -1;
13772
        }
13773
13774
        if (x < 0 || 
x > 726
MAX_UNICODE726
) {
  Branch (13774:13): [True: 3, False: 726]
  Branch (13774:22): [True: 4, False: 722]
13775
            /* this includes an overflow in converting to C long */
13776
            PyErr_SetString(PyExc_OverflowError,
13777
                            "%c arg not in range(0x110000)");
13778
            return (Py_UCS4) -1;
13779
        }
13780
13781
        return (Py_UCS4) x;
13782
    }
13783
13784
  onError:
13785
    PyErr_SetString(PyExc_TypeError,
13786
                    "%c requires int or char");
13787
    return (Py_UCS4) -1;
13788
}
13789
13790
/* Parse options of an argument: flags, width, precision.
13791
   Handle also "%(name)" syntax.
13792
13793
   Return 0 if the argument has been formatted into arg->str.
13794
   Return 1 if the argument has been written into ctx->writer,
13795
   Raise an exception and return -1 on error. */
13796
static int
13797
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13798
                         struct unicode_format_arg_t *arg)
13799
{
13800
#define FORMAT_READ(ctx) \
13801
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13802
13803
    PyObject *v;
13804
13805
    if (arg->ch == '(') {
  Branch (13805:9): [True: 15.2k, False: 15.4M]
13806
        /* Get argument value from a dictionary. Example: "%(name)s". */
13807
        Py_ssize_t keystart;
13808
        Py_ssize_t keylen;
13809
        PyObject *key;
13810
        int pcount = 1;
13811
13812
        if (ctx->dict == NULL) {
  Branch (13812:13): [True: 5, False: 15.2k]
13813
            PyErr_SetString(PyExc_TypeError,
13814
                            "format requires a mapping");
13815
            return -1;
13816
        }
13817
        ++ctx->fmtpos;
13818
        --ctx->fmtcnt;
13819
        keystart = ctx->fmtpos;
13820
        /* Skip over balanced parentheses */
13821
        while (pcount > 0 && 
--ctx->fmtcnt >= 0112k
) {
  Branch (13821:16): [True: 112k, False: 15.2k]
  Branch (13821:30): [True: 112k, False: 3]
13822
            arg->ch = FORMAT_READ(ctx);
13823
            if (arg->ch == ')')
  Branch (13823:17): [True: 15.2k, False: 97.6k]
13824
                --pcount;
13825
            else if (arg->ch == '(')
  Branch (13825:22): [True: 2, False: 97.6k]
13826
                ++pcount;
13827
            ctx->fmtpos++;
13828
        }
13829
        keylen = ctx->fmtpos - keystart - 1;
13830
        if (ctx->fmtcnt < 0 || 
pcount > 015.2k
) {
  Branch (13830:13): [True: 3, False: 15.2k]
  Branch (13830:32): [True: 0, False: 15.2k]
13831
            PyErr_SetString(PyExc_ValueError,
13832
                            "incomplete format key");
13833
            return -1;
13834
        }
13835
        key = PyUnicode_Substring(ctx->fmtstr,
13836
                                  keystart, keystart + keylen);
13837
        if (key == NULL)
  Branch (13837:13): [True: 0, False: 15.2k]
13838
            return -1;
13839
        if (ctx->args_owned) {
  Branch (13839:13): [True: 7.05k, False: 8.19k]
13840
            ctx->args_owned = 0;
13841
            Py_DECREF(ctx->args);
13842
        }
13843
        ctx->args = PyObject_GetItem(ctx->dict, key);
13844
        Py_DECREF(key);
13845
        if (ctx->args == NULL)
  Branch (13845:13): [True: 3, False: 15.2k]
13846
            return -1;
13847
        ctx->args_owned = 1;
13848
        ctx->arglen = -1;
13849
        ctx->argidx = -2;
13850
    }
13851
13852
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13853
    
while (15.4M
--ctx->fmtcnt >= 0) {
  Branch (13853:12): [True: 15.9M, False: 6]
13854
        arg->ch = FORMAT_READ(ctx);
13855
        ctx->fmtpos++;
13856
        switch (arg->ch) {
  Branch (13856:17): [True: 15.4M, False: 509k]
13857
        case '-': arg->flags |= F_LJUST; continue;
  Branch (13857:9): [True: 21.7k, False: 15.9M]
13858
        case '+': arg->flags |= F_SIGN; continue;
  Branch (13858:9): [True: 22.2k, False: 15.9M]
13859
        case ' ': arg->flags |= F_BLANK; continue;
  Branch (13859:9): [True: 20.8k, False: 15.9M]
13860
        case '#': arg->flags |= F_ALT; continue;
  Branch (13860:9): [True: 21.2k, False: 15.9M]
13861
        case '0': arg->flags |= F_ZERO; continue;
  Branch (13861:9): [True: 423k, False: 15.5M]
13862
        }
13863
        break;
13864
    }
13865
13866
    /* Parse width. Example: "%10s" => width=10 */
13867
    if (arg->ch == '*') {
  Branch (13867:9): [True: 25.0k, False: 15.3M]
13868
        v = unicode_format_getnextarg(ctx);
13869
        if (v == NULL)
  Branch (13869:13): [True: 0, False: 25.0k]
13870
            return -1;
13871
        if (!PyLong_Check(v)) {
  Branch (13871:13): [True: 4, False: 25.0k]
13872
            PyErr_SetString(PyExc_TypeError,
13873
                            "* wants int");
13874
            return -1;
13875
        }
13876
        arg->width = PyLong_AsSsize_t(v);
13877
        if (arg->width == -1 && 
PyErr_Occurred()6
)
  Branch (13877:13): [True: 6, False: 25.0k]
  Branch (13877:33): [True: 6, False: 0]
13878
            return -1;
13879
        if (arg->width < 0) {
  Branch (13879:13): [True: 1, False: 25.0k]
13880
            arg->flags |= F_LJUST;
13881
            arg->width = -arg->width;
13882
        }
13883
        if (--ctx->fmtcnt >= 0) {
  Branch (13883:13): [True: 25.0k, False: 0]
13884
            arg->ch = FORMAT_READ(ctx);
13885
            ctx->fmtpos++;
13886
        }
13887
    }
13888
    else if (arg->ch >= '0' && 
arg->ch <= '9'15.3M
) {
  Branch (13888:14): [True: 15.3M, False: 92.2k]
  Branch (13888:32): [True: 433k, False: 14.8M]
13889
        arg->width = arg->ch - '0';
13890
        while (--ctx->fmtcnt >= 0) {
  Branch (13890:16): [True: 451k, False: 2]
13891
            arg->ch = FORMAT_READ(ctx);
13892
            ctx->fmtpos++;
13893
            if (arg->ch < '0' || 
arg->ch > '9'430k
)
  Branch (13893:17): [True: 21.8k, False: 430k]
  Branch (13893:34): [True: 412k, False: 18.0k]
13894
                break;
13895
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13896
               mixing signed and unsigned comparison. Since arg->ch is between
13897
               '0' and '9', casting to int is safe. */
13898
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
  Branch (13898:17): [True: 3, False: 18.0k]
13899
                PyErr_SetString(PyExc_ValueError,
13900
                                "width too big");
13901
                return -1;
13902
            }
13903
            arg->width = arg->width*10 + (arg->ch - '0');
13904
        }
13905
    }
13906
13907
    /* Parse precision. Example: "%.3f" => prec=3 */
13908
    if (arg->ch == '.') {
  Branch (13908:9): [True: 114k, False: 15.3M]
13909
        arg->prec = 0;
13910
        if (--ctx->fmtcnt >= 0) {
  Branch (13910:13): [True: 114k, False: 0]
13911
            arg->ch = FORMAT_READ(ctx);
13912
            ctx->fmtpos++;
13913
        }
13914
        if (arg->ch == '*') {
  Branch (13914:13): [True: 52, False: 114k]
13915
            v = unicode_format_getnextarg(ctx);
13916
            if (v == NULL)
  Branch (13916:17): [True: 0, False: 52]
13917
                return -1;
13918
            if (!PyLong_Check(v)) {
  Branch (13918:17): [True: 2, False: 50]
13919
                PyErr_SetString(PyExc_TypeError,
13920
                                "* wants int");
13921
                return -1;
13922
            }
13923
            arg->prec = _PyLong_AsInt(v);
13924
            if (arg->prec == -1 && 
PyErr_Occurred()7
)
  Branch (13924:17): [True: 7, False: 43]
  Branch (13924:36): [True: 7, False: 0]
13925
                return -1;
13926
            if (arg->prec < 0)
  Branch (13926:17): [True: 0, False: 43]
13927
                arg->prec = 0;
13928
            if (--ctx->fmtcnt >= 0) {
  Branch (13928:17): [True: 43, False: 0]
13929
                arg->ch = FORMAT_READ(ctx);
13930
                ctx->fmtpos++;
13931
            }
13932
        }
13933
        else if (arg->ch >= '0' && arg->ch <= '9') {
  Branch (13933:18): [True: 114k, False: 0]
  Branch (13933:36): [True: 105k, False: 8.94k]
13934
            arg->prec = arg->ch - '0';
13935
            while (--ctx->fmtcnt >= 0) {
  Branch (13935:20): [True: 122k, False: 0]
13936
                arg->ch = FORMAT_READ(ctx);
13937
                ctx->fmtpos++;
13938
                if (arg->ch < '0' || arg->ch > '9')
  Branch (13938:21): [True: 0, False: 122k]
  Branch (13938:38): [True: 105k, False: 16.9k]
13939
                    break;
13940
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
  Branch (13940:21): [True: 4, False: 16.9k]
13941
                    PyErr_SetString(PyExc_ValueError,
13942
                                    "precision too big");
13943
                    return -1;
13944
                }
13945
                arg->prec = arg->prec*10 + (arg->ch - '0');
13946
            }
13947
        }
13948
    }
13949
13950
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13951
    if (ctx->fmtcnt >= 0) {
  Branch (13951:9): [True: 15.4M, False: 8]
13952
        if (arg->ch == 'h' || arg->ch == 'l' || 
arg->ch == 'L'15.4M
) {
  Branch (13952:13): [True: 0, False: 15.4M]
  Branch (13952:31): [True: 2, False: 15.4M]
  Branch (13952:49): [True: 0, False: 15.4M]
13953
            if (--ctx->fmtcnt >= 0) {
  Branch (13953:17): [True: 2, False: 0]
13954
                arg->ch = FORMAT_READ(ctx);
13955
                ctx->fmtpos++;
13956
            }
13957
        }
13958
    }
13959
    if (ctx->fmtcnt < 0) {
  Branch (13959:9): [True: 8, False: 15.4M]
13960
        PyErr_SetString(PyExc_ValueError,
13961
                        "incomplete format");
13962
        return -1;
13963
    }
13964
    return 0;
13965
13966
#undef FORMAT_READ
13967
}
13968
13969
/* Format one argument. Supported conversion specifiers:
13970
13971
   - "s", "r", "a": any type
13972
   - "i", "d", "u": int or float
13973
   - "o", "x", "X": int
13974
   - "e", "E", "f", "F", "g", "G": float
13975
   - "c": int or str (1 character)
13976
13977
   When possible, the output is written directly into the Unicode writer
13978
   (ctx->writer). A string is created when padding is required.
13979
13980
   Return 0 if the argument has been formatted into *p_str,
13981
          1 if the argument has been written into ctx->writer,
13982
         -1 on error. */
13983
static int
13984
unicode_format_arg_format(struct unicode_formatter_t *ctx,
13985
                          struct unicode_format_arg_t *arg,
13986
                          PyObject **p_str)
13987
{
13988
    PyObject *v;
13989
    _PyUnicodeWriter *writer = &ctx->writer;
13990
13991
    if (ctx->fmtcnt == 0)
  Branch (13991:9): [True: 725k, False: 14.6M]
13992
        ctx->writer.overallocate = 0;
13993
13994
    v = unicode_format_getnextarg(ctx);
13995
    if (v == NULL)
  Branch (13995:9): [True: 20, False: 15.4M]
13996
        return -1;
13997
13998
13999
    switch (arg->ch) {
14000
    case 's':
  Branch (14000:5): [True: 6.06M, False: 9.34M]
14001
    case 'r':
  Branch (14001:5): [True: 21.4k, False: 15.3M]
14002
    case 'a':
  Branch (14002:5): [True: 1.45k, False: 15.4M]
14003
        if (PyLong_CheckExact(v) && 
arg->width == -112.7k
&&
arg->prec == -112.7k
) {
  Branch (14003:37): [True: 12.7k, False: 0]
  Branch (14003:57): [True: 12.7k, False: 0]
14004
            /* Fast path */
14005
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
  Branch (14005:17): [True: 0, False: 12.7k]
14006
                return -1;
14007
            return 1;
14008
        }
14009
14010
        if (PyUnicode_CheckExact(v) && 
arg->ch == 's'6.06M
) {
  Branch (14010:40): [True: 6.05M, False: 13.8k]
14011
            *p_str = v;
14012
            Py_INCREF(*p_str);
14013
        }
14014
        else {
14015
            if (arg->ch == 's')
  Branch (14015:17): [True: 7.25k, False: 16.2k]
14016
                *p_str = PyObject_Str(v);
14017
            else if (arg->ch == 'r')
  Branch (14017:22): [True: 14.8k, False: 1.45k]
14018
                *p_str = PyObject_Repr(v);
14019
            else
14020
                *p_str = PyObject_ASCII(v);
14021
        }
14022
        break;
14023
14024
    case 'i':
  Branch (14024:5): [True: 6.94k, False: 15.4M]
14025
    case 'd':
  Branch (14025:5): [True: 8.78M, False: 6.62M]
14026
    case 'u':
  Branch (14026:5): [True: 3.58k, False: 15.4M]
14027
    case 'o':
  Branch (14027:5): [True: 9.89k, False: 15.4M]
14028
    case 'x':
  Branch (14028:5): [True: 469k, False: 14.9M]
14029
    case 'X':
  Branch (14029:5): [True: 2.17k, False: 15.4M]
14030
    {
14031
        int ret = mainformatlong(v, arg, p_str, writer);
14032
        if (ret != 0)
  Branch (14032:13): [True: 8.77M, False: 511k]
14033
            return ret;
14034
        arg->sign = 1;
14035
        break;
14036
    }
14037
14038
    case 'e':
  Branch (14038:5): [True: 4.04k, False: 15.4M]
14039
    case 'E':
  Branch (14039:5): [True: 3.58k, False: 15.4M]
14040
    case 'f':
  Branch (14040:5): [True: 22.9k, False: 15.3M]
14041
    case 'F':
  Branch (14041:5): [True: 3.58k, False: 15.4M]
14042
    case 'g':
  Branch (14042:5): [True: 3.90k, False: 15.4M]
14043
    case 'G':
  Branch (14043:5): [True: 3.58k, False: 15.4M]
14044
        if (arg->width == -1 && 
arg->prec == -126.8k
  Branch (14044:13): [True: 26.8k, False: 14.8k]
  Branch (14044:33): [True: 5.90k, False: 20.9k]
14045
            && 
!(arg->flags & (5.90k
F_SIGN5.90k
|
F_BLANK5.90k
)))
  Branch (14045:16): [True: 4.70k, False: 1.20k]
14046
        {
14047
            /* Fast path */
14048
            if (formatfloat(v, arg, NULL, writer) == -1)
  Branch (14048:17): [True: 2, False: 4.70k]
14049
                return -1;
14050
            return 1;
14051
        }
14052
14053
        arg->sign = 1;
14054
        if (formatfloat(v, arg, p_str, NULL) == -1)
  Branch (14054:13): [True: 0, False: 36.9k]
14055
            return -1;
14056
        break;
14057
14058
    case 'c':
  Branch (14058:5): [True: 2.25k, False: 15.4M]
14059
    {
14060
        Py_UCS4 ch = formatchar(v);
14061
        if (ch == (Py_UCS4) -1)
  Branch (14061:13): [True: 14, False: 2.24k]
14062
            return -1;
14063
        if (arg->width == -1 && arg->prec == -1) {
  Branch (14063:13): [True: 2.24k, False: 0]
  Branch (14063:33): [True: 2.24k, False: 0]
14064
            /* Fast path */
14065
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
  Branch (14065:17): [True: 0, False: 2.24k]
14066
                return -1;
14067
            return 1;
14068
        }
14069
        *p_str = PyUnicode_FromOrdinal(ch);
14070
        break;
14071
    }
14072
14073
    default:
  Branch (14073:5): [True: 5, False: 15.4M]
14074
        PyErr_Format(PyExc_ValueError,
14075
                     "unsupported format character '%c' (0x%x) "
14076
                     "at index %zd",
14077
                     (31<=arg->ch && arg->ch<=126) ? 
(char)arg->ch4
:
'?'1
,
  Branch (14077:23): [True: 5, False: 0]
  Branch (14077:38): [True: 4, False: 1]
14078
                     (int)arg->ch,
14079
                     ctx->fmtpos - 1);
14080
        return -1;
14081
    }
14082
    if (*p_str == NULL)
  Branch (14082:9): [True: 484, False: 6.62M]
14083
        return -1;
14084
    assert (PyUnicode_Check(*p_str));
14085
    return 0;
14086
}
14087
14088
static int
14089
unicode_format_arg_output(struct unicode_formatter_t *ctx,
14090
                          struct unicode_format_arg_t *arg,
14091
                          PyObject *str)
14092
{
14093
    Py_ssize_t len;
14094
    int kind;
14095
    const void *pbuf;
14096
    Py_ssize_t pindex;
14097
    Py_UCS4 signchar;
14098
    Py_ssize_t buflen;
14099
    Py_UCS4 maxchar;
14100
    Py_ssize_t sublen;
14101
    _PyUnicodeWriter *writer = &ctx->writer;
14102
    Py_UCS4 fill;
14103
14104
    fill = ' ';
14105
    if (arg->sign && 
arg->flags & 548k
F_ZERO548k
)
  Branch (14105:9): [True: 548k, False: 6.07M]
  Branch (14105:22): [True: 420k, False: 127k]
14106
        fill = '0';
14107
14108
    len = PyUnicode_GET_LENGTH(str);
14109
    if ((arg->width == -1 || 
arg->width <= len458k
)
  Branch (14109:10): [True: 6.16M, False: 458k]
  Branch (14109:30): [True: 305k, False: 153k]
14110
        && 
(6.47M
arg->prec == -16.47M
||
arg->prec >= len103k
)
  Branch (14110:13): [True: 6.37M, False: 103k]
  Branch (14110:32): [True: 67.9k, False: 35.1k]
14111
        && 
!(arg->flags & (6.43M
F_SIGN6.43M
|
F_BLANK6.43M
)))
  Branch (14111:12): [True: 6.43M, False: 7.51k]
14112
    {
14113
        /* Fast path */
14114
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
  Branch (14114:13): [True: 0, False: 6.43M]
14115
            return -1;
14116
        return 0;
14117
    }
14118
14119
    /* Truncate the string for "s", "r" and "a" formats
14120
       if the precision is set */
14121
    if (arg->ch == 's' || 
arg->ch == 'r'193k
||
arg->ch == 'a'191k
) {
  Branch (14121:9): [True: 3.14k, False: 193k]
  Branch (14121:27): [True: 1.25k, False: 191k]
  Branch (14121:45): [True: 1.25k, False: 190k]
14122
        if (arg->prec >= 0 && 
len > arg->prec3.04k
)
  Branch (14122:13): [True: 3.04k, False: 2.61k]
  Branch (14122:31): [True: 2.11k, False: 931]
14123
            len = arg->prec;
14124
    }
14125
14126
    /* Adjust sign and width */
14127
    kind = PyUnicode_KIND(str);
14128
    pbuf = PyUnicode_DATA(str);
14129
    pindex = 0;
14130
    signchar = '\0';
14131
    if (arg->sign) {
  Branch (14131:9): [True: 190k, False: 5.65k]
14132
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14133
        if (ch == '-' || 
ch == '+'170k
) {
  Branch (14133:13): [True: 20.2k, False: 170k]
  Branch (14133:26): [True: 0, False: 170k]
14134
            signchar = ch;
14135
            len--;
14136
            pindex++;
14137
        }
14138
        else if (arg->flags & F_SIGN)
  Branch (14138:18): [True: 10.1k, False: 160k]
14139
            signchar = '+';
14140
        else if (arg->flags & F_BLANK)
  Branch (14140:18): [True: 4.74k, False: 155k]
14141
            signchar = ' ';
14142
        else
14143
            arg->sign = 0;
14144
    }
14145
    if (arg->width < len)
  Branch (14145:9): [True: 42.1k, False: 154k]
14146
        arg->width = len;
14147
14148
    /* Prepare the writer */
14149
    maxchar = writer->maxchar;
14150
    if (!(arg->flags & F_LJUST)) {
  Branch (14150:9): [True: 175k, False: 20.5k]
14151
        if (arg->sign) {
  Branch (14151:13): [True: 19.0k, False: 156k]
14152
            if ((arg->width-1) > len)
  Branch (14152:17): [True: 5.30k, False: 13.7k]
14153
                maxchar = Py_MAX(maxchar, fill);
14154
        }
14155
        else {
14156
            if (arg->width > len)
  Branch (14156:17): [True: 141k, False: 15.3k]
14157
                maxchar = Py_MAX(maxchar, fill);
14158
        }
14159
    }
14160
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
  Branch (14160:9): [True: 178k, False: 17.3k]
14161
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14162
        maxchar = Py_MAX(maxchar, strmaxchar);
14163
    }
14164
14165
    buflen = arg->width;
14166
    if (arg->sign && 
len == arg->width35.1k
)
  Branch (14166:9): [True: 35.1k, False: 161k]
  Branch (14166:22): [True: 24.7k, False: 10.3k]
14167
        buflen++;
14168
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
  Branch (14168:9): [True: 0, False: 196k]
14169
        return -1;
14170
14171
    /* Write the sign if needed */
14172
    if (arg->sign) {
  Branch (14172:9): [True: 35.1k, False: 161k]
14173
        if (fill != ' ') {
  Branch (14173:13): [True: 17.0k, False: 18.0k]
14174
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14175
            writer->pos += 1;
14176
        }
14177
        if (arg->width > len)
  Branch (14177:13): [True: 10.3k, False: 24.7k]
14178
            arg->width--;
14179
    }
14180
14181
    /* Write the numeric prefix for "x", "X" and "o" formats
14182
       if the alternate form is used.
14183
       For example, write "0x" for the "%#x" format. */
14184
    if ((arg->flags & F_ALT) && 
(20.0k
arg->ch == 'x'20.0k
||
arg->ch == 'X'19.1k
||
arg->ch == 'o'18.3k
)) {
  Branch (14184:9): [True: 20.0k, False: 176k]
  Branch (14184:34): [True: 878, False: 19.1k]
  Branch (14184:52): [True: 868, False: 18.3k]
  Branch (14184:70): [True: 882, False: 17.4k]
14185
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14186
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14187
        if (fill != ' ') {
  Branch (14187:13): [True: 1.30k, False: 1.32k]
14188
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14189
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14190
            writer->pos += 2;
14191
            pindex += 2;
14192
        }
14193
        arg->width -= 2;
14194
        if (arg->width < 0)
  Branch (14194:13): [True: 0, False: 2.62k]
14195
            arg->width = 0;
14196
        len -= 2;
14197
    }
14198
14199
    /* Pad left with the fill character if needed */
14200
    if (arg->width > len && 
!(arg->flags & 154k
F_LJUST154k
)) {
  Branch (14200:9): [True: 154k, False: 42.1k]
  Branch (14200:29): [True: 146k, False: 7.53k]
14201
        sublen = arg->width - len;
14202
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
14203
        writer->pos += sublen;
14204
        arg->width = len;
14205
    }
14206
14207
    /* If padding with spaces: write sign if needed and/or numeric prefix if
14208
       the alternate form is used */
14209
    if (fill == ' ') {
  Branch (14209:9): [True: 54.2k, False: 142k]
14210
        if (arg->sign) {
  Branch (14210:13): [True: 18.0k, False: 36.2k]
14211
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14212
            writer->pos += 1;
14213
        }
14214
        if ((arg->flags & F_ALT) && 
(11.0k
arg->ch == 'x'11.0k
||
arg->ch == 'X'10.6k
||
arg->ch == 'o'10.2k
)) {
  Branch (14214:13): [True: 11.0k, False: 43.1k]
  Branch (14214:38): [True: 444, False: 10.6k]
  Branch (14214:56): [True: 434, False: 10.2k]
  Branch (14214:74): [True: 445, False: 9.76k]
14215
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14216
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14217
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14218
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14219
            writer->pos += 2;
14220
            pindex += 2;
14221
        }
14222
    }
14223
14224
    /* Write characters */
14225
    if (len) {
  Branch (14225:9): [True: 193k, False: 3.03k]
14226
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14227
                                      str, pindex, len);
14228
        writer->pos += len;
14229
    }
14230
14231
    /* Pad right with the fill character if needed */
14232
    if (arg->width > len) {
  Branch (14232:9): [True: 7.53k, False: 188k]
14233
        sublen = arg->width - len;
14234
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
14235
        writer->pos += sublen;
14236
    }
14237
    return 0;
14238
}
14239
14240
/* Helper of PyUnicode_Format(): format one arg.
14241
   Return 0 on success, raise an exception and return -1 on error. */
14242
static int
14243
unicode_format_arg(struct unicode_formatter_t *ctx)
14244
{
14245
    struct unicode_format_arg_t arg;
14246
    PyObject *str;
14247
    int ret;
14248
14249
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14250
    if (arg.ch == '%') {
  Branch (14250:9): [True: 4.38k, False: 15.4M]
14251
        ctx->fmtpos++;
14252
        ctx->fmtcnt--;
14253
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
  Branch (14253:13): [True: 0, False: 4.38k]
14254
            return -1;
14255
        return 0;
14256
    }
14257
    arg.flags = 0;
14258
    arg.width = -1;
14259
    arg.prec = -1;
14260
    arg.sign = 0;
14261
    str = NULL;
14262
14263
    ret = unicode_format_arg_parse(ctx, &arg);
14264
    if (ret == -1)
  Branch (14264:9): [True: 45, False: 15.4M]
14265
        return -1;
14266
14267
    ret = unicode_format_arg_format(ctx, &arg, &str);
14268
    if (ret == -1)
  Branch (14268:9): [True: 645, False: 15.4M]
14269
        return -1;
14270
14271
    if (ret != 1) {
  Branch (14271:9): [True: 6.62M, False: 8.79M]
14272
        ret = unicode_format_arg_output(ctx, &arg, str);
14273
        Py_DECREF(str);
14274
        if (ret == -1)
  Branch (14274:13): [True: 0, False: 6.62M]
14275
            return -1;
14276
    }
14277
14278
    if (ctx->dict && 
(ctx->argidx < ctx->arglen)16.7k
) {
  Branch (14278:9): [True: 16.7k, False: 15.4M]
  Branch (14278:22): [True: 0, False: 16.7k]
14279
        PyErr_SetString(PyExc_TypeError,
14280
                        "not all arguments converted during string formatting");
14281
        return -1;
14282
    }
14283
    return 0;
14284
}
14285
14286
/* Build a new string from the printf-style `format` and `args`.

   `args` is treated as a tuple of positional arguments when it is a tuple;
   it is additionally used as the mapping for "%(key)..." lookups when it
   supports the mapping protocol and is neither a tuple nor a str.

   Return the result of _PyUnicodeWriter_Finish() on success; raise an
   exception and return NULL on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    const int args_is_tuple = PyTuple_Check(args);
    if (args_is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single non-tuple argument */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    ctx.dict = (!args_is_tuple
                && PyMapping_Check(args)
                && !PyUnicode_Check(args)) ? args : NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specifier */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: find the end of the run and copy it in one go */
        Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The scan stepped one position too far at the end of the
               format string; nothing follows this run, so overallocation
               is switched off. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
14368
14369
static PyObject *
14370
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
14371
14372
/*[clinic input]
14373
@classmethod
14374
str.__new__ as unicode_new
14375
14376
    object as x: object = NULL
14377
    encoding: str = NULL
14378
    errors: str = NULL
14379
14380
[clinic start generated code]*/
14381
14382
static PyObject *
14383
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
14384
                 const char *errors)
14385
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
14386
{
14387
    PyObject *unicode;
14388
    if (x == NULL) {
  Branch (14388:9): [True: 14.0k, False: 626k]
14389
        unicode = unicode_new_empty();
14390
    }
14391
    else if (encoding == NULL && 
errors == NULL356k
) {
  Branch (14391:14): [True: 356k, False: 270k]
  Branch (14391:34): [True: 356k, False: 4]
14392
        unicode = PyObject_Str(x);
14393
    }
14394
    else {
14395
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
14396
    }
14397
14398
    if (unicode != NULL && 
type != &PyUnicode_Type640k
) {
  Branch (14398:9): [True: 640k, False: 35]
  Branch (14398:28): [True: 114k, False: 526k]
14399
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
14400
    }
14401
    return unicode;
14402
}
14403
14404
static PyObject *
14405
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
14406
{
14407
    PyObject *self;
14408
    Py_ssize_t length, char_size;
14409
    int share_utf8;
14410
    int kind;
14411
    void *data;
14412
14413
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14414
    assert(_PyUnicode_CHECK(unicode));
14415
14416
    self = type->tp_alloc(type, 0);
14417
    if (self == NULL) {
  Branch (14417:9): [True: 0, False: 114k]
14418
        return NULL;
14419
    }
14420
    kind = PyUnicode_KIND(unicode);
14421
    length = PyUnicode_GET_LENGTH(unicode);
14422
14423
    _PyUnicode_LENGTH(self) = length;
14424
#ifdef Py_DEBUG
14425
    _PyUnicode_HASH(self) = -1;
14426
#else
14427
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14428
#endif
14429
    _PyUnicode_STATE(self).interned = 0;
14430
    _PyUnicode_STATE(self).kind = kind;
14431
    _PyUnicode_STATE(self).compact = 0;
14432
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14433
    _PyUnicode_UTF8_LENGTH(self) = 0;
14434
    _PyUnicode_UTF8(self) = NULL;
14435
    _PyUnicode_DATA_ANY(self) = NULL;
14436
14437
    share_utf8 = 0;
14438
    if (kind == PyUnicode_1BYTE_KIND) {
  Branch (14438:9): [True: 114k, False: 253]
14439
        char_size = 1;
14440
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
  Branch (14440:13): [True: 113k, False: 783]
14441
            share_utf8 = 1;
14442
    }
14443
    else if (kind == PyUnicode_2BYTE_KIND) {
  Branch (14443:14): [True: 232, False: 21]
14444
        char_size = 2;
14445
    }
14446
    else {
14447
        assert(kind == PyUnicode_4BYTE_KIND);
14448
        char_size = 4;
14449
    }
14450
14451
    /* Ensure we won't overflow the length. */
14452
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
  Branch (14452:9): [True: 0, False: 114k]
14453
        PyErr_NoMemory();
14454
        goto onError;
14455
    }
14456
    data = PyObject_Malloc((length + 1) * char_size);
14457
    if (data == NULL) {
  Branch (14457:9): [True: 0, False: 114k]
14458
        PyErr_NoMemory();
14459
        goto onError;
14460
    }
14461
14462
    _PyUnicode_DATA_ANY(self) = data;
14463
    if (share_utf8) {
  Branch (14463:9): [True: 113k, False: 1.03k]
14464
        _PyUnicode_UTF8_LENGTH(self) = length;
14465
        _PyUnicode_UTF8(self) = data;
14466
    }
14467
14468
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
14469
    assert(_PyUnicode_CheckConsistency(self, 1));
14470
#ifdef Py_DEBUG
14471
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14472
#endif
14473
    return self;
14474
14475
onError:
14476
    Py_DECREF(self);
14477
    return NULL;
14478
}
14479
14480
void
14481
_PyUnicode_ExactDealloc(PyObject *op)
14482
{
14483
    assert(PyUnicode_CheckExact(op));
14484
    unicode_dealloc(op);
14485
}
14486
14487
PyDoc_STRVAR(unicode_doc,
14488
"str(object='') -> str\n\
14489
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14490
\n\
14491
Create a new string object from the given object. If encoding or\n\
14492
errors is specified, then the object must expose a data buffer\n\
14493
that will be decoded using the given encoding and error handler.\n\
14494
Otherwise, returns the result of object.__str__() (if defined)\n\
14495
or repr(object).\n\
14496
encoding defaults to sys.getdefaultencoding().\n\
14497
errors defaults to 'strict'.");
14498
14499
/* Forward declaration: referenced by the tp_iter slot of PyUnicode_Type. */
static PyObject *unicode_iter(PyObject *seq);
14500
14501
PyTypeObject PyUnicode_Type = {
14502
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14503
    "str",                        /* tp_name */
14504
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14505
    0,                            /* tp_itemsize */
14506
    /* Slots */
14507
    (destructor)unicode_dealloc,  /* tp_dealloc */
14508
    0,                            /* tp_vectorcall_offset */
14509
    0,                            /* tp_getattr */
14510
    0,                            /* tp_setattr */
14511
    0,                            /* tp_as_async */
14512
    unicode_repr,                 /* tp_repr */
14513
    &unicode_as_number,           /* tp_as_number */
14514
    &unicode_as_sequence,         /* tp_as_sequence */
14515
    &unicode_as_mapping,          /* tp_as_mapping */
14516
    (hashfunc) unicode_hash,      /* tp_hash*/
14517
    0,                            /* tp_call*/
14518
    (reprfunc) unicode_str,       /* tp_str */
14519
    PyObject_GenericGetAttr,      /* tp_getattro */
14520
    0,                            /* tp_setattro */
14521
    0,                            /* tp_as_buffer */
14522
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14523
        Py_TPFLAGS_UNICODE_SUBCLASS |
14524
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14525
    unicode_doc,                  /* tp_doc */
14526
    0,                            /* tp_traverse */
14527
    0,                            /* tp_clear */
14528
    PyUnicode_RichCompare,        /* tp_richcompare */
14529
    0,                            /* tp_weaklistoffset */
14530
    unicode_iter,                 /* tp_iter */
14531
    0,                            /* tp_iternext */
14532
    unicode_methods,              /* tp_methods */
14533
    0,                            /* tp_members */
14534
    0,                            /* tp_getset */
14535
    0,                            /* tp_base */
14536
    0,                            /* tp_dict */
14537
    0,                            /* tp_descr_get */
14538
    0,                            /* tp_descr_set */
14539
    0,                            /* tp_dictoffset */
14540
    0,                            /* tp_init */
14541
    0,                            /* tp_alloc */
14542
    unicode_new,                  /* tp_new */
14543
    PyObject_Del,                 /* tp_free */
14544
};
14545
14546
/* Initialize the Unicode implementation */
14547
14548
void
14549
_PyUnicode_InitState(PyInterpreterState *interp)
14550
{
14551
    if (!_Py_IsMainInterpreter(interp)) {
  Branch (14551:9): [True: 171, False: 107]
14552
        return;
14553
    }
14554
14555
    /* initialize the linebreak bloom filter */
14556
    const Py_UCS2 linebreak[] = {
14557
        0x000A, /* LINE FEED */
14558
        0x000D, /* CARRIAGE RETURN */
14559
        0x001C, /* FILE SEPARATOR */
14560
        0x001D, /* GROUP SEPARATOR */
14561
        0x001E, /* RECORD SEPARATOR */
14562
        0x0085, /* NEXT LINE */
14563
        0x2028, /* LINE SEPARATOR */
14564
        0x2029, /* PARAGRAPH SEPARATOR */
14565
    };
14566
    bloom_linebreak = make_bloom_mask(
14567
        PyUnicode_2BYTE_KIND, linebreak,
14568
        Py_ARRAY_LENGTH(linebreak));
14569
}
14570
14571
14572
PyStatus
14573
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14574
{
14575
    if (!_Py_IsMainInterpreter(interp)) {
  Branch (14575:9): [True: 171, False: 107]
14576
        return _PyStatus_OK();
14577
    }
14578
14579
#ifdef Py_DEBUG
14580
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
14581
14582
    for (int i = 0; i < 256; i++) {
14583
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
14584
    }
14585
#endif
14586
14587
    return _PyStatus_OK();
14588
}
14589
14590
14591
PyStatus
14592
_PyUnicode_InitTypes(PyInterpreterState *interp)
14593
{
14594
    if (!_Py_IsMainInterpreter(interp)) {
  Branch (14594:9): [True: 171, False: 107]
14595
        return _PyStatus_OK();
14596
    }
14597
14598
    if (PyType_Ready(&EncodingMapType) < 0) {
  Branch (14598:9): [True: 0, False: 107]
14599
        goto error;
14600
    }
14601
    if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
  Branch (14601:9): [True: 0, False: 107]
14602
        goto error;
14603
    }
14604
    if (PyType_Ready(&PyFormatterIter_Type) < 0) {
  Branch (14604:9): [True: 0, False: 107]
14605
        goto error;
14606
    }
14607
    return _PyStatus_OK();
14608
14609
error:
14610
    return _PyStatus_ERR("Can't initialize unicode types");
14611
}
14612
14613
14614
void
14615
PyUnicode_InternInPlace(PyObject **p)
14616
{
14617
    PyObject *s = *p;
14618
#ifdef Py_DEBUG
14619
    assert(s != NULL);
14620
    assert(_PyUnicode_CHECK(s));
14621
#else
14622
    if (s == NULL || !PyUnicode_Check(s)) {
  Branch (14622:9): [True: 0, False: 30.1M]
  Branch (14622:22): [True: 0, False: 30.1M]
14623
        return;
14624
    }
14625
#endif
14626
14627
    /* If it's a subclass, we don't really know what putting
14628
       it in the interned dict might do. */
14629
    if (!PyUnicode_CheckExact(s)) {
  Branch (14629:9): [True: 4, False: 30.1M]
14630
        return;
14631
    }
14632
14633
    if (PyUnicode_CHECK_INTERNED(s)) {
14634
        return;
14635
    }
14636
14637
    if (interned == NULL) {
  Branch (14637:9): [True: 107, False: 9.64M]
14638
        interned = PyDict_New();
14639
        if (interned == NULL) {
  Branch (14639:13): [True: 0, False: 107]
14640
            PyErr_Clear(); /* Don't leave an exception */
14641
            return;
14642
        }
14643
    }
14644
14645
    PyObject *t = PyDict_SetDefault(interned, s, s);
14646
    if (t == NULL) {
  Branch (14646:9): [True: 0, False: 9.64M]
14647
        PyErr_Clear();
14648
        return;
14649
    }
14650
14651
    if (t != s) {
  Branch (14651:9): [True: 9.10M, False: 543k]
14652
        Py_INCREF(t);
14653
        Py_SETREF(*p, t);
14654
        return;
14655
    }
14656
14657
    /* The two references in interned dict (key and value) are not counted by
14658
       refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
14659
       this. */
14660
    Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
14661
    _PyUnicode_STATE(s).interned = 1;
14662
}
14663
14664
// Function kept for the stable ABI.
14665
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14666
void
14667
PyUnicode_InternImmortal(PyObject **p)
14668
{
14669
    PyUnicode_InternInPlace(p);
14670
    // Leak a reference on purpose
14671
    Py_INCREF(*p);
14672
}
14673
14674
PyObject *
14675
PyUnicode_InternFromString(const char *cp)
14676
{
14677
    PyObject *s = PyUnicode_FromString(cp);
14678
    if (s == NULL)
  Branch (14678:9): [True: 0, False: 542k]
14679
        return NULL;
14680
    PyUnicode_InternInPlace(&s);
14681
    return s;
14682
}
14683
14684
14685
void
14686
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14687
{
14688
    if (!_Py_IsMainInterpreter(interp)) {
  Branch (14688:9): [True: 169, False: 103]
14689
        // interned dict is shared by all interpreters
14690
        return;
14691
    }
14692
14693
    if (interned == NULL) {
  Branch (14693:9): [True: 0, False: 103]
14694
        return;
14695
    }
14696
    assert(PyDict_CheckExact(interned));
14697
14698
    /* Interned unicode strings are not forcibly deallocated; rather, we give
14699
       them their stolen references back, and then clear and DECREF the
14700
       interned dict. */
14701
14702
#ifdef INTERNED_STATS
14703
    fprintf(stderr, "releasing %zd interned strings\n",
14704
            PyDict_GET_SIZE(interned));
14705
14706
    Py_ssize_t total_length = 0;
14707
#endif
14708
    Py_ssize_t pos = 0;
14709
    PyObject *s, *ignored_value;
14710
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
  Branch (14710:12): [True: 171k, False: 103]
14711
        assert(PyUnicode_CHECK_INTERNED(s));
14712
        // Restore the two references (key and value) ignored
14713
        // by PyUnicode_InternInPlace().
14714
        Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
14715
#ifdef INTERNED_STATS
14716
        total_length += PyUnicode_GET_LENGTH(s);
14717
#endif
14718
14719
        _PyUnicode_STATE(s).interned = 0;
14720
    }
14721
#ifdef INTERNED_STATS
14722
    fprintf(stderr,
14723
            "total length of all interned strings: %zd characters\n",
14724
            total_length);
14725
#endif
14726
14727
    PyDict_Clear(interned);
14728
    Py_CLEAR(interned);
14729
}
14730
14731
14732
/********************* Unicode Iterator **************************/
14733
14734
typedef struct {
14735
    PyObject_HEAD
14736
    Py_ssize_t it_index;
14737
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14738
} unicodeiterobject;
14739
14740
static void
14741
unicodeiter_dealloc(unicodeiterobject *it)
14742
{
14743
    _PyObject_GC_UNTRACK(it);
14744
    Py_XDECREF(it->it_seq);
14745
    PyObject_GC_Del(it);
14746
}
14747
14748
static int
14749
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14750
{
14751
    Py_VISIT(it->it_seq);
14752
    return 0;
14753
}
14754
14755
static PyObject *
14756
unicodeiter_next(unicodeiterobject *it)
14757
{
14758
    PyObject *seq;
14759
14760
    assert(it != NULL);
14761
    seq = it->it_seq;
14762
    if (seq == NULL)
  Branch (14762:9): [True: 3, False: 128k]
14763
        return NULL;
14764
    assert(_PyUnicode_CHECK(seq));
14765
14766
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
  Branch (14766:9): [True: 126k, False: 1.36k]
14767
        int kind = PyUnicode_KIND(seq);
14768
        const void *data = PyUnicode_DATA(seq);
14769
        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14770
        it->it_index++;
14771
        return unicode_char(chr);
14772
    }
14773
14774
    it->it_seq = NULL;
14775
    Py_DECREF(seq);
14776
    return NULL;
14777
}
14778
14779
static PyObject *
14780
unicode_ascii_iter_next(unicodeiterobject *it)
14781
{
14782
    assert(it != NULL);
14783
    PyObject *seq = it->it_seq;
14784
    if (seq == NULL) {
  Branch (14784:9): [True: 35, False: 2.53M]
14785
        return NULL;
14786
    }
14787
    assert(_PyUnicode_CHECK(seq));
14788
    assert(PyUnicode_IS_COMPACT_ASCII(seq));
14789
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
  Branch (14789:9): [True: 1.83M, False: 701k]
14790
        const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
14791
        Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
14792
                                              data, it->it_index);
14793
        it->it_index++;
14794
        PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
14795
        return Py_NewRef(item);
14796
    }
14797
    it->it_seq = NULL;
14798
    Py_DECREF(seq);
14799
    return NULL;
14800
}
14801
14802
static PyObject *
14803
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
14804
{
14805
    Py_ssize_t len = 0;
14806
    if (it->it_seq)
  Branch (14806:9): [True: 11.8k, False: 1]
14807
        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14808
    return PyLong_FromSsize_t(len);
14809
}
14810
14811
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14812
14813
static PyObject *
14814
unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
14815
{
14816
    if (it->it_seq != NULL) {
  Branch (14816:9): [True: 432, False: 6]
14817
        return Py_BuildValue("N(O)n", _PyEval_GetBuiltin(&_Py_ID(iter)),
14818
                             it->it_seq, it->it_index);
14819
    } else {
14820
        PyObject *u = unicode_new_empty();
14821
        if (u == NULL)
  Branch (14821:13): [True: 0, False: 6]
14822
            return NULL;
14823
        return Py_BuildValue("N(N)", _PyEval_GetBuiltin(&_Py_ID(iter)), u);
14824
    }
14825
}
14826
14827
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14828
14829
static PyObject *
14830
unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14831
{
14832
    Py_ssize_t index = PyLong_AsSsize_t(state);
14833
    if (index == -1 && 
PyErr_Occurred()0
)
  Branch (14833:9): [True: 0, False: 564]
  Branch (14833:24): [True: 0, False: 0]
14834
        return NULL;
14835
    if (it->it_seq != NULL) {
  Branch (14835:9): [True: 564, False: 0]
14836
        if (index < 0)
  Branch (14836:13): [True: 0, False: 564]
14837
            index = 0;
14838
        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
  Branch (14838:18): [True: 0, False: 564]
14839
            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14840
        it->it_index = index;
14841
    }
14842
    Py_RETURN_NONE;
14843
}
14844
14845
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14846
14847
static PyMethodDef unicodeiter_methods[] = {
14848
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14849
     length_hint_doc},
14850
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14851
     reduce_doc},
14852
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
14853
     setstate_doc},
14854
    {NULL,      NULL}       /* sentinel */
14855
};
14856
14857
PyTypeObject PyUnicodeIter_Type = {
14858
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14859
    "str_iterator",         /* tp_name */
14860
    sizeof(unicodeiterobject),      /* tp_basicsize */
14861
    0,                  /* tp_itemsize */
14862
    /* methods */
14863
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14864
    0,                  /* tp_vectorcall_offset */
14865
    0,                  /* tp_getattr */
14866
    0,                  /* tp_setattr */
14867
    0,                  /* tp_as_async */
14868
    0,                  /* tp_repr */
14869
    0,                  /* tp_as_number */
14870
    0,                  /* tp_as_sequence */
14871
    0,                  /* tp_as_mapping */
14872
    0,                  /* tp_hash */
14873
    0,                  /* tp_call */
14874
    0,                  /* tp_str */
14875
    PyObject_GenericGetAttr,        /* tp_getattro */
14876
    0,                  /* tp_setattro */
14877
    0,                  /* tp_as_buffer */
14878
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14879
    0,                  /* tp_doc */
14880
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14881
    0,                  /* tp_clear */
14882
    0,                  /* tp_richcompare */
14883
    0,                  /* tp_weaklistoffset */
14884
    PyObject_SelfIter,          /* tp_iter */
14885
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14886
    unicodeiter_methods,            /* tp_methods */
14887
    0,
14888
};
14889
14890
PyTypeObject _PyUnicodeASCIIIter_Type = {
14891
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14892
    .tp_name = "str_ascii_iterator",
14893
    .tp_basicsize = sizeof(unicodeiterobject),
14894
    .tp_dealloc = (destructor)unicodeiter_dealloc,
14895
    .tp_getattro = PyObject_GenericGetAttr,
14896
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14897
    .tp_traverse = (traverseproc)unicodeiter_traverse,
14898
    .tp_iter = PyObject_SelfIter,
14899
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
14900
    .tp_methods = unicodeiter_methods,
14901
};
14902
14903
/* Create an iterator over the str object `seq`.

   Compact ASCII strings get a _PyUnicodeASCIIIter_Type instance; every
   other string gets a PyUnicodeIter_Type instance.  The iterator holds a
   strong reference to `seq` and starts at index 0.

   Returns a new reference, or NULL with an exception set (bad internal
   call if `seq` is not a str, or allocation failure). */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    Py_INCREF(seq);
    iter->it_seq = seq;
    iter->it_index = 0;
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
14926
14927
static int
14928
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14929
{
14930
    int res;
14931
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14932
    if (res == -2) {
  Branch (14932:9): [True: 0, False: 1.11k]
14933
        PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
14934
        return -1;
14935
    }
14936
    if (res < 0) {
  Branch (14936:9): [True: 0, False: 1.11k]
14937
        PyErr_NoMemory();
14938
        return -1;
14939
    }
14940
    return 0;
14941
}
14942
14943
14944
static int
14945
config_get_codec_name(wchar_t **config_encoding)
14946
{
14947
    char *encoding;
14948
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
  Branch (14948:9): [True: 0, False: 556]
14949
        return -1;
14950
    }
14951
14952
    PyObject *name_obj = NULL;
14953
    PyObject *codec = _PyCodec_Lookup(encoding);
14954
    PyMem_RawFree(encoding);
14955
14956
    if (!codec)
  Branch (14956:9): [True: 0, False: 556]
14957
        goto error;
14958
14959
    name_obj = PyObject_GetAttrString(codec, "name");
14960
    Py_CLEAR(codec);
14961
    if (!name_obj) {
  Branch (14961:9): [True: 0, False: 556]
14962
        goto error;
14963
    }
14964
14965
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14966
    Py_DECREF(name_obj);
14967
    if (wname == NULL) {
  Branch (14967:9): [True: 0, False: 556]
14968
        goto error;
14969
    }
14970
14971
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14972
    if (raw_wname == NULL) {
  Branch (14972:9): [True: 0, False: 556]
14973
        PyMem_Free(wname);
14974
        PyErr_NoMemory();
14975
        goto error;
14976
    }
14977
14978
    PyMem_RawFree(*config_encoding);
14979
    *config_encoding = raw_wname;
14980
14981
    PyMem_Free(wname);
14982
    return 0;
14983
14984
error:
14985
    Py_XDECREF(codec);
14986
    Py_XDECREF(name_obj);
14987
    return -1;
14988
}
14989
14990
14991
static PyStatus
14992
init_stdio_encoding(PyInterpreterState *interp)
14993
{
14994
    /* Update the stdio encoding to the normalized Python codec name. */
14995
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14996
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
  Branch (14996:9): [True: 0, False: 278]
14997
        return _PyStatus_ERR("failed to get the Python codec name "
14998
                             "of the stdio encoding");
14999
    }
15000
    return _PyStatus_OK();
15001
}
15002
15003
15004
static int
15005
init_fs_codec(PyInterpreterState *interp)
15006
{
15007
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15008
15009
    _Py_error_handler error_handler;
15010
    error_handler = get_error_handler_wide(config->filesystem_errors);
15011
    if (error_handler == _Py_ERROR_UNKNOWN) {
  Branch (15011:9): [True: 0, False: 278]
15012
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15013
        return -1;
15014
    }
15015
15016
    char *encoding, *errors;
15017
    if (encode_wstr_utf8(config->filesystem_encoding,
  Branch (15017:9): [True: 0, False: 278]
15018
                         &encoding,
15019
                         "filesystem_encoding") < 0) {
15020
        return -1;
15021
    }
15022
15023
    if (encode_wstr_utf8(config->filesystem_errors,
  Branch (15023:9): [True: 0, False: 278]
15024
                         &errors,
15025
                         "filesystem_errors") < 0) {
15026
        PyMem_RawFree(encoding);
15027
        return -1;
15028
    }
15029
15030
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15031
    PyMem_RawFree(fs_codec->encoding);
15032
    fs_codec->encoding = encoding;
15033
    /* encoding has been normalized by init_fs_encoding() */
15034
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15035
    PyMem_RawFree(fs_codec->errors);
15036
    fs_codec->errors = errors;
15037
    fs_codec->error_handler = error_handler;
15038
15039
#ifdef _Py_FORCE_UTF8_FS_ENCODING
15040
    assert(fs_codec->utf8 == 1);
15041
#endif
15042
15043
    /* At this point, PyUnicode_EncodeFSDefault() and
15044
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15045
       the C implementation of the filesystem encoding. */
15046
15047
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15048
       global configuration variables. */
15049
    if (_Py_SetFileSystemEncoding(fs_codec->encoding,
  Branch (15049:9): [True: 0, False: 278]
15050
                                  fs_codec->errors) < 0) {
15051
        PyErr_NoMemory();
15052
        return -1;
15053
    }
15054
    return 0;
15055
}
15056
15057
15058
static PyStatus
15059
init_fs_encoding(PyThreadState *tstate)
15060
{
15061
    PyInterpreterState *interp = tstate->interp;
15062
15063
    /* Update the filesystem encoding to the normalized Python codec name.
15064
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15065
       (Python codec name). */
15066
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15067
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
  Branch (15067:9): [True: 0, False: 278]
15068
        _Py_DumpPathConfig(tstate);
15069
        return _PyStatus_ERR("failed to get the Python codec "
15070
                             "of the filesystem encoding");
15071
    }
15072
15073
    if (init_fs_codec(interp) < 0) {
  Branch (15073:9): [True: 0, False: 278]
15074
        return _PyStatus_ERR("cannot initialize filesystem codec");
15075
    }
15076
    return _PyStatus_OK();
15077
}
15078
15079
15080
PyStatus
15081
_PyUnicode_InitEncodings(PyThreadState *tstate)
15082
{
15083
    PyStatus status = init_fs_encoding(tstate);
15084
    if (_PyStatus_EXCEPTION(status)) {
15085
        return status;
15086
    }
15087
15088
    return init_stdio_encoding(tstate->interp);
15089
}
15090
15091
15092
static void
15093
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
15094
{
15095
    PyMem_RawFree(fs_codec->encoding);
15096
    fs_codec->encoding = NULL;
15097
    fs_codec->utf8 = 0;
15098
    PyMem_RawFree(fs_codec->errors);
15099
    fs_codec->errors = NULL;
15100
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
15101
}
15102
15103
15104
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Both replacement strings are allocated before the configuration is
   touched, so an allocation failure leaves the configuration unchanged.
   Returns the result of init_fs_codec(), or -1 with MemoryError set. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Drop the old values and install the new ones. */
    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15129
15130
15131
#ifdef Py_DEBUG
/* Debug-only helper: returns 1 when the `interned` global is NULL and 0
   otherwise. */
static inline int
unicode_is_finalizing(void)
{
    return !interned;
}
#endif
15138
15139
15140
void
15141
_PyUnicode_FiniTypes(PyInterpreterState *interp)
15142
{
15143
    if (!_Py_IsMainInterpreter(interp)) {
  Branch (15143:9): [True: 169, False: 103]
15144
        return;
15145
    }
15146
15147
    _PyStaticType_Dealloc(&EncodingMapType);
15148
    _PyStaticType_Dealloc(&PyFieldNameIter_Type);
15149
    _PyStaticType_Dealloc(&PyFormatterIter_Type);
15150
}
15151
15152
15153
/* Release the cached UTF-8 buffer of a statically allocated compact str
   object.  Objects flagged as ASCII are left untouched.  For the others,
   a non-NULL utf8 buffer is freed and the utf8 / utf8_length fields are
   reset. */
static void unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    if (compact->utf8 == NULL) {
        return;
    }

    PyObject_Free(compact->utf8);
    compact->utf8 = NULL;
    compact->utf8_length = 0;
}
15168
15169
15170
void
15171
_PyUnicode_Fini(PyInterpreterState *interp)
15172
{
15173
    struct _Py_unicode_state *state = &interp->unicode;
15174
15175
    if (_Py_IsMainInterpreter(interp)) {
  Branch (15175:9): [True: 103, False: 169]
15176
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
15177
        assert(interned == NULL);
15178
        // bpo-47182: force a unicodedata CAPI capsule re-import on
15179
        // subsequent initialization of main interpreter.
15180
        ucnhash_capi = NULL;
15181
    }
15182
15183
    _PyUnicode_FiniEncodings(&state->fs_codec);
15184
15185
    unicode_clear_identifiers(state);
15186
15187
    // Clear the single character singletons
15188
    for (int i = 0; i < 128; 
i++34.8k
) {
  Branch (15188:21): [True: 34.8k, False: 272]
15189
        unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
15190
    }
15191
    for (int i = 0; i < 128; 
i++34.8k
) {
  Branch (15191:21): [True: 34.8k, False: 272]
15192
        unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
15193
    }
15194
}
15195
15196
15197
void
15198
_PyStaticUnicode_Dealloc(PyObject *op)
15199
{
15200
    unicode_static_dealloc(op);
15201
}
15202
15203
15204
/* A _string module, to export formatter_parser and formatter_field_name_split
15205
   to the string.Formatter class implemented in Python. */
15206
15207
static PyMethodDef _string_methods[] = {
15208
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15209
     METH_O, PyDoc_STR("split the argument as a field name")},
15210
    {"formatter_parser", (PyCFunction) formatter_parser,
15211
     METH_O, PyDoc_STR("parse the argument as a format string")},
15212
    {NULL, NULL}
15213
};
15214
15215
static struct PyModuleDef _string_module = {
15216
    PyModuleDef_HEAD_INIT,
15217
    .m_name = "_string",
15218
    .m_doc = PyDoc_STR("string helper module"),
15219
    .m_size = 0,
15220
    .m_methods = _string_methods,
15221
};
15222
15223
PyMODINIT_FUNC
15224
PyInit__string(void)
15225
{
15226
    return PyModuleDef_Init(&_string_module);
15227
}
15228
15229
15230
#ifdef __cplusplus
15231
}
15232
#endif