Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
15
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
16
#include <ctype.h>
17
18
const char *Py_hexdigits = "0123456789abcdef";
19
20
/* --- Codec Registry ----------------------------------------------------- */
21
22
/* Import the standard encodings package which will register the first
23
   codec search function.
24
25
   This is done in a lazy way so that the Unicode implementation does
26
   not downgrade startup time of scripts not needing it.
27
28
   ImportErrors are silently ignored by this function. Only one try is
29
   made.
30
31
*/
32
33
static int _PyCodecRegistry_Init(void); /* Forward */
34
35
/* Append search_function to the current interpreter's codec search path.

   If the search path has not been created yet, _PyCodecRegistry_Init() is
   called first.  Returns the result of PyList_Append() when the argument is
   accepted; returns -1 if initialisation reports failure, if
   search_function is NULL (PyErr_BadArgument) or if it is not callable
   (TypeError). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    if (PyCallable_Check(search_function)) {
        return PyList_Append(interp->codec_search_path, search_function);
    }

    PyErr_SetString(PyExc_TypeError, "argument must be callable");
    return -1;
}
53
54
int
55
PyCodec_Unregister(PyObject *search_function)
56
{
57
    PyInterpreterState *interp = PyInterpreterState_Get();
58
    PyObject *codec_search_path = interp->codec_search_path;
59
    /* Do nothing if codec_search_path is not created yet or was cleared. */
60
    if (codec_search_path == NULL) {
  Branch (60:9): [True: 0, False: 279]
61
        return 0;
62
    }
63
64
    assert(PyList_CheckExact(codec_search_path));
65
    Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
66
    for (Py_ssize_t i = 0; i < n; 
i++279
) {
  Branch (66:28): [True: 558, False: 0]
67
        PyObject *item = PyList_GET_ITEM(codec_search_path, i);
68
        if (item == search_function) {
  Branch (68:13): [True: 279, False: 279]
69
            if (interp->codec_search_cache != NULL) {
  Branch (69:17): [True: 279, False: 0]
70
                assert(PyDict_CheckExact(interp->codec_search_cache));
71
                PyDict_Clear(interp->codec_search_cache);
72
            }
73
            return PyList_SetSlice(codec_search_path, i, i+1, NULL);
74
        }
75
    }
76
    return 0;
77
}
78
79
extern int _Py_normalize_encoding(const char *, char *, size_t);
80
81
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
82
   converted to lower case, spaces and hyphens are replaced with underscores. */
83
84
static
85
PyObject *normalizestring(const char *string)
86
{
87
    size_t len = strlen(string);
88
    char *encoding;
89
    PyObject *v;
90
91
    if (len > PY_SSIZE_T_MAX) {
  Branch (91:9): [True: 0, False: 1.17M]
92
        PyErr_SetString(PyExc_OverflowError, "string is too large");
93
        return NULL;
94
    }
95
96
    encoding = PyMem_Malloc(len + 1);
97
    if (encoding == NULL)
  Branch (97:9): [True: 0, False: 1.17M]
98
        return PyErr_NoMemory();
99
100
    if (!_Py_normalize_encoding(string, encoding, len + 1))
  Branch (100:9): [True: 0, False: 1.17M]
101
    {
102
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
103
        PyMem_Free(encoding);
104
        return NULL;
105
    }
106
107
    v = PyUnicode_FromString(encoding);
108
    PyMem_Free(encoding);
109
    return v;
110
}
111
112
/* Lookup the given encoding and return a tuple providing the codec
113
   facilities.
114
115
   The encoding string is looked up converted to all lower-case
116
   characters. This makes encodings looked up through this mechanism
117
   effectively case-insensitive.
118
119
   If no codec is found, a LookupError is set and NULL returned.
120
121
   As side effect, this tries to load the encodings package, if not
122
   yet done. This is part of the lazy load strategy for the encodings
123
   package.
124
125
*/
126
127
PyObject *_PyCodec_Lookup(const char *encoding)
128
{
129
    if (encoding == NULL) {
  Branch (129:9): [True: 0, False: 1.17M]
130
        PyErr_BadArgument();
131
        return NULL;
132
    }
133
134
    PyInterpreterState *interp = _PyInterpreterState_GET();
135
    if (interp->codec_search_path == NULL && 
_PyCodecRegistry_Init()278
) {
  Branch (135:9): [True: 278, False: 1.17M]
  Branch (135:46): [True: 0, False: 278]
136
        return NULL;
137
    }
138
139
    /* Convert the encoding to a normalized Python string: all
140
       characters are converted to lower case, spaces and hyphens are
141
       replaced with underscores. */
142
    PyObject *v = normalizestring(encoding);
143
    if (v == NULL) {
  Branch (143:9): [True: 0, False: 1.17M]
144
        return NULL;
145
    }
146
    PyUnicode_InternInPlace(&v);
147
148
    /* First, try to lookup the name in the registry dictionary */
149
    PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
150
    if (result != NULL) {
  Branch (150:9): [True: 1.17M, False: 972]
151
        Py_INCREF(result);
152
        Py_DECREF(v);
153
        return result;
154
    }
155
    else if (PyErr_Occurred()) {
  Branch (155:14): [True: 0, False: 972]
156
        goto onError;
157
    }
158
159
    /* Next, scan the search functions in order of registration */
160
    const Py_ssize_t len = PyList_Size(interp->codec_search_path);
161
    if (len < 0)
  Branch (161:9): [True: 0, False: 972]
162
        goto onError;
163
    if (len == 0) {
  Branch (163:9): [True: 0, False: 972]
164
        PyErr_SetString(PyExc_LookupError,
165
                        "no codec search functions registered: "
166
                        "can't find encoding");
167
        goto onError;
168
    }
169
170
    Py_ssize_t i;
171
    for (i = 0; i < len; 
i++76
) {
  Branch (171:17): [True: 1.00k, False: 47]
172
        PyObject *func;
173
174
        func = PyList_GetItem(interp->codec_search_path, i);
175
        if (func == NULL)
  Branch (175:13): [True: 0, False: 1.00k]
176
            goto onError;
177
        result = PyObject_CallOneArg(func, v);
178
        if (result == NULL)
  Branch (178:13): [True: 0, False: 1.00k]
179
            goto onError;
180
        if (result == Py_None) {
  Branch (180:13): [True: 76, False: 925]
181
            Py_DECREF(result);
182
            continue;
183
        }
184
        if (!PyTuple_Check(result) || 
PyTuple_GET_SIZE924
(result) != 4924
) {
  Branch (184:13): [True: 1, False: 924]
  Branch (184:39): [True: 0, False: 924]
185
            PyErr_SetString(PyExc_TypeError,
186
                            "codec search functions must return 4-tuples");
187
            Py_DECREF(result);
188
            goto onError;
189
        }
190
        break;
191
    }
192
    if (i == len) {
  Branch (192:9): [True: 47, False: 924]
193
        /* XXX Perhaps we should cache misses too ? */
194
        PyErr_Format(PyExc_LookupError,
195
                     "unknown encoding: %s", encoding);
196
        goto onError;
197
    }
198
199
    /* Cache and return the result */
200
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
  Branch (200:9): [True: 0, False: 924]
201
        Py_DECREF(result);
202
        goto onError;
203
    }
204
    Py_DECREF(v);
205
    return result;
206
207
 onError:
208
    Py_DECREF(v);
209
    return NULL;
210
}
211
212
/* Codec registry encoding check API. */
213
214
int PyCodec_KnownEncoding(const char *encoding)
215
{
216
    PyObject *codecs;
217
218
    codecs = _PyCodec_Lookup(encoding);
219
    if (!codecs) {
  Branch (219:9): [True: 0, False: 0]
220
        PyErr_Clear();
221
        return 0;
222
    }
223
    else {
224
        Py_DECREF(codecs);
225
        return 1;
226
    }
227
}
228
229
static
230
PyObject *args_tuple(PyObject *object,
231
                     const char *errors)
232
{
233
    PyObject *args;
234
235
    args = PyTuple_New(1 + (errors != NULL));
236
    if (args == NULL)
  Branch (236:9): [True: 0, False: 1.12M]
237
        return NULL;
238
    Py_INCREF(object);
239
    PyTuple_SET_ITEM(args,0,object);
240
    if (errors) {
  Branch (240:9): [True: 1.04M, False: 75.8k]
241
        PyObject *v;
242
243
        v = PyUnicode_FromString(errors);
244
        if (v == NULL) {
  Branch (244:13): [True: 0, False: 1.04M]
245
            Py_DECREF(args);
246
            return NULL;
247
        }
248
        PyTuple_SET_ITEM(args, 1, v);
249
    }
250
    return args;
251
}
252
253
/* Helper function to get a codec item */
254
255
static
256
PyObject *codec_getitem(const char *encoding, int index)
257
{
258
    PyObject *codecs;
259
    PyObject *v;
260
261
    codecs = _PyCodec_Lookup(encoding);
262
    if (codecs == NULL)
  Branch (262:9): [True: 3, False: 10.0k]
263
        return NULL;
264
    v = PyTuple_GET_ITEM(codecs, index);
265
    Py_DECREF(codecs);
266
    Py_INCREF(v);
267
    return v;
268
}
269
270
/* Helper functions to create an incremental codec. */
271
static
272
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
273
                                     const char *errors,
274
                                     const char *attrname)
275
{
276
    PyObject *ret, *inccodec;
277
278
    inccodec = PyObject_GetAttrString(codec_info, attrname);
279
    if (inccodec == NULL)
  Branch (279:9): [True: 0, False: 27.5k]
280
        return NULL;
281
    if (errors)
  Branch (281:9): [True: 27.3k, False: 200]
282
        ret = PyObject_CallFunction(inccodec, "s", errors);
283
    else
284
        ret = _PyObject_CallNoArgs(inccodec);
285
    Py_DECREF(inccodec);
286
    return ret;
287
}
288
289
static
290
PyObject *codec_getincrementalcodec(const char *encoding,
291
                                    const char *errors,
292
                                    const char *attrname)
293
{
294
    PyObject *codec_info, *ret;
295
296
    codec_info = _PyCodec_Lookup(encoding);
297
    if (codec_info == NULL)
  Branch (297:9): [True: 0, False: 398]
298
        return NULL;
299
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
300
    Py_DECREF(codec_info);
301
    return ret;
302
}
303
304
/* Helper function to create a stream codec. */
305
306
static
307
PyObject *codec_getstreamcodec(const char *encoding,
308
                               PyObject *stream,
309
                               const char *errors,
310
                               const int index)
311
{
312
    PyObject *codecs, *streamcodec, *codeccls;
313
314
    codecs = _PyCodec_Lookup(encoding);
315
    if (codecs == NULL)
  Branch (315:9): [True: 0, False: 0]
316
        return NULL;
317
318
    codeccls = PyTuple_GET_ITEM(codecs, index);
319
    if (errors != NULL)
  Branch (319:9): [True: 0, False: 0]
320
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
321
    else
322
        streamcodec = PyObject_CallOneArg(codeccls, stream);
323
    Py_DECREF(codecs);
324
    return streamcodec;
325
}
326
327
/* Helpers to work with the result of _PyCodec_Lookup
328
329
 */
330
/* Create an incremental decoder from a codec_info previously returned by
   _PyCodec_Lookup(), using its "incrementaldecoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char attrname[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
336
337
/* Create an incremental encoder from a codec_info previously returned by
   _PyCodec_Lookup(), using its "incrementalencoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char attrname[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
343
344
345
/* Convenience APIs to query the Codec registry.
346
347
   All APIs return a codec object with incremented refcount.
348
349
 */
350
351
/* Return a new reference to element 0 (the encoder) of the codec tuple
   registered for `encoding`, or NULL if the lookup fails. */
PyObject *PyCodec_Encoder(const char *encoding)
{
    enum { ENCODER_INDEX = 0 };

    return codec_getitem(encoding, ENCODER_INDEX);
}
355
356
/* Return a new reference to element 1 (the decoder) of the codec tuple
   registered for `encoding`, or NULL if the lookup fails. */
PyObject *PyCodec_Decoder(const char *encoding)
{
    enum { DECODER_INDEX = 1 };

    return codec_getitem(encoding, DECODER_INDEX);
}
360
361
/* Look up `encoding` and instantiate its "incrementalencoder" factory,
   passing `errors` when it is non-NULL. */
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
                                     const char *errors)
{
    static const char attrname[] = "incrementalencoder";

    return codec_getincrementalcodec(encoding, errors, attrname);
}
366
367
/* Look up `encoding` and instantiate its "incrementaldecoder" factory,
   passing `errors` when it is non-NULL. */
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
                                     const char *errors)
{
    static const char attrname[] = "incrementaldecoder";

    return codec_getincrementalcodec(encoding, errors, attrname);
}
372
373
/* Instantiate element 2 of the codec tuple for `encoding` (the stream
   reader class) around `stream`, passing `errors` when it is non-NULL. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAMREADER_INDEX = 2 };

    return codec_getstreamcodec(encoding, stream, errors, STREAMREADER_INDEX);
}
379
380
/* Instantiate element 3 of the codec tuple for `encoding` (the stream
   writer class) around `stream`, passing `errors` when it is non-NULL. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAMWRITER_INDEX = 3 };

    return codec_getstreamcodec(encoding, stream, errors, STREAMWRITER_INDEX);
}
386
387
/* Best-effort annotation of the active exception with the codec operation
 * ("encoding"/"decoding") and the codec name that triggered it, without
 * changing the type of the exception being raised.
 *
 * The work is delegated to _PyErr_TrySetFromCause(), which replaces the
 * active exception with a suitably updated clone when it can and leaves
 * the original exception untouched otherwise.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
402
403
/* Encode an object (e.g. a Unicode object) using the given encoding
404
   and return the resulting encoded object (usually a Python string).
405
406
   errors is passed to the encoder factory as argument if non-NULL. */
407
408
static PyObject *
409
_PyCodec_EncodeInternal(PyObject *object,
410
                        PyObject *encoder,
411
                        const char *encoding,
412
                        const char *errors)
413
{
414
    PyObject *args = NULL, *result = NULL;
415
    PyObject *v = NULL;
416
417
    args = args_tuple(object, errors);
418
    if (args == NULL)
  Branch (418:9): [True: 0, False: 1.08M]
419
        goto onError;
420
421
    result = PyObject_Call(encoder, args, NULL);
422
    if (result == NULL) {
  Branch (422:9): [True: 63, False: 1.08M]
423
        wrap_codec_error("encoding", encoding);
424
        goto onError;
425
    }
426
427
    if (!PyTuple_Check(result) ||
  Branch (427:9): [True: 1, False: 1.08M]
428
        
PyTuple_GET_SIZE1.08M
(result) != 21.08M
) {
  Branch (428:9): [True: 0, False: 1.08M]
429
        PyErr_SetString(PyExc_TypeError,
430
                        "encoder must return a tuple (object, integer)");
431
        goto onError;
432
    }
433
    v = PyTuple_GET_ITEM(result,0);
434
    Py_INCREF(v);
435
    /* We don't check or use the second (integer) entry. */
436
437
    Py_DECREF(args);
438
    Py_DECREF(encoder);
439
    Py_DECREF(result);
440
    return v;
441
442
 onError:
443
    Py_XDECREF(result);
444
    Py_XDECREF(args);
445
    Py_XDECREF(encoder);
446
    return NULL;
447
}
448
449
/* Decode an object (usually a Python string) using the given encoding
450
   and return an equivalent object (e.g. a Unicode object).
451
452
   errors is passed to the decoder factory as argument if non-NULL. */
453
454
static PyObject *
455
_PyCodec_DecodeInternal(PyObject *object,
456
                        PyObject *decoder,
457
                        const char *encoding,
458
                        const char *errors)
459
{
460
    PyObject *args = NULL, *result = NULL;
461
    PyObject *v;
462
463
    args = args_tuple(object, errors);
464
    if (args == NULL)
  Branch (464:9): [True: 0, False: 43.5k]
465
        goto onError;
466
467
    result = PyObject_Call(decoder, args, NULL);
468
    if (result == NULL) {
  Branch (468:9): [True: 78, False: 43.5k]
469
        wrap_codec_error("decoding", encoding);
470
        goto onError;
471
    }
472
    if (!PyTuple_Check(result) ||
  Branch (472:9): [True: 1, False: 43.5k]
473
        
PyTuple_GET_SIZE43.5k
(result) != 243.5k
) {
  Branch (473:9): [True: 0, False: 43.5k]
474
        PyErr_SetString(PyExc_TypeError,
475
                        "decoder must return a tuple (object,integer)");
476
        goto onError;
477
    }
478
    v = PyTuple_GET_ITEM(result,0);
479
    Py_INCREF(v);
480
    /* We don't check or use the second (integer) entry. */
481
482
    Py_DECREF(args);
483
    Py_DECREF(decoder);
484
    Py_DECREF(result);
485
    return v;
486
487
 onError:
488
    Py_XDECREF(args);
489
    Py_XDECREF(decoder);
490
    Py_XDECREF(result);
491
    return NULL;
492
}
493
494
/* Generic encoding/decoding API */
495
/* Encode `object` with the encoder registered for `encoding`.
   Returns NULL if no encoder can be obtained or if encoding fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* The internal helper releases our reference to `encoder`. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
507
508
/* Decode `object` with the decoder registered for `encoding`.
   Returns NULL if no decoder can be obtained or if decoding fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* The internal helper releases our reference to `decoder`. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
520
521
/* Text encoding/decoding API */
522
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
523
                                       const char *alternate_command)
524
{
525
    PyObject *codec;
526
    PyObject *attr;
527
    int is_text_codec;
528
529
    codec = _PyCodec_Lookup(encoding);
530
    if (codec == NULL)
  Branch (530:9): [True: 32, False: 1.13M]
531
        return NULL;
532
533
    /* Backwards compatibility: assume any raw tuple describes a text
534
     * encoding, and the same for anything lacking the private
535
     * attribute.
536
     */
537
    if (!PyTuple_CheckExact(codec)) {
  Branch (537:9): [True: 1.13M, False: 19]
538
        if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
  Branch (538:13): [True: 0, False: 1.13M]
539
            Py_DECREF(codec);
540
            return NULL;
541
        }
542
        if (attr != NULL) {
  Branch (542:13): [True: 1.13M, False: 0]
543
            is_text_codec = PyObject_IsTrue(attr);
544
            Py_DECREF(attr);
545
            if (is_text_codec <= 0) {
  Branch (545:17): [True: 22, False: 1.13M]
546
                Py_DECREF(codec);
547
                if (!is_text_codec)
  Branch (547:21): [True: 22, False: 0]
548
                    PyErr_Format(PyExc_LookupError,
549
                                 "'%.400s' is not a text encoding; "
550
                                 "use %s to handle arbitrary codecs",
551
                                 encoding, alternate_command);
552
                return NULL;
553
            }
554
        }
555
    }
556
557
    /* This appears to be a valid text encoding */
558
    return codec;
559
}
560
561
562
static
563
PyObject *codec_getitem_checked(const char *encoding,
564
                                const char *alternate_command,
565
                                int index)
566
{
567
    PyObject *codec;
568
    PyObject *v;
569
570
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571
    if (codec == NULL)
  Branch (571:9): [True: 51, False: 1.11M]
572
        return NULL;
573
574
    v = PyTuple_GET_ITEM(codec, index);
575
    Py_INCREF(v);
576
    Py_DECREF(codec);
577
    return v;
578
}
579
580
/* Return a new reference to the encoder (tuple element 0) of the text
   encoding `encoding`; error messages point users at codecs.encode(). */
static PyObject * _PyCodec_TextEncoder(const char *encoding)
{
    enum { ENCODER_INDEX = 0 };

    return codec_getitem_checked(encoding, "codecs.encode()", ENCODER_INDEX);
}
584
585
/* Return a new reference to the decoder (tuple element 1) of the text
   encoding `encoding`; error messages point users at codecs.decode(). */
static PyObject * _PyCodec_TextDecoder(const char *encoding)
{
    enum { DECODER_INDEX = 1 };

    return codec_getitem_checked(encoding, "codecs.decode()", DECODER_INDEX);
}
589
590
/* Encode `object` with `encoding`, which must be a text encoding.
   Returns NULL if the encoder cannot be obtained or if encoding fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* The internal helper releases our reference to `encoder`. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
602
603
/* Decode `object` with `encoding`, which must be a text encoding.
   Returns NULL if the decoder cannot be obtained or if decoding fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* The internal helper releases our reference to `decoder`. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
615
616
/* Register the error handling callback function error under the name
617
   name. This function will be called by the codec when it encounters
618
   an unencodable characters/undecodable bytes and doesn't know the
619
   callback name, when name is specified as the error parameter
620
   in the call to the encode/decode function.
621
   Return 0 on success, -1 on error */
622
int PyCodec_RegisterError(const char *name, PyObject *error)
623
{
624
    PyInterpreterState *interp = _PyInterpreterState_GET();
625
    if (interp->codec_search_path == NULL && 
_PyCodecRegistry_Init()0
)
  Branch (625:9): [True: 0, False: 2.45k]
  Branch (625:46): [True: 0, False: 0]
626
        return -1;
627
    if (!PyCallable_Check(error)) {
  Branch (627:9): [True: 1, False: 2.45k]
628
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
629
        return -1;
630
    }
631
    return PyDict_SetItemString(interp->codec_error_registry,
632
                                name, error);
633
}
634
635
/* Lookup the error handling callback function registered under the
636
   name error. As a special case NULL can be passed, in which case
637
   the error handling callback for strict encoding will be returned. */
638
PyObject *PyCodec_LookupError(const char *name)
639
{
640
    PyObject *handler = NULL;
641
642
    PyInterpreterState *interp = _PyInterpreterState_GET();
643
    if (interp->codec_search_path == NULL && 
_PyCodecRegistry_Init()0
)
  Branch (643:9): [True: 0, False: 7.28k]
  Branch (643:46): [True: 0, False: 0]
644
        return NULL;
645
646
    if (name==NULL)
  Branch (646:9): [True: 1.33k, False: 5.95k]
647
        name = "strict";
648
    handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649
    if (handler) {
  Branch (649:9): [True: 7.28k, False: 3]
650
        Py_INCREF(handler);
651
    }
652
    else if (!PyErr_Occurred()) {
  Branch (652:14): [True: 3, False: 0]
653
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654
    }
655
    return handler;
656
}
657
658
/* Raise TypeError reporting that an error callback received an object of
   a type it cannot handle; the message includes the type's name. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664
665
/* "strict" error callback: re-raise `exc` as the active exception.

   Always returns NULL.  If `exc` is not an exception instance a TypeError
   is raised instead. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
    }
    else {
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    }
    return NULL;
}
673
674
675
/* "ignore" error callback: skip the offending input.

   Accepts UnicodeEncodeError, UnicodeDecodeError and UnicodeTranslateError
   instances and returns the tuple ("", end), where `end` is the end
   position read from the exception.  Any other object raises TypeError via
   wrong_exception_type().  Returns NULL on failure. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    /* Empty replacement string; processing resumes at `end`. */
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697
698
699
/* "replace" error callback: substitute a replacement marker for the
   offending input.

   - UnicodeEncodeError:    returns ('?' * (end - start), end)
   - UnicodeDecodeError:    returns (U+FFFD, end)
   - UnicodeTranslateError: returns (U+FFFD * (end - start), end)

   Any other object raises TypeError via wrong_exception_type().
   Returns a new tuple, or NULL with an exception set.

   NOTE(review): start and end are read from the exception object, and
   nothing in this function guarantees start <= end.  A negative length
   must not reach PyUnicode_New(), so it is clamped to zero and an empty
   replacement is returned for an inverted range. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end, i, len;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *res;
        Py_UCS1 *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        len = end - start;
        if (len < 0) {
            /* Inverted range: nothing to replace. */
            len = 0;
        }
        res = PyUnicode_New(len, '?');
        if (res == NULL)
            return NULL;
        assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = 0; i < len; ++i)
            outp[i] = '?';
        assert(_PyUnicode_CheckConsistency(res, 1));
        /* "N" hands our reference to `res` over to the tuple. */
        return Py_BuildValue("(Nn)", res, end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        /* A single replacement character regardless of the range length. */
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        PyObject *res;
        Py_UCS2 *outp;
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        len = end - start;
        if (len < 0) {
            /* Inverted range: nothing to replace. */
            len = 0;
        }
        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (res == NULL)
            return NULL;
        assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
        outp = PyUnicode_2BYTE_DATA(res);
        for (i = 0; i < len; i++)
            outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
        assert(_PyUnicode_CheckConsistency(res, 1));
        return Py_BuildValue("(Nn)", res, end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
751
752
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753
{
754
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
755
        PyObject *restuple;
756
        PyObject *object;
757
        Py_ssize_t i;
758
        Py_ssize_t start;
759
        Py_ssize_t end;
760
        PyObject *res;
761
        Py_UCS1 *outp;
762
        Py_ssize_t ressize;
763
        Py_UCS4 ch;
764
        if (PyUnicodeEncodeError_GetStart(exc, &start))
  Branch (764:13): [True: 0, False: 1.06k]
765
            return NULL;
766
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
  Branch (766:13): [True: 0, False: 1.06k]
767
            return NULL;
768
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
  Branch (768:13): [True: 0, False: 1.06k]
769
            return NULL;
770
        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
  Branch (770:13): [True: 0, False: 1.06k]
771
            end = start + PY_SSIZE_T_MAX / (2+7+1);
772
        for (i = start, ressize = 0; i < end; 
++i5.09k
) {
  Branch (772:38): [True: 5.09k, False: 1.06k]
773
            /* object is guaranteed to be "ready" */
774
            ch = PyUnicode_READ_CHAR(object, i);
775
            if (ch<10)
  Branch (775:17): [True: 3, False: 5.08k]
776
                ressize += 2+1+1;
777
            else if (ch<100)
  Branch (777:22): [True: 2, False: 5.08k]
778
                ressize += 2+2+1;
779
            else if (ch<1000)
  Branch (779:22): [True: 1.00k, False: 4.08k]
780
                ressize += 2+3+1;
781
            else if (ch<10000)
  Branch (781:22): [True: 60, False: 4.02k]
782
                ressize += 2+4+1;
783
            else if (ch<100000)
  Branch (783:22): [True: 4.01k, False: 9]
784
                ressize += 2+5+1;
785
            else if (ch<1000000)
  Branch (785:22): [True: 6, False: 3]
786
                ressize += 2+6+1;
787
            else
788
                ressize += 2+7+1;
789
        }
790
        /* allocate replacement */
791
        res = PyUnicode_New(ressize, 127);
792
        if (res == NULL) {
  Branch (792:13): [True: 0, False: 1.06k]
793
            Py_DECREF(object);
794
            return NULL;
795
        }
796
        outp = PyUnicode_1BYTE_DATA(res);
797
        /* generate replacement */
798
        for (i = start; i < end; 
++i5.09k
) {
  Branch (798:25): [True: 5.09k, False: 1.06k]
799
            int digits;
800
            int base;
801
            ch = PyUnicode_READ_CHAR(object, i);
802
            *outp++ = '&';
803
            *outp++ = '#';
804
            if (ch<10) {
  Branch (804:17): [True: 3, False: 5.08k]
805
                digits = 1;
806
                base = 1;
807
            }
808
            else if (ch<100) {
  Branch (808:22): [True: 2, False: 5.08k]
809
                digits = 2;
810
                base = 10;
811
            }
812
            else if (ch<1000) {
  Branch (812:22): [True: 1.00k, False: 4.08k]
813
                digits = 3;
814
                base = 100;
815
            }
816
            else if (ch<10000) {
  Branch (816:22): [True: 60, False: 4.02k]
817
                digits = 4;
818
                base = 1000;
819
            }
820
            else if (ch<100000) {
  Branch (820:22): [True: 4.01k, False: 9]
821
                digits = 5;
822
                base = 10000;
823
            }
824
            else if (ch<1000000) {
  Branch (824:22): [True: 6, False: 3]
825
                digits = 6;
826
                base = 100000;
827
            }
828
            else {
829
                digits = 7;
830
                base = 1000000;
831
            }
832
            while (digits-->0) {
  Branch (832:20): [True: 23.3k, False: 5.09k]
833
                *outp++ = '0' + ch/base;
834
                ch %= base;
835
                base /= 10;
836
            }
837
            *outp++ = ';';
838
        }
839
        assert(_PyUnicode_CheckConsistency(res, 1));
840
        restuple = Py_BuildValue("(Nn)", res, end);
841
        Py_DECREF(object);
842
        return restuple;
843
    }
844
    else {
845
        wrong_exception_type(exc);
846
        return NULL;
847
    }
848
}
849
850
/* Error handler implementing 'backslashreplace'.

   UnicodeDecodeError: every byte in [start, end) of the input bytes object
   is replaced by a four character "\xNN" escape.

   UnicodeEncodeError / UnicodeTranslateError: every character in
   [start, end) is replaced by "\xNN" (code point < 0x100), "\uNNNN"
   (code point < 0x10000) or "\UNNNNNNNN" (anything larger).

   Returns a new tuple (replacement, end).  Returns NULL if one of the
   exception attribute getters or an allocation fails, or -- after calling
   wrong_exception_type() -- if `exc` is none of the three types above. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t, not int: the range below is only capped at
       PY_SSIZE_T_MAX / 10 characters, so the sum can exceed INT_MAX. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Each offending byte expands to exactly four ASCII characters. */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case is 1+1+8 output characters per input character; shorten
       the handled range so that the total still fits in a Py_ssize_t. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the exact size of the replacement string. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: emit the escapes.  The two low hex digits are common to
       all three forms and are written after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956
957
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
958
959
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960
{
961
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
962
        PyObject *restuple;
963
        PyObject *object;
964
        Py_ssize_t i;
965
        Py_ssize_t start;
966
        Py_ssize_t end;
967
        PyObject *res;
968
        Py_UCS1 *outp;
969
        Py_ssize_t ressize;
970
        int replsize;
971
        Py_UCS4 c;
972
        char buffer[256]; /* NAME_MAXLEN */
973
        if (PyUnicodeEncodeError_GetStart(exc, &start))
  Branch (973:13): [True: 0, False: 2.02k]
974
            return NULL;
975
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
  Branch (975:13): [True: 0, False: 2.02k]
976
            return NULL;
977
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
  Branch (977:13): [True: 0, False: 2.02k]
978
            return NULL;
979
        if (!ucnhash_capi) {
  Branch (979:13): [True: 1, False: 2.02k]
980
            /* load the unicode data module */
981
            ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
982
                                            PyUnicodeData_CAPSULE_NAME, 1);
983
            if (!ucnhash_capi) {
  Branch (983:17): [True: 0, False: 1]
984
                return NULL;
985
            }
986
        }
987
        
for (i = start, ressize = 0; 2.02k
i < end;
++i10.0k
) {
  Branch (987:38): [True: 10.0k, False: 2.02k]
988
            /* object is guaranteed to be "ready" */
989
            c = PyUnicode_READ_CHAR(object, i);
990
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
  Branch (990:17): [True: 10.0k, False: 17]
991
                replsize = 1+1+1+(int)strlen(buffer)+1;
992
            }
993
            else if (c >= 0x10000) {
  Branch (993:22): [True: 4, False: 13]
994
                replsize = 1+1+8;
995
            }
996
            else if (c >= 0x100) {
  Branch (996:22): [True: 12, False: 1]
997
                replsize = 1+1+4;
998
            }
999
            else
1000
                replsize = 1+1+2;
1001
            if (ressize > PY_SSIZE_T_MAX - replsize)
  Branch (1001:17): [True: 0, False: 10.0k]
1002
                break;
1003
            ressize += replsize;
1004
        }
1005
        end = i;
1006
        res = PyUnicode_New(ressize, 127);
1007
        if (res==NULL)
  Branch (1007:13): [True: 0, False: 2.02k]
1008
            return NULL;
1009
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010
            i < end; 
++i10.0k
) {
  Branch (1010:13): [True: 10.0k, False: 2.02k]
1011
            c = PyUnicode_READ_CHAR(object, i);
1012
            *outp++ = '\\';
1013
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
  Branch (1013:17): [True: 10.0k, False: 17]
1014
                *outp++ = 'N';
1015
                *outp++ = '{';
1016
                strcpy((char *)outp, buffer);
1017
                outp += strlen(buffer);
1018
                *outp++ = '}';
1019
                continue;
1020
            }
1021
            if (c >= 0x00010000) {
  Branch (1021:17): [True: 4, False: 13]
1022
                *outp++ = 'U';
1023
                *outp++ = Py_hexdigits[(c>>28)&0xf];
1024
                *outp++ = Py_hexdigits[(c>>24)&0xf];
1025
                *outp++ = Py_hexdigits[(c>>20)&0xf];
1026
                *outp++ = Py_hexdigits[(c>>16)&0xf];
1027
                *outp++ = Py_hexdigits[(c>>12)&0xf];
1028
                *outp++ = Py_hexdigits[(c>>8)&0xf];
1029
            }
1030
            else if (c >= 0x100) {
  Branch (1030:22): [True: 12, False: 1]
1031
                *outp++ = 'u';
1032
                *outp++ = Py_hexdigits[(c>>12)&0xf];
1033
                *outp++ = Py_hexdigits[(c>>8)&0xf];
1034
            }
1035
            else
1036
                *outp++ = 'x';
1037
            *outp++ = Py_hexdigits[(c>>4)&0xf];
1038
            *outp++ = Py_hexdigits[c&0xf];
1039
        }
1040
1041
        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1042
        assert(_PyUnicode_CheckConsistency(res, 1));
1043
        restuple = Py_BuildValue("(Nn)", res, end);
1044
        Py_DECREF(object);
1045
        return restuple;
1046
    }
1047
    else {
1048
        wrong_exception_type(exc);
1049
        return NULL;
1050
    }
1051
}
1052
1053
/* Codes returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Parse the byte-order suffix that may follow "utf-16" / "utf-32".

   `suffix` points just past the digits.  An optional '-' or '_' separator
   is skipped; the remainder must be exactly "be" or "le" (any case).
   Returns 'b' or 'l' accordingly, and 0 for anything else. */
static int
get_byteorder_suffix(const char *suffix)
{
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    /* Look at suffix[0] first: if it is the terminating NUL (e.g. the name
       was "utf-16-"), suffix[1] lies beyond the string and must not be
       read. */
    int order = Py_TOLOWER(suffix[0]);
    if (order != 'b' && order != 'l') {
        return 0;
    }
    if (Py_TOLOWER(suffix[1]) != 'e' || suffix[2] != '\0') {
        return 0;
    }
    return order;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognised spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; for 16 and 32 an optional byte-order suffix (see
   get_byteorder_suffix()) may follow, and without one the native byte order
   is selected through WORDS_BIGENDIAN.  The literal name "CP_UTF8" is also
   accepted as UTF-8.

   *bytelength is set to the number of bytes a surrogate occupies in the
   encoding (3, 2 or 4).  It may also be written when ENC_UNKNOWN is
   returned, so callers must check the return value before using it. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_')
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            switch (get_byteorder_suffix(encoding)) {
            case 'b':
                return ENC_UTF16BE;
            case 'l':
                return ENC_UTF16LE;
            default:
                break;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            switch (get_byteorder_suffix(encoding)) {
            case 'b':
                return ENC_UTF32BE;
            case 'l':
                return ENC_UTF32LE;
            default:
                break;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118
1119
/* This handler is declared static until someone demonstrates
1120
   a need to call it directly. */
1121
static PyObject *
1122
PyCodec_SurrogatePassErrors(PyObject *exc)
1123
{
1124
    PyObject *restuple;
1125
    PyObject *object;
1126
    PyObject *encode;
1127
    const char *encoding;
1128
    int code;
1129
    int bytelength;
1130
    Py_ssize_t i;
1131
    Py_ssize_t start;
1132
    Py_ssize_t end;
1133
    PyObject *res;
1134
1135
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1136
        unsigned char *outp;
1137
        if (PyUnicodeEncodeError_GetStart(exc, &start))
  Branch (1137:13): [True: 0, False: 53]
1138
            return NULL;
1139
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
  Branch (1139:13): [True: 0, False: 53]
1140
            return NULL;
1141
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
  Branch (1141:13): [True: 0, False: 53]
1142
            return NULL;
1143
        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
  Branch (1143:13): [True: 0, False: 53]
1144
            Py_DECREF(object);
1145
            return NULL;
1146
        }
1147
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
  Branch (1147:13): [True: 0, False: 53]
1148
            Py_DECREF(object);
1149
            Py_DECREF(encode);
1150
            return NULL;
1151
        }
1152
        code = get_standard_encoding(encoding, &bytelength);
1153
        Py_DECREF(encode);
1154
        if (code == ENC_UNKNOWN) {
  Branch (1154:13): [True: 3, False: 50]
1155
            /* Not supported, fail with original exception */
1156
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157
            Py_DECREF(object);
1158
            return NULL;
1159
        }
1160
1161
        if (end - start > PY_SSIZE_T_MAX / bytelength)
  Branch (1161:13): [True: 0, False: 50]
1162
            end = start + PY_SSIZE_T_MAX / bytelength;
1163
        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1164
        if (!res) {
  Branch (1164:13): [True: 0, False: 50]
1165
            Py_DECREF(object);
1166
            return NULL;
1167
        }
1168
        outp = (unsigned char*)PyBytes_AsString(res);
1169
        for (i = start; i < end; 
i++50
) {
  Branch (1169:25): [True: 55, False: 45]
1170
            /* object is guaranteed to be "ready" */
1171
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1172
            if (!Py_UNICODE_IS_SURROGATE(ch)) {
  Branch (1172:17): [True: 5, False: 50]
1173
                /* Not a surrogate, fail with original exception */
1174
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175
                Py_DECREF(res);
1176
                Py_DECREF(object);
1177
                return NULL;
1178
            }
1179
            switch (code) {
  Branch (1179:21): [True: 0, False: 50]
1180
            case ENC_UTF8:
  Branch (1180:13): [True: 4, False: 46]
1181
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184
                break;
1185
            case ENC_UTF16LE:
  Branch (1185:13): [True: 14, False: 36]
1186
                *outp++ = (unsigned char) ch;
1187
                *outp++ = (unsigned char)(ch >> 8);
1188
                break;
1189
            case ENC_UTF16BE:
  Branch (1189:13): [True: 9, False: 41]
1190
                *outp++ = (unsigned char)(ch >> 8);
1191
                *outp++ = (unsigned char) ch;
1192
                break;
1193
            case ENC_UTF32LE:
  Branch (1193:13): [True: 14, False: 36]
1194
                *outp++ = (unsigned char) ch;
1195
                *outp++ = (unsigned char)(ch >> 8);
1196
                *outp++ = (unsigned char)(ch >> 16);
1197
                *outp++ = (unsigned char)(ch >> 24);
1198
                break;
1199
            case ENC_UTF32BE:
  Branch (1199:13): [True: 9, False: 41]
1200
                *outp++ = (unsigned char)(ch >> 24);
1201
                *outp++ = (unsigned char)(ch >> 16);
1202
                *outp++ = (unsigned char)(ch >> 8);
1203
                *outp++ = (unsigned char) ch;
1204
                break;
1205
            }
1206
        }
1207
        restuple = Py_BuildValue("(On)", res, end);
1208
        Py_DECREF(res);
1209
        Py_DECREF(object);
1210
        return restuple;
1211
    }
1212
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1213
        const unsigned char *p;
1214
        Py_UCS4 ch = 0;
1215
        if (PyUnicodeDecodeError_GetStart(exc, &start))
  Branch (1215:13): [True: 0, False: 608]
1216
            return NULL;
1217
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
  Branch (1217:13): [True: 0, False: 608]
1218
            return NULL;
1219
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
  Branch (1219:13): [True: 0, False: 608]
1220
            return NULL;
1221
        p = (const unsigned char*)PyBytes_AS_STRING(object);
1222
        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
  Branch (1222:13): [True: 0, False: 608]
1223
            Py_DECREF(object);
1224
            return NULL;
1225
        }
1226
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
  Branch (1226:13): [True: 0, False: 608]
1227
            Py_DECREF(object);
1228
            Py_DECREF(encode);
1229
            return NULL;
1230
        }
1231
        code = get_standard_encoding(encoding, &bytelength);
1232
        Py_DECREF(encode);
1233
        if (code == ENC_UNKNOWN) {
  Branch (1233:13): [True: 0, False: 608]
1234
            /* Not supported, fail with original exception */
1235
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236
            Py_DECREF(object);
1237
            return NULL;
1238
        }
1239
1240
        /* Try decoding a single surrogate character. If
1241
           there are more, let the codec call us again. */
1242
        p += start;
1243
        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
  Branch (1243:13): [True: 605, False: 3]
1244
            switch (code) {
  Branch (1244:21): [True: 0, False: 605]
1245
            case ENC_UTF8:
  Branch (1245:13): [True: 535, False: 70]
1246
                if ((p[0] & 0xf0) == 0xe0 &&
  Branch (1246:21): [True: 533, False: 2]
1247
                    
(p[1] & 0xc0) == 0x80533
&&
  Branch (1247:21): [True: 533, False: 0]
1248
                    
(p[2] & 0xc0) == 0x80533
) {
  Branch (1248:21): [True: 531, False: 2]
1249
                    /* it's a three-byte code */
1250
                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251
                }
1252
                break;
1253
            case ENC_UTF16LE:
  Branch (1253:13): [True: 18, False: 587]
1254
                ch = p[1] << 8 | p[0];
1255
                break;
1256
            case ENC_UTF16BE:
  Branch (1256:13): [True: 9, False: 596]
1257
                ch = p[0] << 8 | p[1];
1258
                break;
1259
            case ENC_UTF32LE:
  Branch (1259:13): [True: 30, False: 575]
1260
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261
                break;
1262
            case ENC_UTF32BE:
  Branch (1262:13): [True: 13, False: 592]
1263
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264
                break;
1265
            }
1266
        }
1267
1268
        Py_DECREF(object);
1269
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
  Branch (1269:13): [True: 11, False: 597]
1270
            /* it's not a surrogate - fail */
1271
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272
            return NULL;
1273
        }
1274
        res = PyUnicode_FromOrdinal(ch);
1275
        if (res == NULL)
  Branch (1275:13): [True: 0, False: 597]
1276
            return NULL;
1277
        return Py_BuildValue("(Nn)", res, start + bytelength);
1278
    }
1279
    else {
1280
        wrong_exception_type(exc);
1281
        return NULL;
1282
    }
1283
}
1284
1285
/* Error handler implementing 'surrogateescape'.

   UnicodeEncodeError: every character in [start, end) must lie in
   U+DC80..U+DCFF and is turned back into the byte (ch - 0xDC00); the result
   is (bytes, end).

   UnicodeDecodeError: up to four consecutive bytes >= 128, starting at
   `start`, are mapped to the code points 0xDC00 + byte; the result is
   (str, start + number_of_bytes_consumed).

   When a character or byte cannot be escaped, the original exception `exc`
   is set again and NULL is returned.  Any other exception type is passed to
   wrong_exception_type(). */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *object;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }

        /* One output byte per escaped character. */
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, end-start);
        if (bytes == NULL) {
            Py_DECREF(object);
            return NULL;
        }

        char *dst = PyBytes_AsString(bytes);
        for (Py_ssize_t pos = start; pos < end; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, pos);
            if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(bytes);
                Py_DECREF(object);
                return NULL;
            }
            *dst++ = ch - 0xdc00;
        }

        PyObject *result = Py_BuildValue("(On)", bytes, end);
        Py_DECREF(bytes);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
        int count = 0;

        if (PyUnicodeDecodeError_GetStart(exc, &start) ||
            PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }

        const unsigned char *data =
            (const unsigned char*)PyBytes_AS_STRING(object);
        for (; count < 4 && count < end-start; count++) {
            unsigned char byte = data[start+count];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128) {
                break;
            }
            escaped[count] = 0xdc00 + byte;
        }
        Py_DECREF(object);

        if (count == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }

        PyObject *text = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                                   escaped, count);
        if (text == NULL) {
            return NULL;
        }
        return Py_BuildValue("(Nn)", text, start+count);
    }

    wrong_exception_type(exc);
    return NULL;
}
1361
1362
1363
/* Callable registered under the name "strict" by _PyCodecRegistry_Init();
   forwards `exc` to PyCodec_StrictErrors().  `self` is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1367
1368
1369
/* Callable registered under the name "ignore" by _PyCodecRegistry_Init();
   forwards `exc` to PyCodec_IgnoreErrors().  `self` is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1373
1374
1375
/* Callable registered under the name "replace" by _PyCodecRegistry_Init();
   forwards `exc` to PyCodec_ReplaceErrors().  `self` is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1379
1380
1381
/* Callable registered under the name "xmlcharrefreplace" by
   _PyCodecRegistry_Init(); forwards `exc` to
   PyCodec_XMLCharRefReplaceErrors().  `self` is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1385
1386
1387
/* Callable registered under the name "backslashreplace" by
   _PyCodecRegistry_Init(); forwards `exc` to
   PyCodec_BackslashReplaceErrors().  `self` is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1391
1392
/* Callable registered under the name "namereplace" by
   _PyCodecRegistry_Init(); forwards `exc` to PyCodec_NameReplaceErrors().
   `self` is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1396
1397
/* Callable registered under the name "surrogatepass" by
   _PyCodecRegistry_Init(); forwards `exc` to PyCodec_SurrogatePassErrors().
   `self` is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1401
1402
/* Callable registered under the name "surrogateescape" by
   _PyCodecRegistry_Init(); forwards `exc` to
   PyCodec_SurrogateEscapeErrors().  `self` is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1406
1407
static int _PyCodecRegistry_Init(void)
1408
{
1409
    static struct {
1410
        const char *name;
1411
        PyMethodDef def;
1412
    } methods[] =
1413
    {
1414
        {
1415
            "strict",
1416
            {
1417
                "strict_errors",
1418
                strict_errors,
1419
                METH_O,
1420
                PyDoc_STR("Implements the 'strict' error handling, which "
1421
                          "raises a UnicodeError on coding errors.")
1422
            }
1423
        },
1424
        {
1425
            "ignore",
1426
            {
1427
                "ignore_errors",
1428
                ignore_errors,
1429
                METH_O,
1430
                PyDoc_STR("Implements the 'ignore' error handling, which "
1431
                          "ignores malformed data and continues.")
1432
            }
1433
        },
1434
        {
1435
            "replace",
1436
            {
1437
                "replace_errors",
1438
                replace_errors,
1439
                METH_O,
1440
                PyDoc_STR("Implements the 'replace' error handling, which "
1441
                          "replaces malformed data with a replacement marker.")
1442
            }
1443
        },
1444
        {
1445
            "xmlcharrefreplace",
1446
            {
1447
                "xmlcharrefreplace_errors",
1448
                xmlcharrefreplace_errors,
1449
                METH_O,
1450
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451
                          "which replaces an unencodable character with the "
1452
                          "appropriate XML character reference.")
1453
            }
1454
        },
1455
        {
1456
            "backslashreplace",
1457
            {
1458
                "backslashreplace_errors",
1459
                backslashreplace_errors,
1460
                METH_O,
1461
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1462
                          "which replaces malformed data with a backslashed "
1463
                          "escape sequence.")
1464
            }
1465
        },
1466
        {
1467
            "namereplace",
1468
            {
1469
                "namereplace_errors",
1470
                namereplace_errors,
1471
                METH_O,
1472
                PyDoc_STR("Implements the 'namereplace' error handling, "
1473
                          "which replaces an unencodable character with a "
1474
                          "\\N{...} escape sequence.")
1475
            }
1476
        },
1477
        {
1478
            "surrogatepass",
1479
            {
1480
                "surrogatepass",
1481
                surrogatepass_errors,
1482
                METH_O
1483
            }
1484
        },
1485
        {
1486
            "surrogateescape",
1487
            {
1488
                "surrogateescape",
1489
                surrogateescape_errors,
1490
                METH_O
1491
            }
1492
        }
1493
    };
1494
1495
    PyInterpreterState *interp = _PyInterpreterState_GET();
1496
    PyObject *mod;
1497
1498
    if (interp->codec_search_path != NULL)
  Branch (1498:9): [True: 0, False: 278]
1499
        return 0;
1500
1501
    interp->codec_search_path = PyList_New(0);
1502
    if (interp->codec_search_path == NULL) {
  Branch (1502:9): [True: 0, False: 278]
1503
        return -1;
1504
    }
1505
1506
    interp->codec_search_cache = PyDict_New();
1507
    if (interp->codec_search_cache == NULL) {
  Branch (1507:9): [True: 0, False: 278]
1508
        return -1;
1509
    }
1510
1511
    interp->codec_error_registry = PyDict_New();
1512
    if (interp->codec_error_registry == NULL) {
  Branch (1512:9): [True: 0, False: 278]
1513
        return -1;
1514
    }
1515
1516
    
for (size_t i = 0; 278
i < Py_ARRAY_LENGTH(methods);
++i2.22k
) {
  Branch (1516:24): [True: 2.22k, False: 278]
1517
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518
        if (!func) {
  Branch (1518:13): [True: 0, False: 2.22k]
1519
            return -1;
1520
        }
1521
1522
        int res = PyCodec_RegisterError(methods[i].name, func);
1523
        Py_DECREF(func);
1524
        if (res) {
  Branch (1524:13): [True: 0, False: 2.22k]
1525
            return -1;
1526
        }
1527
    }
1528
1529
    mod = PyImport_ImportModule("encodings");
1530
    if (mod == NULL) {
  Branch (1530:9): [True: 0, False: 278]
1531
        return -1;
1532
    }
1533
    Py_DECREF(mod);
1534
    interp->codecs_initialized = 1;
1535
    return 0;
1536
}