/home/mdboom/Work/builds/cpython/Python/codecs.c
Line | Count | Source (jump to first uncovered line) |
1 | /* ------------------------------------------------------------------------ |
2 | |
3 | Python Codec Registry and support functions |
4 | |
5 | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | |
7 | Copyright (c) Corporation for National Research Initiatives. |
8 | |
9 | ------------------------------------------------------------------------ */ |
10 | |
11 | #include "Python.h" |
12 | #include "pycore_call.h" // _PyObject_CallNoArgs() |
13 | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
14 | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
15 | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
16 | #include <ctype.h> |
17 | |
18 | const char *Py_hexdigits = "0123456789abcdef"; |
19 | |
20 | /* --- Codec Registry ----------------------------------------------------- */ |
21 | |
22 | /* Import the standard encodings package which will register the first |
23 | codec search function. |
24 | |
25 | This is done in a lazy way so that the Unicode implementation does |
26 | not downgrade startup time of scripts not needing it. |
27 | |
28 | ImportErrors are silently ignored by this function. Only one try is |
29 | made. |
30 | |
31 | */ |
32 | |
33 | static int _PyCodecRegistry_Init(void); /* Forward */ |
34 | |
35 | int PyCodec_Register(PyObject *search_function) |
36 | { |
37 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
38 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()0 ) Branch (38:9): [True: 0, False: 558]
Branch (38:46): [True: 0, False: 0]
|
39 | goto onError; |
40 | if (search_function == NULL) { Branch (40:9): [True: 0, False: 558]
|
41 | PyErr_BadArgument(); |
42 | goto onError; |
43 | } |
44 | if (!PyCallable_Check(search_function)) { Branch (44:9): [True: 1, False: 557]
|
45 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
46 | goto onError; |
47 | } |
48 | return PyList_Append(interp->codec_search_path, search_function); |
49 | |
50 | onError: |
51 | return -1; |
52 | } |
53 | |
54 | int |
55 | PyCodec_Unregister(PyObject *search_function) |
56 | { |
57 | PyInterpreterState *interp = PyInterpreterState_Get(); |
58 | PyObject *codec_search_path = interp->codec_search_path; |
59 | /* Do nothing if codec_search_path is not created yet or was cleared. */ |
60 | if (codec_search_path == NULL) { Branch (60:9): [True: 0, False: 279]
|
61 | return 0; |
62 | } |
63 | |
64 | assert(PyList_CheckExact(codec_search_path)); |
65 | Py_ssize_t n = PyList_GET_SIZE(codec_search_path); |
66 | for (Py_ssize_t i = 0; i < n; i++279 ) { Branch (66:28): [True: 558, False: 0]
|
67 | PyObject *item = PyList_GET_ITEM(codec_search_path, i); |
68 | if (item == search_function) { Branch (68:13): [True: 279, False: 279]
|
69 | if (interp->codec_search_cache != NULL) { Branch (69:17): [True: 279, False: 0]
|
70 | assert(PyDict_CheckExact(interp->codec_search_cache)); |
71 | PyDict_Clear(interp->codec_search_cache); |
72 | } |
73 | return PyList_SetSlice(codec_search_path, i, i+1, NULL); |
74 | } |
75 | } |
76 | return 0; |
77 | } |
78 | |
79 | extern int _Py_normalize_encoding(const char *, char *, size_t); |
80 | |
81 | /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are |
82 | converted to lower case, spaces and hyphens are replaced with underscores. */ |
83 | |
84 | static |
85 | PyObject *normalizestring(const char *string) |
86 | { |
87 | size_t len = strlen(string); |
88 | char *encoding; |
89 | PyObject *v; |
90 | |
91 | if (len > PY_SSIZE_T_MAX) { Branch (91:9): [True: 0, False: 1.17M]
|
92 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
93 | return NULL; |
94 | } |
95 | |
96 | encoding = PyMem_Malloc(len + 1); |
97 | if (encoding == NULL) Branch (97:9): [True: 0, False: 1.17M]
|
98 | return PyErr_NoMemory(); |
99 | |
100 | if (!_Py_normalize_encoding(string, encoding, len + 1)) Branch (100:9): [True: 0, False: 1.17M]
|
101 | { |
102 | PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); |
103 | PyMem_Free(encoding); |
104 | return NULL; |
105 | } |
106 | |
107 | v = PyUnicode_FromString(encoding); |
108 | PyMem_Free(encoding); |
109 | return v; |
110 | } |
111 | |
112 | /* Lookup the given encoding and return a tuple providing the codec |
113 | facilities. |
114 | |
115 | The encoding string is looked up converted to all lower-case |
116 | characters. This makes encodings looked up through this mechanism |
117 | effectively case-insensitive. |
118 | |
119 | If no codec is found, a LookupError is set and NULL returned. |
120 | |
121 | As side effect, this tries to load the encodings package, if not |
122 | yet done. This is part of the lazy load strategy for the encodings |
123 | package. |
124 | |
125 | */ |
126 | |
127 | PyObject *_PyCodec_Lookup(const char *encoding) |
128 | { |
129 | if (encoding == NULL) { Branch (129:9): [True: 0, False: 1.17M]
|
130 | PyErr_BadArgument(); |
131 | return NULL; |
132 | } |
133 | |
134 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
135 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()278 ) { Branch (135:9): [True: 278, False: 1.17M]
Branch (135:46): [True: 0, False: 278]
|
136 | return NULL; |
137 | } |
138 | |
139 | /* Convert the encoding to a normalized Python string: all |
140 | characters are converted to lower case, spaces and hyphens are |
141 | replaced with underscores. */ |
142 | PyObject *v = normalizestring(encoding); |
143 | if (v == NULL) { Branch (143:9): [True: 0, False: 1.17M]
|
144 | return NULL; |
145 | } |
146 | PyUnicode_InternInPlace(&v); |
147 | |
148 | /* First, try to lookup the name in the registry dictionary */ |
149 | PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v); |
150 | if (result != NULL) { Branch (150:9): [True: 1.17M, False: 972]
|
151 | Py_INCREF(result); |
152 | Py_DECREF(v); |
153 | return result; |
154 | } |
155 | else if (PyErr_Occurred()) { Branch (155:14): [True: 0, False: 972]
|
156 | goto onError; |
157 | } |
158 | |
159 | /* Next, scan the search functions in order of registration */ |
160 | const Py_ssize_t len = PyList_Size(interp->codec_search_path); |
161 | if (len < 0) Branch (161:9): [True: 0, False: 972]
|
162 | goto onError; |
163 | if (len == 0) { Branch (163:9): [True: 0, False: 972]
|
164 | PyErr_SetString(PyExc_LookupError, |
165 | "no codec search functions registered: " |
166 | "can't find encoding"); |
167 | goto onError; |
168 | } |
169 | |
170 | Py_ssize_t i; |
171 | for (i = 0; i < len; i++76 ) { Branch (171:17): [True: 1.00k, False: 47]
|
172 | PyObject *func; |
173 | |
174 | func = PyList_GetItem(interp->codec_search_path, i); |
175 | if (func == NULL) Branch (175:13): [True: 0, False: 1.00k]
|
176 | goto onError; |
177 | result = PyObject_CallOneArg(func, v); |
178 | if (result == NULL) Branch (178:13): [True: 0, False: 1.00k]
|
179 | goto onError; |
180 | if (result == Py_None) { Branch (180:13): [True: 76, False: 925]
|
181 | Py_DECREF(result); |
182 | continue; |
183 | } |
184 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE924 (result) != 4924 ) { Branch (184:13): [True: 1, False: 924]
Branch (184:39): [True: 0, False: 924]
|
185 | PyErr_SetString(PyExc_TypeError, |
186 | "codec search functions must return 4-tuples"); |
187 | Py_DECREF(result); |
188 | goto onError; |
189 | } |
190 | break; |
191 | } |
192 | if (i == len) { Branch (192:9): [True: 47, False: 924]
|
193 | /* XXX Perhaps we should cache misses too ? */ |
194 | PyErr_Format(PyExc_LookupError, |
195 | "unknown encoding: %s", encoding); |
196 | goto onError; |
197 | } |
198 | |
199 | /* Cache and return the result */ |
200 | if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { Branch (200:9): [True: 0, False: 924]
|
201 | Py_DECREF(result); |
202 | goto onError; |
203 | } |
204 | Py_DECREF(v); |
205 | return result; |
206 | |
207 | onError: |
208 | Py_DECREF(v); |
209 | return NULL; |
210 | } |
211 | |
212 | /* Codec registry encoding check API. */ |
213 | |
214 | int PyCodec_KnownEncoding(const char *encoding) |
215 | { |
216 | PyObject *codecs; |
217 |
|
218 | codecs = _PyCodec_Lookup(encoding); |
219 | if (!codecs) { Branch (219:9): [True: 0, False: 0]
|
220 | PyErr_Clear(); |
221 | return 0; |
222 | } |
223 | else { |
224 | Py_DECREF(codecs); |
225 | return 1; |
226 | } |
227 | } |
228 | |
229 | static |
230 | PyObject *args_tuple(PyObject *object, |
231 | const char *errors) |
232 | { |
233 | PyObject *args; |
234 | |
235 | args = PyTuple_New(1 + (errors != NULL)); |
236 | if (args == NULL) Branch (236:9): [True: 0, False: 1.12M]
|
237 | return NULL; |
238 | Py_INCREF(object); |
239 | PyTuple_SET_ITEM(args,0,object); |
240 | if (errors) { Branch (240:9): [True: 1.04M, False: 75.8k]
|
241 | PyObject *v; |
242 | |
243 | v = PyUnicode_FromString(errors); |
244 | if (v == NULL) { Branch (244:13): [True: 0, False: 1.04M]
|
245 | Py_DECREF(args); |
246 | return NULL; |
247 | } |
248 | PyTuple_SET_ITEM(args, 1, v); |
249 | } |
250 | return args; |
251 | } |
252 | |
253 | /* Helper function to get a codec item */ |
254 | |
255 | static |
256 | PyObject *codec_getitem(const char *encoding, int index) |
257 | { |
258 | PyObject *codecs; |
259 | PyObject *v; |
260 | |
261 | codecs = _PyCodec_Lookup(encoding); |
262 | if (codecs == NULL) Branch (262:9): [True: 3, False: 10.0k]
|
263 | return NULL; |
264 | v = PyTuple_GET_ITEM(codecs, index); |
265 | Py_DECREF(codecs); |
266 | Py_INCREF(v); |
267 | return v; |
268 | } |
269 | |
270 | /* Helper functions to create an incremental codec. */ |
271 | static |
272 | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
273 | const char *errors, |
274 | const char *attrname) |
275 | { |
276 | PyObject *ret, *inccodec; |
277 | |
278 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
279 | if (inccodec == NULL) Branch (279:9): [True: 0, False: 27.5k]
|
280 | return NULL; |
281 | if (errors) Branch (281:9): [True: 27.3k, False: 200]
|
282 | ret = PyObject_CallFunction(inccodec, "s", errors); |
283 | else |
284 | ret = _PyObject_CallNoArgs(inccodec); |
285 | Py_DECREF(inccodec); |
286 | return ret; |
287 | } |
288 | |
289 | static |
290 | PyObject *codec_getincrementalcodec(const char *encoding, |
291 | const char *errors, |
292 | const char *attrname) |
293 | { |
294 | PyObject *codec_info, *ret; |
295 | |
296 | codec_info = _PyCodec_Lookup(encoding); |
297 | if (codec_info == NULL) Branch (297:9): [True: 0, False: 398]
|
298 | return NULL; |
299 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
300 | Py_DECREF(codec_info); |
301 | return ret; |
302 | } |
303 | |
304 | /* Helper function to create a stream codec. */ |
305 | |
306 | static |
307 | PyObject *codec_getstreamcodec(const char *encoding, |
308 | PyObject *stream, |
309 | const char *errors, |
310 | const int index) |
311 | { |
312 | PyObject *codecs, *streamcodec, *codeccls; |
313 |
|
314 | codecs = _PyCodec_Lookup(encoding); |
315 | if (codecs == NULL) Branch (315:9): [True: 0, False: 0]
|
316 | return NULL; |
317 | |
318 | codeccls = PyTuple_GET_ITEM(codecs, index); |
319 | if (errors != NULL) Branch (319:9): [True: 0, False: 0]
|
320 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
321 | else |
322 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
323 | Py_DECREF(codecs); |
324 | return streamcodec; |
325 | } |
326 | |
327 | /* Helpers to work with the result of _PyCodec_Lookup |
328 | |
329 | */ |
330 | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
331 | const char *errors) |
332 | { |
333 | return codec_makeincrementalcodec(codec_info, errors, |
334 | "incrementaldecoder"); |
335 | } |
336 | |
337 | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
338 | const char *errors) |
339 | { |
340 | return codec_makeincrementalcodec(codec_info, errors, |
341 | "incrementalencoder"); |
342 | } |
343 | |
344 | |
345 | /* Convenience APIs to query the Codec registry. |
346 | |
347 | All APIs return a codec object with incremented refcount. |
348 | |
349 | */ |
350 | |
351 | PyObject *PyCodec_Encoder(const char *encoding) |
352 | { |
353 | return codec_getitem(encoding, 0); |
354 | } |
355 | |
356 | PyObject *PyCodec_Decoder(const char *encoding) |
357 | { |
358 | return codec_getitem(encoding, 1); |
359 | } |
360 | |
361 | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
362 | const char *errors) |
363 | { |
364 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
365 | } |
366 | |
367 | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
368 | const char *errors) |
369 | { |
370 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
371 | } |
372 | |
373 | PyObject *PyCodec_StreamReader(const char *encoding, |
374 | PyObject *stream, |
375 | const char *errors) |
376 | { |
377 | return codec_getstreamcodec(encoding, stream, errors, 2); |
378 | } |
379 | |
380 | PyObject *PyCodec_StreamWriter(const char *encoding, |
381 | PyObject *stream, |
382 | const char *errors) |
383 | { |
384 | return codec_getstreamcodec(encoding, stream, errors, 3); |
385 | } |
386 | |
/* Helper that tries to make the reported exception chain name the codec
 * and operation ("encoding"/"decoding") that triggered the failure,
 * without changing the type of the exception raised.
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone if it can; otherwise the original exception
     * is left alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation, encoding);
}
402 | |
403 | /* Encode an object (e.g. a Unicode object) using the given encoding |
404 | and return the resulting encoded object (usually a Python string). |
405 | |
406 | errors is passed to the encoder factory as argument if non-NULL. */ |
407 | |
408 | static PyObject * |
409 | _PyCodec_EncodeInternal(PyObject *object, |
410 | PyObject *encoder, |
411 | const char *encoding, |
412 | const char *errors) |
413 | { |
414 | PyObject *args = NULL, *result = NULL; |
415 | PyObject *v = NULL; |
416 | |
417 | args = args_tuple(object, errors); |
418 | if (args == NULL) Branch (418:9): [True: 0, False: 1.08M]
|
419 | goto onError; |
420 | |
421 | result = PyObject_Call(encoder, args, NULL); |
422 | if (result == NULL) { Branch (422:9): [True: 63, False: 1.08M]
|
423 | wrap_codec_error("encoding", encoding); |
424 | goto onError; |
425 | } |
426 | |
427 | if (!PyTuple_Check(result) || Branch (427:9): [True: 1, False: 1.08M]
|
428 | PyTuple_GET_SIZE1.08M (result) != 21.08M ) { Branch (428:9): [True: 0, False: 1.08M]
|
429 | PyErr_SetString(PyExc_TypeError, |
430 | "encoder must return a tuple (object, integer)"); |
431 | goto onError; |
432 | } |
433 | v = PyTuple_GET_ITEM(result,0); |
434 | Py_INCREF(v); |
435 | /* We don't check or use the second (integer) entry. */ |
436 | |
437 | Py_DECREF(args); |
438 | Py_DECREF(encoder); |
439 | Py_DECREF(result); |
440 | return v; |
441 | |
442 | onError: |
443 | Py_XDECREF(result); |
444 | Py_XDECREF(args); |
445 | Py_XDECREF(encoder); |
446 | return NULL; |
447 | } |
448 | |
449 | /* Decode an object (usually a Python string) using the given encoding |
450 | and return an equivalent object (e.g. a Unicode object). |
451 | |
452 | errors is passed to the decoder factory as argument if non-NULL. */ |
453 | |
454 | static PyObject * |
455 | _PyCodec_DecodeInternal(PyObject *object, |
456 | PyObject *decoder, |
457 | const char *encoding, |
458 | const char *errors) |
459 | { |
460 | PyObject *args = NULL, *result = NULL; |
461 | PyObject *v; |
462 | |
463 | args = args_tuple(object, errors); |
464 | if (args == NULL) Branch (464:9): [True: 0, False: 43.5k]
|
465 | goto onError; |
466 | |
467 | result = PyObject_Call(decoder, args, NULL); |
468 | if (result == NULL) { Branch (468:9): [True: 78, False: 43.5k]
|
469 | wrap_codec_error("decoding", encoding); |
470 | goto onError; |
471 | } |
472 | if (!PyTuple_Check(result) || Branch (472:9): [True: 1, False: 43.5k]
|
473 | PyTuple_GET_SIZE43.5k (result) != 243.5k ) { Branch (473:9): [True: 0, False: 43.5k]
|
474 | PyErr_SetString(PyExc_TypeError, |
475 | "decoder must return a tuple (object,integer)"); |
476 | goto onError; |
477 | } |
478 | v = PyTuple_GET_ITEM(result,0); |
479 | Py_INCREF(v); |
480 | /* We don't check or use the second (integer) entry. */ |
481 | |
482 | Py_DECREF(args); |
483 | Py_DECREF(decoder); |
484 | Py_DECREF(result); |
485 | return v; |
486 | |
487 | onError: |
488 | Py_XDECREF(args); |
489 | Py_XDECREF(decoder); |
490 | Py_XDECREF(result); |
491 | return NULL; |
492 | } |
493 | |
494 | /* Generic encoding/decoding API */ |
495 | PyObject *PyCodec_Encode(PyObject *object, |
496 | const char *encoding, |
497 | const char *errors) |
498 | { |
499 | PyObject *encoder; |
500 | |
501 | encoder = PyCodec_Encoder(encoding); |
502 | if (encoder == NULL) Branch (502:9): [True: 2, False: 9.98k]
|
503 | return NULL; |
504 | |
505 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
506 | } |
507 | |
508 | PyObject *PyCodec_Decode(PyObject *object, |
509 | const char *encoding, |
510 | const char *errors) |
511 | { |
512 | PyObject *decoder; |
513 | |
514 | decoder = PyCodec_Decoder(encoding); |
515 | if (decoder == NULL) Branch (515:9): [True: 1, False: 40]
|
516 | return NULL; |
517 | |
518 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
519 | } |
520 | |
521 | /* Text encoding/decoding API */ |
522 | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
523 | const char *alternate_command) |
524 | { |
525 | PyObject *codec; |
526 | PyObject *attr; |
527 | int is_text_codec; |
528 | |
529 | codec = _PyCodec_Lookup(encoding); |
530 | if (codec == NULL) Branch (530:9): [True: 32, False: 1.13M]
|
531 | return NULL; |
532 | |
533 | /* Backwards compatibility: assume any raw tuple describes a text |
534 | * encoding, and the same for anything lacking the private |
535 | * attribute. |
536 | */ |
537 | if (!PyTuple_CheckExact(codec)) { Branch (537:9): [True: 1.13M, False: 19]
|
538 | if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) { Branch (538:13): [True: 0, False: 1.13M]
|
539 | Py_DECREF(codec); |
540 | return NULL; |
541 | } |
542 | if (attr != NULL) { Branch (542:13): [True: 1.13M, False: 0]
|
543 | is_text_codec = PyObject_IsTrue(attr); |
544 | Py_DECREF(attr); |
545 | if (is_text_codec <= 0) { Branch (545:17): [True: 22, False: 1.13M]
|
546 | Py_DECREF(codec); |
547 | if (!is_text_codec) Branch (547:21): [True: 22, False: 0]
|
548 | PyErr_Format(PyExc_LookupError, |
549 | "'%.400s' is not a text encoding; " |
550 | "use %s to handle arbitrary codecs", |
551 | encoding, alternate_command); |
552 | return NULL; |
553 | } |
554 | } |
555 | } |
556 | |
557 | /* This appears to be a valid text encoding */ |
558 | return codec; |
559 | } |
560 | |
561 | |
562 | static |
563 | PyObject *codec_getitem_checked(const char *encoding, |
564 | const char *alternate_command, |
565 | int index) |
566 | { |
567 | PyObject *codec; |
568 | PyObject *v; |
569 | |
570 | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
571 | if (codec == NULL) Branch (571:9): [True: 51, False: 1.11M]
|
572 | return NULL; |
573 | |
574 | v = PyTuple_GET_ITEM(codec, index); |
575 | Py_INCREF(v); |
576 | Py_DECREF(codec); |
577 | return v; |
578 | } |
579 | |
580 | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
581 | { |
582 | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
583 | } |
584 | |
585 | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
586 | { |
587 | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
588 | } |
589 | |
590 | PyObject *_PyCodec_EncodeText(PyObject *object, |
591 | const char *encoding, |
592 | const char *errors) |
593 | { |
594 | PyObject *encoder; |
595 | |
596 | encoder = _PyCodec_TextEncoder(encoding); |
597 | if (encoder == NULL) Branch (597:9): [True: 12, False: 1.07M]
|
598 | return NULL; |
599 | |
600 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
601 | } |
602 | |
603 | PyObject *_PyCodec_DecodeText(PyObject *object, |
604 | const char *encoding, |
605 | const char *errors) |
606 | { |
607 | PyObject *decoder; |
608 | |
609 | decoder = _PyCodec_TextDecoder(encoding); |
610 | if (decoder == NULL) Branch (610:9): [True: 39, False: 43.5k]
|
611 | return NULL; |
612 | |
613 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
614 | } |
615 | |
616 | /* Register the error handling callback function error under the name |
617 | name. This function will be called by the codec when it encounters |
618 | an unencodable characters/undecodable bytes and doesn't know the |
619 | callback name, when name is specified as the error parameter |
620 | in the call to the encode/decode function. |
621 | Return 0 on success, -1 on error */ |
622 | int PyCodec_RegisterError(const char *name, PyObject *error) |
623 | { |
624 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
625 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()0 ) Branch (625:9): [True: 0, False: 2.45k]
Branch (625:46): [True: 0, False: 0]
|
626 | return -1; |
627 | if (!PyCallable_Check(error)) { Branch (627:9): [True: 1, False: 2.45k]
|
628 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
629 | return -1; |
630 | } |
631 | return PyDict_SetItemString(interp->codec_error_registry, |
632 | name, error); |
633 | } |
634 | |
635 | /* Lookup the error handling callback function registered under the |
636 | name error. As a special case NULL can be passed, in which case |
637 | the error handling callback for strict encoding will be returned. */ |
638 | PyObject *PyCodec_LookupError(const char *name) |
639 | { |
640 | PyObject *handler = NULL; |
641 | |
642 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
643 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()0 ) Branch (643:9): [True: 0, False: 7.28k]
Branch (643:46): [True: 0, False: 0]
|
644 | return NULL; |
645 | |
646 | if (name==NULL) Branch (646:9): [True: 1.33k, False: 5.95k]
|
647 | name = "strict"; |
648 | handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name); |
649 | if (handler) { Branch (649:9): [True: 7.28k, False: 3]
|
650 | Py_INCREF(handler); |
651 | } |
652 | else if (!PyErr_Occurred()) { Branch (652:14): [True: 3, False: 0]
|
653 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
654 | } |
655 | return handler; |
656 | } |
657 | |
658 | static void wrong_exception_type(PyObject *exc) |
659 | { |
660 | PyErr_Format(PyExc_TypeError, |
661 | "don't know how to handle %.200s in error callback", |
662 | Py_TYPE(exc)->tp_name); |
663 | } |
664 | |
665 | PyObject *PyCodec_StrictErrors(PyObject *exc) |
666 | { |
667 | if (PyExceptionInstance_Check(exc)) |
668 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
669 | else |
670 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
671 | return NULL; |
672 | } |
673 | |
674 | |
675 | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
676 | { |
677 | Py_ssize_t end; |
678 | |
679 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
680 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (680:13): [True: 0, False: 3.42k]
|
681 | return NULL; |
682 | } |
683 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
684 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) Branch (684:13): [True: 0, False: 580]
|
685 | return NULL; |
686 | } |
687 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
688 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) Branch (688:13): [True: 0, False: 1]
|
689 | return NULL; |
690 | } |
691 | else { |
692 | wrong_exception_type(exc); |
693 | return NULL; |
694 | } |
695 | return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); |
696 | } |
697 | |
698 | |
699 | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
700 | { |
701 | Py_ssize_t start, end, i, len; |
702 | |
703 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
704 | PyObject *res; |
705 | Py_UCS1 *outp; |
706 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (706:13): [True: 1, False: 1.02k]
|
707 | return NULL; |
708 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (708:13): [True: 0, False: 1.02k]
|
709 | return NULL; |
710 | len = end - start; |
711 | res = PyUnicode_New(len, '?'); |
712 | if (res == NULL) Branch (712:13): [True: 0, False: 1.02k]
|
713 | return NULL; |
714 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
715 | outp = PyUnicode_1BYTE_DATA(res); |
716 | for (i = 0; i < len; ++i5.01k ) Branch (716:21): [True: 5.01k, False: 1.02k]
|
717 | outp[i] = '?'; |
718 | assert(_PyUnicode_CheckConsistency(res, 1)); |
719 | return Py_BuildValue("(Nn)", res, end); |
720 | } |
721 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
722 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) Branch (722:13): [True: 1, False: 932]
|
723 | return NULL; |
724 | return Py_BuildValue("(Cn)", |
725 | (int)Py_UNICODE_REPLACEMENT_CHARACTER, |
726 | end); |
727 | } |
728 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
729 | PyObject *res; |
730 | Py_UCS2 *outp; |
731 | if (PyUnicodeTranslateError_GetStart(exc, &start)) Branch (731:13): [True: 0, False: 1]
|
732 | return NULL; |
733 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) Branch (733:13): [True: 0, False: 1]
|
734 | return NULL; |
735 | len = end - start; |
736 | res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); |
737 | if (res == NULL) Branch (737:13): [True: 0, False: 1]
|
738 | return NULL; |
739 | assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
740 | outp = PyUnicode_2BYTE_DATA(res); |
741 | for (i = 0; i < len; i++1 ) Branch (741:21): [True: 1, False: 1]
|
742 | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
743 | assert(_PyUnicode_CheckConsistency(res, 1)); |
744 | return Py_BuildValue("(Nn)", res, end); |
745 | } |
746 | else { |
747 | wrong_exception_type(exc); |
748 | return NULL; |
749 | } |
750 | } |
751 | |
752 | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
753 | { |
754 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
755 | PyObject *restuple; |
756 | PyObject *object; |
757 | Py_ssize_t i; |
758 | Py_ssize_t start; |
759 | Py_ssize_t end; |
760 | PyObject *res; |
761 | Py_UCS1 *outp; |
762 | Py_ssize_t ressize; |
763 | Py_UCS4 ch; |
764 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (764:13): [True: 0, False: 1.06k]
|
765 | return NULL; |
766 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (766:13): [True: 0, False: 1.06k]
|
767 | return NULL; |
768 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) Branch (768:13): [True: 0, False: 1.06k]
|
769 | return NULL; |
770 | if (end - start > PY_SSIZE_T_MAX / (2+7+1)) Branch (770:13): [True: 0, False: 1.06k]
|
771 | end = start + PY_SSIZE_T_MAX / (2+7+1); |
772 | for (i = start, ressize = 0; i < end; ++i5.09k ) { Branch (772:38): [True: 5.09k, False: 1.06k]
|
773 | /* object is guaranteed to be "ready" */ |
774 | ch = PyUnicode_READ_CHAR(object, i); |
775 | if (ch<10) Branch (775:17): [True: 3, False: 5.08k]
|
776 | ressize += 2+1+1; |
777 | else if (ch<100) Branch (777:22): [True: 2, False: 5.08k]
|
778 | ressize += 2+2+1; |
779 | else if (ch<1000) Branch (779:22): [True: 1.00k, False: 4.08k]
|
780 | ressize += 2+3+1; |
781 | else if (ch<10000) Branch (781:22): [True: 60, False: 4.02k]
|
782 | ressize += 2+4+1; |
783 | else if (ch<100000) Branch (783:22): [True: 4.01k, False: 9]
|
784 | ressize += 2+5+1; |
785 | else if (ch<1000000) Branch (785:22): [True: 6, False: 3]
|
786 | ressize += 2+6+1; |
787 | else |
788 | ressize += 2+7+1; |
789 | } |
790 | /* allocate replacement */ |
791 | res = PyUnicode_New(ressize, 127); |
792 | if (res == NULL) { Branch (792:13): [True: 0, False: 1.06k]
|
793 | Py_DECREF(object); |
794 | return NULL; |
795 | } |
796 | outp = PyUnicode_1BYTE_DATA(res); |
797 | /* generate replacement */ |
798 | for (i = start; i < end; ++i5.09k ) { Branch (798:25): [True: 5.09k, False: 1.06k]
|
799 | int digits; |
800 | int base; |
801 | ch = PyUnicode_READ_CHAR(object, i); |
802 | *outp++ = '&'; |
803 | *outp++ = '#'; |
804 | if (ch<10) { Branch (804:17): [True: 3, False: 5.08k]
|
805 | digits = 1; |
806 | base = 1; |
807 | } |
808 | else if (ch<100) { Branch (808:22): [True: 2, False: 5.08k]
|
809 | digits = 2; |
810 | base = 10; |
811 | } |
812 | else if (ch<1000) { Branch (812:22): [True: 1.00k, False: 4.08k]
|
813 | digits = 3; |
814 | base = 100; |
815 | } |
816 | else if (ch<10000) { Branch (816:22): [True: 60, False: 4.02k]
|
817 | digits = 4; |
818 | base = 1000; |
819 | } |
820 | else if (ch<100000) { Branch (820:22): [True: 4.01k, False: 9]
|
821 | digits = 5; |
822 | base = 10000; |
823 | } |
824 | else if (ch<1000000) { Branch (824:22): [True: 6, False: 3]
|
825 | digits = 6; |
826 | base = 100000; |
827 | } |
828 | else { |
829 | digits = 7; |
830 | base = 1000000; |
831 | } |
832 | while (digits-->0) { Branch (832:20): [True: 23.3k, False: 5.09k]
|
833 | *outp++ = '0' + ch/base; |
834 | ch %= base; |
835 | base /= 10; |
836 | } |
837 | *outp++ = ';'; |
838 | } |
839 | assert(_PyUnicode_CheckConsistency(res, 1)); |
840 | restuple = Py_BuildValue("(Nn)", res, end); |
841 | Py_DECREF(object); |
842 | return restuple; |
843 | } |
844 | else { |
845 | wrong_exception_type(exc); |
846 | return NULL; |
847 | } |
848 | } |
849 | |
850 | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
851 | { |
852 | PyObject *object; |
853 | Py_ssize_t i; |
854 | Py_ssize_t start; |
855 | Py_ssize_t end; |
856 | PyObject *res; |
857 | Py_UCS1 *outp; |
858 | int ressize; |
859 | Py_UCS4 c; |
860 | |
861 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
862 | const unsigned char *p; |
863 | if (PyUnicodeDecodeError_GetStart(exc, &start)) Branch (863:13): [True: 0, False: 60]
|
864 | return NULL; |
865 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) Branch (865:13): [True: 0, False: 60]
|
866 | return NULL; |
867 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) Branch (867:13): [True: 0, False: 60]
|
868 | return NULL; |
869 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
870 | res = PyUnicode_New(4 * (end - start), 127); |
871 | if (res == NULL) { Branch (871:13): [True: 0, False: 60]
|
872 | Py_DECREF(object); |
873 | return NULL; |
874 | } |
875 | outp = PyUnicode_1BYTE_DATA(res); |
876 | for (i = start; i < end; i++, outp += 4114 ) { Branch (876:25): [True: 114, False: 60]
|
877 | unsigned char c = p[i]; |
878 | outp[0] = '\\'; |
879 | outp[1] = 'x'; |
880 | outp[2] = Py_hexdigits[(c>>4)&0xf]; |
881 | outp[3] = Py_hexdigits[c&0xf]; |
882 | } |
883 | |
884 | assert(_PyUnicode_CheckConsistency(res, 1)); |
885 | Py_DECREF(object); |
886 | return Py_BuildValue("(Nn)", res, end); |
887 | } |
888 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
889 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (889:13): [True: 0, False: 1.04k]
|
890 | return NULL; |
891 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (891:13): [True: 0, False: 1.04k]
|
892 | return NULL; |
893 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) Branch (893:13): [True: 0, False: 1.04k]
|
894 | return NULL; |
895 | } |
896 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
897 | if (PyUnicodeTranslateError_GetStart(exc, &start)) Branch (897:13): [True: 0, False: 12]
|
898 | return NULL; |
899 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) Branch (899:13): [True: 0, False: 12]
|
900 | return NULL; |
901 | if (!(object = PyUnicodeTranslateError_GetObject(exc))) Branch (901:13): [True: 0, False: 12]
|
902 | return NULL; |
903 | } |
904 | else { |
905 | wrong_exception_type(exc); |
906 | return NULL; |
907 | } |
908 | |
909 | if (end - start > PY_SSIZE_T_MAX / (1+1+8)) Branch (909:9): [True: 0, False: 1.05k]
|
910 | end = start + PY_SSIZE_T_MAX / (1+1+8); |
911 | for (i = start, ressize = 0; i < end; ++i6.05k ) { Branch (911:34): [True: 6.05k, False: 1.05k]
|
912 | /* object is guaranteed to be "ready" */ |
913 | c = PyUnicode_READ_CHAR(object, i); |
914 | if (c >= 0x10000) { Branch (914:13): [True: 7, False: 6.04k]
|
915 | ressize += 1+1+8; |
916 | } |
917 | else if (c >= 0x100) { Branch (917:18): [True: 5.03k, False: 1.01k]
|
918 | ressize += 1+1+4; |
919 | } |
920 | else |
921 | ressize += 1+1+2; |
922 | } |
923 | res = PyUnicode_New(ressize, 127); |
924 | if (res == NULL) { Branch (924:9): [True: 0, False: 1.05k]
|
925 | Py_DECREF(object); |
926 | return NULL; |
927 | } |
928 | outp = PyUnicode_1BYTE_DATA(res); |
929 | for (i = start; i < end; ++i6.05k ) { Branch (929:21): [True: 6.05k, False: 1.05k]
|
930 | c = PyUnicode_READ_CHAR(object, i); |
931 | *outp++ = '\\'; |
932 | if (c >= 0x00010000) { Branch (932:13): [True: 7, False: 6.04k]
|
933 | *outp++ = 'U'; |
934 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
935 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
936 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
937 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
938 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
939 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
940 | } |
941 | else if (c >= 0x100) { Branch (941:18): [True: 5.03k, False: 1.01k]
|
942 | *outp++ = 'u'; |
943 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
944 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
945 | } |
946 | else |
947 | *outp++ = 'x'; |
948 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
949 | *outp++ = Py_hexdigits[c&0xf]; |
950 | } |
951 | |
952 | assert(_PyUnicode_CheckConsistency(res, 1)); |
953 | Py_DECREF(object); |
954 | return Py_BuildValue("(Nn)", res, end); |
955 | } |
956 | |
957 | static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; |
958 | |
959 | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
960 | { |
961 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
962 | PyObject *restuple; |
963 | PyObject *object; |
964 | Py_ssize_t i; |
965 | Py_ssize_t start; |
966 | Py_ssize_t end; |
967 | PyObject *res; |
968 | Py_UCS1 *outp; |
969 | Py_ssize_t ressize; |
970 | int replsize; |
971 | Py_UCS4 c; |
972 | char buffer[256]; /* NAME_MAXLEN */ |
973 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (973:13): [True: 0, False: 2.02k]
|
974 | return NULL; |
975 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (975:13): [True: 0, False: 2.02k]
|
976 | return NULL; |
977 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) Branch (977:13): [True: 0, False: 2.02k]
|
978 | return NULL; |
979 | if (!ucnhash_capi) { Branch (979:13): [True: 1, False: 2.02k]
|
980 | /* load the unicode data module */ |
981 | ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import( |
982 | PyUnicodeData_CAPSULE_NAME, 1); |
983 | if (!ucnhash_capi) { Branch (983:17): [True: 0, False: 1]
|
984 | return NULL; |
985 | } |
986 | } |
987 | for (i = start, ressize = 0; 2.02k i < end; ++i10.0k ) { Branch (987:38): [True: 10.0k, False: 2.02k]
|
988 | /* object is guaranteed to be "ready" */ |
989 | c = PyUnicode_READ_CHAR(object, i); |
990 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { Branch (990:17): [True: 10.0k, False: 17]
|
991 | replsize = 1+1+1+(int)strlen(buffer)+1; |
992 | } |
993 | else if (c >= 0x10000) { Branch (993:22): [True: 4, False: 13]
|
994 | replsize = 1+1+8; |
995 | } |
996 | else if (c >= 0x100) { Branch (996:22): [True: 12, False: 1]
|
997 | replsize = 1+1+4; |
998 | } |
999 | else |
1000 | replsize = 1+1+2; |
1001 | if (ressize > PY_SSIZE_T_MAX - replsize) Branch (1001:17): [True: 0, False: 10.0k]
|
1002 | break; |
1003 | ressize += replsize; |
1004 | } |
1005 | end = i; |
1006 | res = PyUnicode_New(ressize, 127); |
1007 | if (res==NULL) Branch (1007:13): [True: 0, False: 2.02k]
|
1008 | return NULL; |
1009 | for (i = start, outp = PyUnicode_1BYTE_DATA(res); |
1010 | i < end; ++i10.0k ) { Branch (1010:13): [True: 10.0k, False: 2.02k]
|
1011 | c = PyUnicode_READ_CHAR(object, i); |
1012 | *outp++ = '\\'; |
1013 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { Branch (1013:17): [True: 10.0k, False: 17]
|
1014 | *outp++ = 'N'; |
1015 | *outp++ = '{'; |
1016 | strcpy((char *)outp, buffer); |
1017 | outp += strlen(buffer); |
1018 | *outp++ = '}'; |
1019 | continue; |
1020 | } |
1021 | if (c >= 0x00010000) { Branch (1021:17): [True: 4, False: 13]
|
1022 | *outp++ = 'U'; |
1023 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
1024 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
1025 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
1026 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
1027 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1028 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1029 | } |
1030 | else if (c >= 0x100) { Branch (1030:22): [True: 12, False: 1]
|
1031 | *outp++ = 'u'; |
1032 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1033 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1034 | } |
1035 | else |
1036 | *outp++ = 'x'; |
1037 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
1038 | *outp++ = Py_hexdigits[c&0xf]; |
1039 | } |
1040 | |
1041 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1042 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1043 | restuple = Py_BuildValue("(Nn)", res, end); |
1044 | Py_DECREF(object); |
1045 | return restuple; |
1046 | } |
1047 | else { |
1048 | wrong_exception_type(exc); |
1049 | return NULL; |
1050 | } |
1051 | } |
1052 | |
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* constants.

   Recognised spellings (the "utf" prefix and the byte-order letters are
   matched case-insensitively; the separators '-' and '_' are optional):
     utf-8, utf-16, utf-16-le, utf-16-be, utf-32, utf-32-le, utf-32-be
   plus the exact string "CP_UTF8".  A name without a byte-order suffix
   selects the native byte order according to WORDS_BIGENDIAN.

   *bytelength receives the number of bytes used per code unit handled
   by the caller (3 for UTF-8, 2 for UTF-16, 4 for UTF-32).  It is also
   written for names starting with "utf-16"/"utf-32" that are ultimately
   rejected, and left untouched for every other unknown name, so callers
   must check the return value before using it.

   Returns ENC_UNKNOWN when the name is not recognised. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* The encoding[0] test keeps encoding[1] inside the string
               when the name ends right after the separator. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16 above. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118 | |
1119 | /* This handler is declared static until someone demonstrates |
1120 | a need to call it directly. */ |
1121 | static PyObject * |
1122 | PyCodec_SurrogatePassErrors(PyObject *exc) |
1123 | { |
1124 | PyObject *restuple; |
1125 | PyObject *object; |
1126 | PyObject *encode; |
1127 | const char *encoding; |
1128 | int code; |
1129 | int bytelength; |
1130 | Py_ssize_t i; |
1131 | Py_ssize_t start; |
1132 | Py_ssize_t end; |
1133 | PyObject *res; |
1134 | |
1135 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1136 | unsigned char *outp; |
1137 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (1137:13): [True: 0, False: 53]
|
1138 | return NULL; |
1139 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (1139:13): [True: 0, False: 53]
|
1140 | return NULL; |
1141 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) Branch (1141:13): [True: 0, False: 53]
|
1142 | return NULL; |
1143 | if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { Branch (1143:13): [True: 0, False: 53]
|
1144 | Py_DECREF(object); |
1145 | return NULL; |
1146 | } |
1147 | if (!(encoding = PyUnicode_AsUTF8(encode))) { Branch (1147:13): [True: 0, False: 53]
|
1148 | Py_DECREF(object); |
1149 | Py_DECREF(encode); |
1150 | return NULL; |
1151 | } |
1152 | code = get_standard_encoding(encoding, &bytelength); |
1153 | Py_DECREF(encode); |
1154 | if (code == ENC_UNKNOWN) { Branch (1154:13): [True: 3, False: 50]
|
1155 | /* Not supported, fail with original exception */ |
1156 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1157 | Py_DECREF(object); |
1158 | return NULL; |
1159 | } |
1160 | |
1161 | if (end - start > PY_SSIZE_T_MAX / bytelength) Branch (1161:13): [True: 0, False: 50]
|
1162 | end = start + PY_SSIZE_T_MAX / bytelength; |
1163 | res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); |
1164 | if (!res) { Branch (1164:13): [True: 0, False: 50]
|
1165 | Py_DECREF(object); |
1166 | return NULL; |
1167 | } |
1168 | outp = (unsigned char*)PyBytes_AsString(res); |
1169 | for (i = start; i < end; i++50 ) { Branch (1169:25): [True: 55, False: 45]
|
1170 | /* object is guaranteed to be "ready" */ |
1171 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1172 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (1172:17): [True: 5, False: 50]
|
1173 | /* Not a surrogate, fail with original exception */ |
1174 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1175 | Py_DECREF(res); |
1176 | Py_DECREF(object); |
1177 | return NULL; |
1178 | } |
1179 | switch (code) { Branch (1179:21): [True: 0, False: 50]
|
1180 | case ENC_UTF8: Branch (1180:13): [True: 4, False: 46]
|
1181 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1182 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1183 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1184 | break; |
1185 | case ENC_UTF16LE: Branch (1185:13): [True: 14, False: 36]
|
1186 | *outp++ = (unsigned char) ch; |
1187 | *outp++ = (unsigned char)(ch >> 8); |
1188 | break; |
1189 | case ENC_UTF16BE: Branch (1189:13): [True: 9, False: 41]
|
1190 | *outp++ = (unsigned char)(ch >> 8); |
1191 | *outp++ = (unsigned char) ch; |
1192 | break; |
1193 | case ENC_UTF32LE: Branch (1193:13): [True: 14, False: 36]
|
1194 | *outp++ = (unsigned char) ch; |
1195 | *outp++ = (unsigned char)(ch >> 8); |
1196 | *outp++ = (unsigned char)(ch >> 16); |
1197 | *outp++ = (unsigned char)(ch >> 24); |
1198 | break; |
1199 | case ENC_UTF32BE: Branch (1199:13): [True: 9, False: 41]
|
1200 | *outp++ = (unsigned char)(ch >> 24); |
1201 | *outp++ = (unsigned char)(ch >> 16); |
1202 | *outp++ = (unsigned char)(ch >> 8); |
1203 | *outp++ = (unsigned char) ch; |
1204 | break; |
1205 | } |
1206 | } |
1207 | restuple = Py_BuildValue("(On)", res, end); |
1208 | Py_DECREF(res); |
1209 | Py_DECREF(object); |
1210 | return restuple; |
1211 | } |
1212 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1213 | const unsigned char *p; |
1214 | Py_UCS4 ch = 0; |
1215 | if (PyUnicodeDecodeError_GetStart(exc, &start)) Branch (1215:13): [True: 0, False: 608]
|
1216 | return NULL; |
1217 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) Branch (1217:13): [True: 0, False: 608]
|
1218 | return NULL; |
1219 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) Branch (1219:13): [True: 0, False: 608]
|
1220 | return NULL; |
1221 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1222 | if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { Branch (1222:13): [True: 0, False: 608]
|
1223 | Py_DECREF(object); |
1224 | return NULL; |
1225 | } |
1226 | if (!(encoding = PyUnicode_AsUTF8(encode))) { Branch (1226:13): [True: 0, False: 608]
|
1227 | Py_DECREF(object); |
1228 | Py_DECREF(encode); |
1229 | return NULL; |
1230 | } |
1231 | code = get_standard_encoding(encoding, &bytelength); |
1232 | Py_DECREF(encode); |
1233 | if (code == ENC_UNKNOWN) { Branch (1233:13): [True: 0, False: 608]
|
1234 | /* Not supported, fail with original exception */ |
1235 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1236 | Py_DECREF(object); |
1237 | return NULL; |
1238 | } |
1239 | |
1240 | /* Try decoding a single surrogate character. If |
1241 | there are more, let the codec call us again. */ |
1242 | p += start; |
1243 | if (PyBytes_GET_SIZE(object) - start >= bytelength) { Branch (1243:13): [True: 605, False: 3]
|
1244 | switch (code) { Branch (1244:21): [True: 0, False: 605]
|
1245 | case ENC_UTF8: Branch (1245:13): [True: 535, False: 70]
|
1246 | if ((p[0] & 0xf0) == 0xe0 && Branch (1246:21): [True: 533, False: 2]
|
1247 | (p[1] & 0xc0) == 0x80533 && Branch (1247:21): [True: 533, False: 0]
|
1248 | (p[2] & 0xc0) == 0x80533 ) { Branch (1248:21): [True: 531, False: 2]
|
1249 | /* it's a three-byte code */ |
1250 | ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); |
1251 | } |
1252 | break; |
1253 | case ENC_UTF16LE: Branch (1253:13): [True: 18, False: 587]
|
1254 | ch = p[1] << 8 | p[0]; |
1255 | break; |
1256 | case ENC_UTF16BE: Branch (1256:13): [True: 9, False: 596]
|
1257 | ch = p[0] << 8 | p[1]; |
1258 | break; |
1259 | case ENC_UTF32LE: Branch (1259:13): [True: 30, False: 575]
|
1260 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1261 | break; |
1262 | case ENC_UTF32BE: Branch (1262:13): [True: 13, False: 592]
|
1263 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1264 | break; |
1265 | } |
1266 | } |
1267 | |
1268 | Py_DECREF(object); |
1269 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (1269:13): [True: 11, False: 597]
|
1270 | /* it's not a surrogate - fail */ |
1271 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1272 | return NULL; |
1273 | } |
1274 | res = PyUnicode_FromOrdinal(ch); |
1275 | if (res == NULL) Branch (1275:13): [True: 0, False: 597]
|
1276 | return NULL; |
1277 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1278 | } |
1279 | else { |
1280 | wrong_exception_type(exc); |
1281 | return NULL; |
1282 | } |
1283 | } |
1284 | |
1285 | static PyObject * |
1286 | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1287 | { |
1288 | PyObject *restuple; |
1289 | PyObject *object; |
1290 | Py_ssize_t i; |
1291 | Py_ssize_t start; |
1292 | Py_ssize_t end; |
1293 | PyObject *res; |
1294 | |
1295 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1296 | char *outp; |
1297 | if (PyUnicodeEncodeError_GetStart(exc, &start)) Branch (1297:13): [True: 0, False: 65]
|
1298 | return NULL; |
1299 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) Branch (1299:13): [True: 0, False: 65]
|
1300 | return NULL; |
1301 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) Branch (1301:13): [True: 0, False: 65]
|
1302 | return NULL; |
1303 | res = PyBytes_FromStringAndSize(NULL, end-start); |
1304 | if (!res) { Branch (1304:13): [True: 0, False: 65]
|
1305 | Py_DECREF(object); |
1306 | return NULL; |
1307 | } |
1308 | outp = PyBytes_AsString(res); |
1309 | for (i = start; i < end; i++2 ) { Branch (1309:25): [True: 65, False: 2]
|
1310 | /* object is guaranteed to be "ready" */ |
1311 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1312 | if (ch < 0xdc80 || ch > 0xdcff49 ) { Branch (1312:17): [True: 16, False: 49]
Branch (1312:32): [True: 47, False: 2]
|
1313 | /* Not a UTF-8b surrogate, fail with original exception */ |
1314 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1315 | Py_DECREF(res); |
1316 | Py_DECREF(object); |
1317 | return NULL; |
1318 | } |
1319 | *outp++ = ch - 0xdc00; |
1320 | } |
1321 | restuple = Py_BuildValue("(On)", res, end); |
1322 | Py_DECREF(res); |
1323 | Py_DECREF(object); |
1324 | return restuple; |
1325 | } |
1326 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1327 | PyObject *str; |
1328 | const unsigned char *p; |
1329 | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1330 | int consumed = 0; |
1331 | if (PyUnicodeDecodeError_GetStart(exc, &start)) Branch (1331:13): [True: 0, False: 3]
|
1332 | return NULL; |
1333 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) Branch (1333:13): [True: 0, False: 3]
|
1334 | return NULL; |
1335 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) Branch (1335:13): [True: 0, False: 3]
|
1336 | return NULL; |
1337 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1338 | while (consumed < 4 && consumed < end-start) { Branch (1338:16): [True: 5, False: 0]
Branch (1338:32): [True: 3, False: 2]
|
1339 | /* Refuse to escape ASCII bytes. */ |
1340 | if (p[start+consumed] < 128) Branch (1340:17): [True: 1, False: 2]
|
1341 | break; |
1342 | ch[consumed] = 0xdc00 + p[start+consumed]; |
1343 | consumed++; |
1344 | } |
1345 | Py_DECREF(object); |
1346 | if (!consumed) { Branch (1346:13): [True: 1, False: 2]
|
1347 | /* codec complained about ASCII byte. */ |
1348 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1349 | return NULL; |
1350 | } |
1351 | str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1352 | if (str == NULL) Branch (1352:13): [True: 0, False: 2]
|
1353 | return NULL; |
1354 | return Py_BuildValue("(Nn)", str, start+consumed); |
1355 | } |
1356 | else { |
1357 | wrong_exception_type(exc); |
1358 | return NULL; |
1359 | } |
1360 | } |
1361 | |
1362 | |
1363 | static PyObject *strict_errors(PyObject *self, PyObject *exc) |
1364 | { |
1365 | return PyCodec_StrictErrors(exc); |
1366 | } |
1367 | |
1368 | |
1369 | static PyObject *ignore_errors(PyObject *self, PyObject *exc) |
1370 | { |
1371 | return PyCodec_IgnoreErrors(exc); |
1372 | } |
1373 | |
1374 | |
1375 | static PyObject *replace_errors(PyObject *self, PyObject *exc) |
1376 | { |
1377 | return PyCodec_ReplaceErrors(exc); |
1378 | } |
1379 | |
1380 | |
1381 | static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) |
1382 | { |
1383 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1384 | } |
1385 | |
1386 | |
1387 | static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) |
1388 | { |
1389 | return PyCodec_BackslashReplaceErrors(exc); |
1390 | } |
1391 | |
1392 | static PyObject *namereplace_errors(PyObject *self, PyObject *exc) |
1393 | { |
1394 | return PyCodec_NameReplaceErrors(exc); |
1395 | } |
1396 | |
1397 | static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) |
1398 | { |
1399 | return PyCodec_SurrogatePassErrors(exc); |
1400 | } |
1401 | |
1402 | static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) |
1403 | { |
1404 | return PyCodec_SurrogateEscapeErrors(exc); |
1405 | } |
1406 | |
1407 | static int _PyCodecRegistry_Init(void) |
1408 | { |
1409 | static struct { |
1410 | const char *name; |
1411 | PyMethodDef def; |
1412 | } methods[] = |
1413 | { |
1414 | { |
1415 | "strict", |
1416 | { |
1417 | "strict_errors", |
1418 | strict_errors, |
1419 | METH_O, |
1420 | PyDoc_STR("Implements the 'strict' error handling, which " |
1421 | "raises a UnicodeError on coding errors.") |
1422 | } |
1423 | }, |
1424 | { |
1425 | "ignore", |
1426 | { |
1427 | "ignore_errors", |
1428 | ignore_errors, |
1429 | METH_O, |
1430 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1431 | "ignores malformed data and continues.") |
1432 | } |
1433 | }, |
1434 | { |
1435 | "replace", |
1436 | { |
1437 | "replace_errors", |
1438 | replace_errors, |
1439 | METH_O, |
1440 | PyDoc_STR("Implements the 'replace' error handling, which " |
1441 | "replaces malformed data with a replacement marker.") |
1442 | } |
1443 | }, |
1444 | { |
1445 | "xmlcharrefreplace", |
1446 | { |
1447 | "xmlcharrefreplace_errors", |
1448 | xmlcharrefreplace_errors, |
1449 | METH_O, |
1450 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1451 | "which replaces an unencodable character with the " |
1452 | "appropriate XML character reference.") |
1453 | } |
1454 | }, |
1455 | { |
1456 | "backslashreplace", |
1457 | { |
1458 | "backslashreplace_errors", |
1459 | backslashreplace_errors, |
1460 | METH_O, |
1461 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1462 | "which replaces malformed data with a backslashed " |
1463 | "escape sequence.") |
1464 | } |
1465 | }, |
1466 | { |
1467 | "namereplace", |
1468 | { |
1469 | "namereplace_errors", |
1470 | namereplace_errors, |
1471 | METH_O, |
1472 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1473 | "which replaces an unencodable character with a " |
1474 | "\\N{...} escape sequence.") |
1475 | } |
1476 | }, |
1477 | { |
1478 | "surrogatepass", |
1479 | { |
1480 | "surrogatepass", |
1481 | surrogatepass_errors, |
1482 | METH_O |
1483 | } |
1484 | }, |
1485 | { |
1486 | "surrogateescape", |
1487 | { |
1488 | "surrogateescape", |
1489 | surrogateescape_errors, |
1490 | METH_O |
1491 | } |
1492 | } |
1493 | }; |
1494 | |
1495 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
1496 | PyObject *mod; |
1497 | |
1498 | if (interp->codec_search_path != NULL) Branch (1498:9): [True: 0, False: 278]
|
1499 | return 0; |
1500 | |
1501 | interp->codec_search_path = PyList_New(0); |
1502 | if (interp->codec_search_path == NULL) { Branch (1502:9): [True: 0, False: 278]
|
1503 | return -1; |
1504 | } |
1505 | |
1506 | interp->codec_search_cache = PyDict_New(); |
1507 | if (interp->codec_search_cache == NULL) { Branch (1507:9): [True: 0, False: 278]
|
1508 | return -1; |
1509 | } |
1510 | |
1511 | interp->codec_error_registry = PyDict_New(); |
1512 | if (interp->codec_error_registry == NULL) { Branch (1512:9): [True: 0, False: 278]
|
1513 | return -1; |
1514 | } |
1515 | |
1516 | for (size_t i = 0; 278 i < Py_ARRAY_LENGTH(methods); ++i2.22k ) { Branch (1516:24): [True: 2.22k, False: 278]
|
1517 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1518 | if (!func) { Branch (1518:13): [True: 0, False: 2.22k]
|
1519 | return -1; |
1520 | } |
1521 | |
1522 | int res = PyCodec_RegisterError(methods[i].name, func); |
1523 | Py_DECREF(func); |
1524 | if (res) { Branch (1524:13): [True: 0, False: 2.22k]
|
1525 | return -1; |
1526 | } |
1527 | } |
1528 | |
1529 | mod = PyImport_ImportModule("encodings"); |
1530 | if (mod == NULL) { Branch (1530:9): [True: 0, False: 278]
|
1531 | return -1; |
1532 | } |
1533 | Py_DECREF(mod); |
1534 | interp->codecs_initialized = 1; |
1535 | return 0; |
1536 | } |