/home/mdboom/Work/builds/cpython/Include/unicodeobject.h
Line | Count | Source |
1 | #ifndef Py_UNICODEOBJECT_H |
2 | #define Py_UNICODEOBJECT_H |
3 | |
4 | #include <stdarg.h> // va_list |
5 | |
6 | /* |
7 | |
8 | Unicode implementation based on original code by Fredrik Lundh, |
9 | modified by Marc-Andre Lemburg (mal@lemburg.com) according to the |
10 | Unicode Integration Proposal. (See |
11 | http://www.egenix.com/files/python/unicode-proposal.txt). |
12 | |
13 | Copyright (c) Corporation for National Research Initiatives. |
14 | |
15 | |
16 | Original header: |
17 | -------------------------------------------------------------------- |
18 | |
19 | * Yet another Unicode string type for Python. This type supports the |
20 | * 16-bit Basic Multilingual Plane (BMP) only. |
21 | * |
22 | * Written by Fredrik Lundh, January 1999. |
23 | * |
24 | * Copyright (c) 1999 by Secret Labs AB. |
25 | * Copyright (c) 1999 by Fredrik Lundh. |
26 | * |
27 | * fredrik@pythonware.com |
28 | * http://www.pythonware.com |
29 | * |
30 | * -------------------------------------------------------------------- |
31 | * This Unicode String Type is |
32 | * |
33 | * Copyright (c) 1999 by Secret Labs AB |
34 | * Copyright (c) 1999 by Fredrik Lundh |
35 | * |
36 | * By obtaining, using, and/or copying this software and/or its |
37 | * associated documentation, you agree that you have read, understood, |
38 | * and will comply with the following terms and conditions: |
39 | * |
40 | * Permission to use, copy, modify, and distribute this software and its |
41 | * associated documentation for any purpose and without fee is hereby |
42 | * granted, provided that the above copyright notice appears in all |
43 | * copies, and that both that copyright notice and this permission notice |
44 | * appear in supporting documentation, and that the name of Secret Labs |
45 | * AB or the author not be used in advertising or publicity pertaining to |
46 | * distribution of the software without specific, written prior |
47 | * permission. |
48 | * |
49 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
50 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
51 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
52 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
53 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
54 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
55 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
56 | * -------------------------------------------------------------------- */ |
57 | |
58 | #include <ctype.h> |
59 | |
60 | /* === Internal API ======================================================= */ |
61 | |
62 | /* --- Internal Unicode Format -------------------------------------------- */ |
63 | |
64 | /* Python 3.x requires unicode */ |
65 | #define Py_USING_UNICODE |
66 | |
67 | #ifndef SIZEOF_WCHAR_T |
68 | #error Must define SIZEOF_WCHAR_T |
69 | #endif |
70 | |
71 | #define Py_UNICODE_SIZE SIZEOF_WCHAR_T |
72 | |
73 | /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. |
74 | Otherwise, Unicode strings are stored as UCS-2 (with limited support |
75 | for UTF-16) */ |
76 | |
77 | #if Py_UNICODE_SIZE >= 4 |
78 | #define Py_UNICODE_WIDE |
79 | #endif |
80 | |
81 | /* Set these flags if the platform has "wchar.h" and the |
82 | wchar_t type is a 16-bit unsigned type */ |
83 | /* #define HAVE_WCHAR_H */ |
84 | /* #define HAVE_USABLE_WCHAR_T */ |
85 | |
86 | /* If the compiler provides a wchar_t type we try to support it |
87 | through the interface functions PyUnicode_FromWideChar(), |
88 | PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ |
89 | |
90 | #ifdef HAVE_USABLE_WCHAR_T |
91 | # ifndef HAVE_WCHAR_H |
92 | # define HAVE_WCHAR_H |
93 | # endif |
94 | #endif |
95 | |
96 | #ifdef HAVE_WCHAR_H |
97 | # include <wchar.h> |
98 | #endif |
99 | |
100 | /* Py_UCS4 and Py_UCS2 are typedefs for the respective |
101 | unicode representations. */ |
102 | typedef uint32_t Py_UCS4; |
103 | typedef uint16_t Py_UCS2; |
104 | typedef uint8_t Py_UCS1; |
105 | |
106 | #ifdef __cplusplus |
107 | extern "C" { |
108 | #endif |
109 | |
110 | |
111 | PyAPI_DATA(PyTypeObject) PyUnicode_Type; |
112 | PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; |
113 | |
114 | #define PyUnicode_Check(op) \ |
115 | PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) |
116 | #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type) |
117 | |
118 | /* --- Constants ---------------------------------------------------------- */ |
119 | |
120 | /* This Unicode character will be used as replacement character during |
121 | decoding if the errors argument is set to "replace". Note: the |
122 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in |
123 | Unicode 3.0. */ |
124 | |
125 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) |
126 | |
127 | /* === Public API ========================================================= */ |
128 | |
129 | /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ |
130 | PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( |
131 | const char *u, /* UTF-8 encoded string */ |
132 | Py_ssize_t size /* size of buffer */ |
133 | ); |
134 | |
135 | /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated |
136 | UTF-8 encoded bytes. The size is determined with strlen(). */ |
137 | PyAPI_FUNC(PyObject*) PyUnicode_FromString( |
138 | const char *u /* UTF-8 encoded string */ |
139 | ); |
140 | |
141 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
142 | PyAPI_FUNC(PyObject*) PyUnicode_Substring( |
143 | PyObject *str, |
144 | Py_ssize_t start, |
145 | Py_ssize_t end); |
146 | #endif |
147 | |
148 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
149 | /* Copy the string into a UCS4 buffer including the null character if copy_null |
150 | is set. Return NULL and raise an exception on error. Raise a SystemError if |
151 | the buffer is smaller than the string. Return buffer on success. |
152 | |
153 | buflen is the length of the buffer in (Py_UCS4) characters. */ |
154 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( |
155 | PyObject *unicode, /* string to copy */ |
156 | Py_UCS4* buffer, /* destination UCS4 buffer */ |
157 | Py_ssize_t buflen, /* length of buffer in Py_UCS4 characters */ |
158 | int copy_null); /* if set, also copy the null character */ |
159 | |
160 | /* Copy the string into a UCS4 buffer. A new buffer is allocated using |
161 | PyMem_Malloc; if this fails, NULL is returned with a memory error |
162 | exception set. */ |
163 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); |
164 | #endif |
165 | |
166 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
167 | /* Get the length of the Unicode object. */ |
168 | |
169 | PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( |
170 | PyObject *unicode |
171 | ); |
172 | #endif |
173 | |
174 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
175 | /* Read a character from the string. */ |
176 | |
177 | PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( |
178 | PyObject *unicode, |
179 | Py_ssize_t index |
180 | ); |
181 | |
182 | /* Write a character to the string. The string must have been created through |
183 | PyUnicode_New, must not be shared, and must not have been hashed yet. |
184 | |
185 | Return 0 on success, -1 on error. */ |
186 | |
187 | PyAPI_FUNC(int) PyUnicode_WriteChar( |
188 | PyObject *unicode, |
189 | Py_ssize_t index, |
190 | Py_UCS4 character |
191 | ); |
192 | #endif |
193 | |
194 | /* Resize a Unicode object. The length is the number of codepoints. |
195 | |
196 | *unicode is modified to point to the new (resized) object and 0 |
197 | returned on success. |
198 | |
199 | Try to resize the string in place (which is usually faster than allocating |
200 | a new string and copying characters), or create a new string. |
201 | |
202 | Error handling is implemented as follows: an exception is set, -1 |
203 | is returned and *unicode left untouched. |
204 | |
205 | WARNING: The function doesn't check string content, the result may not be a |
206 | string in canonical representation. */ |
207 | |
208 | PyAPI_FUNC(int) PyUnicode_Resize( |
209 | PyObject **unicode, /* Pointer to the Unicode object */ |
210 | Py_ssize_t length /* New length */ |
211 | ); |
212 | |
213 | /* Decode obj to a Unicode object. |
214 | |
215 | bytes, bytearray and other bytes-like objects are decoded according to the |
216 | given encoding and error handler. The encoding and error handler can be |
217 | NULL to have the interface use UTF-8 and "strict". |
218 | |
219 | All other objects (including Unicode objects) raise an exception. |
220 | |
221 | The API returns NULL in case of an error. The caller is responsible |
222 | for decref'ing the returned objects. |
223 | |
224 | */ |
225 | |
226 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( |
227 | PyObject *obj, /* Object */ |
228 | const char *encoding, /* encoding */ |
229 | const char *errors /* error handling */ |
230 | ); |
231 | |
232 | /* Copy an instance of a Unicode subtype to a new true Unicode object if |
233 | necessary. If obj is already a true Unicode object (not a subtype), return |
234 | the reference with *incremented* refcount. |
235 | |
236 | The API returns NULL in case of an error. The caller is responsible |
237 | for decref'ing the returned objects. |
238 | |
239 | */ |
240 | |
241 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject( |
242 | PyObject *obj /* Object */ |
243 | ); |
244 | |
245 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( |
246 | const char *format, /* ASCII-encoded string */ |
247 | va_list vargs |
248 | ); |
249 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( |
250 | const char *format, /* ASCII-encoded string */ |
251 | ... |
252 | ); |
253 | |
254 | PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); |
255 | PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( |
256 | const char *u /* UTF-8 encoded string */ |
257 | ); |
258 | |
259 | /* --- wchar_t support for platforms which support it --------------------- */ |
260 | |
261 | #ifdef HAVE_WCHAR_H |
262 | |
263 | /* Create a Unicode Object from the wchar_t buffer w of the given |
264 | size. |
265 | |
266 | The buffer is copied into the new object. */ |
267 | |
268 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( |
269 | const wchar_t *w, /* wchar_t buffer */ |
270 | Py_ssize_t size /* size of buffer */ |
271 | ); |
272 | |
273 | /* Copies the Unicode Object contents into the wchar_t buffer w. At |
274 | most size wchar_t characters are copied. |
275 | |
276 | Note that the resulting wchar_t string may or may not be |
277 | 0-terminated. It is the responsibility of the caller to make sure |
278 | that the wchar_t string is 0-terminated in case this is required by |
279 | the application. |
280 | |
281 | Returns the number of wchar_t characters copied (excluding a |
282 | possibly trailing 0-termination character) or -1 in case of an |
283 | error. */ |
284 | |
285 | PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( |
286 | PyObject *unicode, /* Unicode object */ |
287 | wchar_t *w, /* wchar_t buffer */ |
288 | Py_ssize_t size /* size of buffer */ |
289 | ); |
290 | |
291 | /* Convert the Unicode object to a wide character string. The output string |
292 | always ends with a nul character. If size is not NULL, write the number of |
293 | wide characters (excluding the null character) into *size. |
294 | |
295 | Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) |
296 | on success. On error, raises a MemoryError and returns NULL; *size is |
297 | undefined. */ |
298 | |
299 | PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( |
300 | PyObject *unicode, /* Unicode object */ |
301 | Py_ssize_t *size /* number of characters of the result */ |
302 | ); |
303 | |
304 | #endif |
305 | |
306 | /* --- Unicode ordinals --------------------------------------------------- */ |
307 | |
308 | /* Create a Unicode Object from the given Unicode code point ordinal. |
309 | |
310 | The ordinal must be in range(0x110000). A ValueError is |
311 | raised in case it is not. |
312 | |
313 | */ |
314 | |
315 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); |
316 | |
317 | /* === Builtin Codecs ===================================================== |
318 | |
319 | Many of these APIs take two arguments encoding and errors. These |
320 | parameters encoding and errors have the same semantics as the ones |
321 | of the builtin str() API. |
322 | |
323 | Setting encoding to NULL causes the default encoding (UTF-8) to be used. |
324 | |
325 | Error handling is set by errors which may also be set to NULL |
326 | meaning to use the default handling defined for the codec. Default |
327 | error handling for all builtin codecs is "strict" (ValueErrors are |
328 | raised). |
329 | |
330 | The codecs all use a similar interface. Only deviations from the |
331 | generic ones are documented. |
332 | |
333 | */ |
334 | |
335 | /* --- Manage the default encoding ---------------------------------------- */ |
336 | |
337 | /* Returns "utf-8". */ |
338 | PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); |
339 | |
340 | /* --- Generic Codecs ----------------------------------------------------- */ |
341 | |
342 | /* Create a Unicode object by decoding the encoded string s of the |
343 | given size. */ |
344 | |
345 | PyAPI_FUNC(PyObject*) PyUnicode_Decode( |
346 | const char *s, /* encoded string */ |
347 | Py_ssize_t size, /* size of buffer */ |
348 | const char *encoding, /* encoding */ |
349 | const char *errors /* error handling */ |
350 | ); |
351 | |
352 | /* Decode a Unicode object unicode and return the result as Python |
353 | object. |
354 | |
355 | This API is DEPRECATED. The only supported standard encoding is rot13. |
356 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
357 | that decode from str. */ |
358 | |
359 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( |
360 | PyObject *unicode, /* Unicode object */ |
361 | const char *encoding, /* encoding */ |
362 | const char *errors /* error handling */ |
363 | ); |
364 | |
365 | /* Decode a Unicode object unicode and return the result as Unicode |
366 | object. |
367 | |
368 | This API is DEPRECATED. The only supported standard encoding is rot13. |
369 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
370 | that decode from str to str. */ |
371 | |
372 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( |
373 | PyObject *unicode, /* Unicode object */ |
374 | const char *encoding, /* encoding */ |
375 | const char *errors /* error handling */ |
376 | ); |
377 | |
378 | /* Encodes a Unicode object and returns the result as Python |
379 | object. |
380 | |
381 | This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() |
382 | since all standard encodings (except rot13) encode str to bytes. |
383 | Use PyCodec_Encode() for encoding with rot13 and non-standard codecs |
384 | that encode from str to non-bytes. */ |
385 | |
386 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( |
387 | PyObject *unicode, /* Unicode object */ |
388 | const char *encoding, /* encoding */ |
389 | const char *errors /* error handling */ |
390 | ); |
391 | |
392 | /* Encodes a Unicode object and returns the result as Python string |
393 | object. */ |
394 | |
395 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( |
396 | PyObject *unicode, /* Unicode object */ |
397 | const char *encoding, /* encoding */ |
398 | const char *errors /* error handling */ |
399 | ); |
400 | |
401 | /* Encodes a Unicode object and returns the result as Unicode |
402 | object. |
403 | |
404 | This API is DEPRECATED. The only supported standard encoding is rot13. |
405 | Use PyCodec_Encode() to encode with rot13 and non-standard codecs |
406 | that encode from str to str. */ |
407 | |
408 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( |
409 | PyObject *unicode, /* Unicode object */ |
410 | const char *encoding, /* encoding */ |
411 | const char *errors /* error handling */ |
412 | ); |
413 | |
414 | /* Build an encoding map. */ |
415 | |
416 | PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( |
417 | PyObject* string /* 256 character map */ |
418 | ); |
419 | |
420 | /* --- UTF-7 Codecs ------------------------------------------------------- */ |
421 | |
422 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( |
423 | const char *string, /* UTF-7 encoded string */ |
424 | Py_ssize_t length, /* size of string */ |
425 | const char *errors /* error handling */ |
426 | ); |
427 | |
428 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( |
429 | const char *string, /* UTF-7 encoded string */ |
430 | Py_ssize_t length, /* size of string */ |
431 | const char *errors, /* error handling */ |
432 | Py_ssize_t *consumed /* bytes consumed */ |
433 | ); |
434 | |
435 | /* --- UTF-8 Codecs ------------------------------------------------------- */ |
436 | |
437 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( |
438 | const char *string, /* UTF-8 encoded string */ |
439 | Py_ssize_t length, /* size of string */ |
440 | const char *errors /* error handling */ |
441 | ); |
442 | |
443 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( |
444 | const char *string, /* UTF-8 encoded string */ |
445 | Py_ssize_t length, /* size of string */ |
446 | const char *errors, /* error handling */ |
447 | Py_ssize_t *consumed /* bytes consumed */ |
448 | ); |
449 | |
450 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( |
451 | PyObject *unicode /* Unicode object */ |
452 | ); |
453 | |
454 | /* Returns a pointer to the default encoding (UTF-8) of the |
455 | Unicode object unicode and the size of the encoded representation |
456 | in bytes stored in *size. |
457 | |
458 | In case of an error, no *size is set. |
459 | |
460 | This function caches the UTF-8 encoded string in the unicodeobject |
461 | and subsequent calls will return the same string. The memory is released |
462 | when the unicodeobject is deallocated. |
463 | */ |
464 | |
465 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 |
466 | PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( |
467 | PyObject *unicode, /* Unicode object */ |
468 | Py_ssize_t *size); /* receives the encoded size in bytes; not set on error */ |
469 | #endif |
470 | |
471 | /* --- UTF-32 Codecs ------------------------------------------------------ */ |
472 | |
473 | /* Decodes length bytes from a UTF-32 encoded buffer string and returns |
474 | the corresponding Unicode object. |
475 | |
476 | errors (if non-NULL) defines the error handling. It defaults |
477 | to "strict". |
478 | |
479 | If byteorder is non-NULL, the decoder starts decoding using the |
480 | given byte order: |
481 | |
482 | *byteorder == -1: little endian |
483 | *byteorder == 0: native order |
484 | *byteorder == 1: big endian |
485 | |
486 | In native mode, the first four bytes of the stream are checked for a |
487 | BOM mark. If found, the BOM mark is analysed, the byte order |
488 | adjusted and the BOM skipped. In the other modes, no BOM mark |
489 | interpretation is done. After completion, *byteorder is set to the |
490 | current byte order at the end of input data. |
491 | |
492 | If byteorder is NULL, the codec starts in native order mode. |
493 | |
494 | */ |
495 | |
496 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( |
497 | const char *string, /* UTF-32 encoded string */ |
498 | Py_ssize_t length, /* size of string */ |
499 | const char *errors, /* error handling */ |
500 | int *byteorder /* pointer to byteorder to use |
501 | 0=native;-1=LE,1=BE; updated on |
502 | exit */ |
503 | ); |
504 | |
505 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( |
506 | const char *string, /* UTF-32 encoded string */ |
507 | Py_ssize_t length, /* size of string */ |
508 | const char *errors, /* error handling */ |
509 | int *byteorder, /* pointer to byteorder to use |
510 | 0=native;-1=LE,1=BE; updated on |
511 | exit */ |
512 | Py_ssize_t *consumed /* bytes consumed */ |
513 | ); |
514 | |
515 | /* Returns a Python string using the UTF-32 encoding in native byte |
516 | order. The string always starts with a BOM mark. */ |
517 | |
518 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( |
519 | PyObject *unicode /* Unicode object */ |
520 | ); |
521 | |
522 | /* Returns a Python string object holding the UTF-32 encoded value of |
523 | the Unicode data. |
524 | |
525 | If byteorder is not 0, output is written according to the following |
526 | byte order: |
527 | |
528 | byteorder == -1: little endian |
529 | byteorder == 0: native byte order (writes a BOM mark) |
530 | byteorder == 1: big endian |
531 | |
532 | If byteorder is 0, the output string will always start with the |
533 | Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is |
534 | prepended. |
535 | |
536 | */ |
537 | |
538 | /* --- UTF-16 Codecs ------------------------------------------------------ */ |
539 | |
540 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns |
541 | the corresponding Unicode object. |
542 | |
543 | errors (if non-NULL) defines the error handling. It defaults |
544 | to "strict". |
545 | |
546 | If byteorder is non-NULL, the decoder starts decoding using the |
547 | given byte order: |
548 | |
549 | *byteorder == -1: little endian |
550 | *byteorder == 0: native order |
551 | *byteorder == 1: big endian |
552 | |
553 | In native mode, the first two bytes of the stream are checked for a |
554 | BOM mark. If found, the BOM mark is analysed, the byte order |
555 | adjusted and the BOM skipped. In the other modes, no BOM mark |
556 | interpretation is done. After completion, *byteorder is set to the |
557 | current byte order at the end of input data. |
558 | |
559 | If byteorder is NULL, the codec starts in native order mode. |
560 | |
561 | */ |
562 | |
563 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( |
564 | const char *string, /* UTF-16 encoded string */ |
565 | Py_ssize_t length, /* size of string */ |
566 | const char *errors, /* error handling */ |
567 | int *byteorder /* pointer to byteorder to use |
568 | 0=native;-1=LE,1=BE; updated on |
569 | exit */ |
570 | ); |
571 | |
572 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( |
573 | const char *string, /* UTF-16 encoded string */ |
574 | Py_ssize_t length, /* size of string */ |
575 | const char *errors, /* error handling */ |
576 | int *byteorder, /* pointer to byteorder to use |
577 | 0=native;-1=LE,1=BE; updated on |
578 | exit */ |
579 | Py_ssize_t *consumed /* bytes consumed */ |
580 | ); |
581 | |
582 | /* Returns a Python string using the UTF-16 encoding in native byte |
583 | order. The string always starts with a BOM mark. */ |
584 | |
585 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( |
586 | PyObject *unicode /* Unicode object */ |
587 | ); |
588 | |
589 | /* --- Unicode-Escape Codecs ---------------------------------------------- */ |
590 | |
591 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( |
592 | const char *string, /* Unicode-Escape encoded string */ |
593 | Py_ssize_t length, /* size of string */ |
594 | const char *errors /* error handling */ |
595 | ); |
596 | |
597 | PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( |
598 | PyObject *unicode /* Unicode object */ |
599 | ); |
600 | |
601 | /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ |
602 | |
603 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( |
604 | const char *string, /* Raw-Unicode-Escape encoded string */ |
605 | Py_ssize_t length, /* size of string */ |
606 | const char *errors /* error handling */ |
607 | ); |
608 | |
609 | PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( |
610 | PyObject *unicode /* Unicode object */ |
611 | ); |
612 | |
613 | /* --- Latin-1 Codecs ----------------------------------------------------- |
614 | |
615 | Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ |
616 | |
617 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( |
618 | const char *string, /* Latin-1 encoded string */ |
619 | Py_ssize_t length, /* size of string */ |
620 | const char *errors /* error handling */ |
621 | ); |
622 | |
623 | PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( |
624 | PyObject *unicode /* Unicode object */ |
625 | ); |
626 | |
627 | /* --- ASCII Codecs ------------------------------------------------------- |
628 | |
629 | Only 7-bit ASCII data is accepted. All other codes generate errors. |
630 | |
631 | */ |
632 | |
633 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( |
634 | const char *string, /* ASCII encoded string */ |
635 | Py_ssize_t length, /* size of string */ |
636 | const char *errors /* error handling */ |
637 | ); |
638 | |
639 | PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( |
640 | PyObject *unicode /* Unicode object */ |
641 | ); |
642 | |
643 | /* --- Character Map Codecs ----------------------------------------------- |
644 | |
645 | This codec uses mappings to encode and decode characters. |
646 | |
647 | Decoding mappings must map byte ordinals (integers in the range from 0 to |
648 | 255) to Unicode strings, integers (which are then interpreted as Unicode |
649 | ordinals) or None. Unmapped data bytes (ones which cause a LookupError) |
650 | as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined |
651 | mapping" and cause an error. |
652 | |
653 | Encoding mappings must map Unicode ordinal integers to bytes objects, |
654 | integers in the range from 0 to 255 or None. Unmapped character |
655 | ordinals (ones which cause a LookupError) as well as mapped to |
656 | None are treated as "undefined mapping" and cause an error. |
657 | |
658 | */ |
659 | |
660 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( |
661 | const char *string, /* Encoded string */ |
662 | Py_ssize_t length, /* size of string */ |
663 | PyObject *mapping, /* decoding mapping */ |
664 | const char *errors /* error handling */ |
665 | ); |
666 | |
667 | PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( |
668 | PyObject *unicode, /* Unicode object */ |
669 | PyObject *mapping /* encoding mapping */ |
670 | ); |
671 | |
672 | /* --- MBCS codecs for Windows -------------------------------------------- */ |
673 | |
674 | #ifdef MS_WINDOWS |
675 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( |
676 | const char *string, /* MBCS encoded string */ |
677 | Py_ssize_t length, /* size of string */ |
678 | const char *errors /* error handling */ |
679 | ); |
680 | |
681 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( |
682 | const char *string, /* MBCS encoded string */ |
683 | Py_ssize_t length, /* size of string */ |
684 | const char *errors, /* error handling */ |
685 | Py_ssize_t *consumed /* bytes consumed */ |
686 | ); |
687 | |
688 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
689 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( |
690 | int code_page, /* code page number */ |
691 | const char *string, /* encoded string */ |
692 | Py_ssize_t length, /* size of string */ |
693 | const char *errors, /* error handling */ |
694 | Py_ssize_t *consumed /* bytes consumed */ |
695 | ); |
696 | #endif |
697 | |
698 | PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( |
699 | PyObject *unicode /* Unicode object */ |
700 | ); |
701 | |
702 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
703 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( |
704 | int code_page, /* code page number */ |
705 | PyObject *unicode, /* Unicode object */ |
706 | const char *errors /* error handling */ |
707 | ); |
708 | #endif |
709 | |
710 | #endif /* MS_WINDOWS */ |
711 | |
712 | /* --- Locale encoding --------------------------------------------------- */ |
713 | |
714 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
715 | /* Decode a string from the current locale encoding. The decoder is strict if |
716 | *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' |
717 | error handler (PEP 383) to escape undecodable bytes. If a byte sequence can |
718 | be decoded as a surrogate character and *surrogateescape* is not equal to |
719 | zero, the byte sequence is escaped using the 'surrogateescape' error handler |
720 | instead of being decoded. *str* must end with a null character but cannot |
721 | contain embedded null characters. */ |
722 | |
723 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( |
724 | const char *str, |
725 | Py_ssize_t len, |
726 | const char *errors); |
727 | |
728 | /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string |
729 | length using strlen(). */ |
730 | |
731 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( |
732 | const char *str, |
733 | const char *errors); |
734 | |
735 | /* Encode a Unicode object to the current locale encoding. The encoder is |
736 | strict if *surrogateescape* is equal to zero, otherwise the |
737 | "surrogateescape" error handler is used. Return a bytes object. The string |
738 | cannot contain embedded null characters. */ |
739 | |
740 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( |
741 | PyObject *unicode, |
742 | const char *errors |
743 | ); |
744 | #endif |
745 | |
746 | /* --- File system encoding ---------------------------------------------- */ |
747 | |
748 | /* ParseTuple converter: encode str objects to bytes using |
749 | PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ |
750 | |
751 | PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); |
752 | |
753 | /* ParseTuple converter: decode bytes objects to unicode using |
754 | PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ |
755 | |
756 | PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); |
757 | |
758 | /* Decode a null-terminated string from the Python filesystem encoding |
759 | and error handler. |
760 | |
761 | If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */ |
762 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( |
763 | const char *s /* encoded string */ |
764 | ); |
765 | |
766 | /* Decode a string from the Python filesystem encoding and error handler. */ |
767 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( |
768 | const char *s, /* encoded string */ |
769 | Py_ssize_t size /* size */ |
770 | ); |
771 | |
772 | /* Encode a Unicode object to the Python filesystem encoding and error handler. |
773 | Return bytes. */ |
774 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( |
775 | PyObject *unicode |
776 | ); |
777 | |
778 | /* --- Methods & Slots ---------------------------------------------------- |
779 | |
780 | These are capable of handling Unicode objects and strings on input |
781 | (we refer to them as strings in the descriptions) and return |
782 | Unicode objects or integers as appropriate. */ |
783 | |
784 | /* Concat two strings giving a new Unicode string. */ |
785 | |
786 | PyAPI_FUNC(PyObject*) PyUnicode_Concat( |
787 | PyObject *left, /* Left string */ |
788 | PyObject *right /* Right string */ |
789 | ); |
790 | |
791 | /* Concat two strings and put the result in *pleft |
792 | (sets *pleft to NULL on error) */ |
793 | |
794 | PyAPI_FUNC(void) PyUnicode_Append( |
795 | PyObject **pleft, /* Pointer to left string */ |
796 | PyObject *right /* Right string */ |
797 | ); |
798 | |
799 | /* Concat two strings, put the result in *pleft and drop the right object |
800 | (sets *pleft to NULL on error) */ |
801 | |
802 | PyAPI_FUNC(void) PyUnicode_AppendAndDel( |
803 | PyObject **pleft, /* Pointer to left string */ |
804 | PyObject *right /* Right string */ |
805 | ); |
806 | |
807 | /* Split a string giving a list of Unicode strings. |
808 | |
809 | If sep is NULL, splitting will be done at all whitespace |
810 | substrings. Otherwise, splits occur at the given separator. |
811 | |
812 | At most maxsplit splits will be done. If negative, no limit is set. |
813 | |
814 | Separators are not included in the resulting list. |
815 | |
816 | */ |
817 | |
818 | PyAPI_FUNC(PyObject*) PyUnicode_Split( |
819 | PyObject *s, /* String to split */ |
820 | PyObject *sep, /* String separator */ |
821 | Py_ssize_t maxsplit /* Maxsplit count */ |
822 | ); |
823 | |
824 | /* Ditto, but split at line breaks. |
825 | |
826 | CRLF is considered to be one line break. Line breaks are not |
827 | included in the resulting list unless keepends is true. */ |
828 | |
829 | PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( |
830 | PyObject *s, /* String to split */ |
831 | int keepends /* If true, line end markers are included */ |
832 | ); |
833 | |
834 | /* Partition a string using a given separator. */ |
835 | |
836 | PyAPI_FUNC(PyObject*) PyUnicode_Partition( |
837 | PyObject *s, /* String to partition */ |
838 | PyObject *sep /* String separator */ |
839 | ); |
840 | |
841 | /* Partition a string using a given separator, searching from the end of the |
842 | string. */ |
843 | |
844 | PyAPI_FUNC(PyObject*) PyUnicode_RPartition( |
845 | PyObject *s, /* String to partition */ |
846 | PyObject *sep /* String separator */ |
847 | ); |
848 | |
849 | /* Split a string giving a list of Unicode strings. |
850 | |
851 | If sep is NULL, splitting will be done at all whitespace |
852 | substrings. Otherwise, splits occur at the given separator. |
853 | |
854 | At most maxsplit splits will be done; if maxsplit is negative, no |
855 | limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from |
856 | the end of the string. |
857 | |
858 | Separators are not included in the resulting list. |
859 | |
860 | */ |
861 | |
862 | PyAPI_FUNC(PyObject*) PyUnicode_RSplit( |
863 | PyObject *s, /* String to split */ |
864 | PyObject *sep, /* String separator */ |
865 | Py_ssize_t maxsplit /* Maxsplit count */ |
866 | ); |
867 | |
868 | /* Translate a string by applying a character mapping table to it and |
869 | return the resulting Unicode object. |
870 | |
871 | The mapping table must map Unicode ordinal integers to Unicode strings, |
872 | Unicode ordinal integers or None (causing deletion of the character). |
873 | |
874 | Mapping tables may be dictionaries or sequences. Unmapped character |
875 | ordinals (ones which cause a LookupError) are left untouched and |
876 | are copied as-is. |
877 | |
878 | */ |
879 | |
880 | PyAPI_FUNC(PyObject *) PyUnicode_Translate( |
881 | PyObject *str, /* String */ |
882 | PyObject *table, /* Translate table */ |
883 | const char *errors /* error handling */ |
884 | ); |
885 | |
886 | /* Join a sequence of strings using the given separator and return |
887 | the resulting Unicode string. */ |
888 | |
889 | PyAPI_FUNC(PyObject*) PyUnicode_Join( |
890 | PyObject *separator, /* Separator string */ |
891 | PyObject *seq /* Sequence object */ |
892 | ); |
893 | |
894 | /* Return 1 if substr matches str[start:end] at the given tail end, 0 |
895 | otherwise. Return -1 if an error occurred. */ |
896 | |
897 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( |
898 | PyObject *str, /* String */ |
899 | PyObject *substr, /* Prefix or Suffix string */ |
900 | Py_ssize_t start, /* Start index */ |
901 | Py_ssize_t end, /* Stop index */ |
902 | int direction /* Tail end: -1 prefix, +1 suffix */ |
903 | ); |
904 | |
905 | /* Return the first position of substr in str[start:end] using the |
906 | given search direction or -1 if not found. -2 is returned in case |
907 | an error occurred and an exception is set. */ |
908 | |
909 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( |
910 | PyObject *str, /* String */ |
911 | PyObject *substr, /* Substring to find */ |
912 | Py_ssize_t start, /* Start index */ |
913 | Py_ssize_t end, /* Stop index */ |
914 | int direction /* Find direction: +1 forward, -1 backward */ |
915 | ); |
916 | |
917 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
918 | /* Like PyUnicode_Find, but search for a single character only. */ |
919 | PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( |
920 | PyObject *str, /* String */ |
921 | Py_UCS4 ch, /* Character to find */ |
922 | Py_ssize_t start, /* Start index */ |
923 | Py_ssize_t end, /* Stop index */ |
924 | int direction /* Find direction, as for PyUnicode_Find */ |
925 | ); |
926 | #endif |
927 | |
928 | /* Count the number of occurrences of substr in str[start:end]. */ |
929 | |
930 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( |
931 | PyObject *str, /* String */ |
932 | PyObject *substr, /* Substring to count */ |
933 | Py_ssize_t start, /* Start index */ |
934 | Py_ssize_t end /* Stop index */ |
935 | ); |
936 | |
937 | /* Replace at most maxcount occurrences of substr in str with replstr |
938 | and return the resulting Unicode object. */ |
939 | |
940 | PyAPI_FUNC(PyObject *) PyUnicode_Replace( |
941 | PyObject *str, /* String */ |
942 | PyObject *substr, /* Substring to find */ |
943 | PyObject *replstr, /* Replacement string */ |
944 | Py_ssize_t maxcount /* Max. number of replacements to apply; |
945 | -1 = all */ |
946 | ); |
947 | |
948 | /* Compare two strings and return -1, 0, 1 for less than, equal, |
949 | greater than resp. |
950 | Raise an exception and return -1 on error. */ |
951 | |
952 | PyAPI_FUNC(int) PyUnicode_Compare( |
953 | PyObject *left, /* Left string */ |
954 | PyObject *right /* Right string */ |
955 | ); |
956 | |
957 | /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, |
958 | equal, and greater than, respectively. It is best to pass only |
959 | ASCII-encoded strings, but the function interprets the input string as |
960 | ISO-8859-1 if it contains non-ASCII characters. |
961 | This function does not raise exceptions. */ |
962 | |
963 | PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( |
964 | PyObject *left, /* Unicode object */ |
965 | const char *right /* C string, preferably ASCII-encoded */ |
966 | ); |
967 | |
968 | /* Rich compare two strings and return one of the following: |
969 | |
970 | - NULL in case an exception was raised |
971 | - Py_True or Py_False for successful comparisons |
972 | - Py_NotImplemented in case the type combination is unknown |
973 | |
974 | Possible values for op: |
975 | |
976 | Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE |
977 | |
978 | */ |
979 | |
980 | PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( |
981 | PyObject *left, /* Left string */ |
982 | PyObject *right, /* Right string */ |
983 | int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ |
984 | ); |
985 | |
986 | /* Apply an argument tuple or dictionary to a format string and return |
987 | the resulting Unicode string. */ |
988 | |
989 | PyAPI_FUNC(PyObject *) PyUnicode_Format( |
990 | PyObject *format, /* Format string */ |
991 | PyObject *args /* Argument tuple or dictionary */ |
992 | ); |
993 | |
994 | /* Check whether element is contained in container and return 1/0 |
995 | accordingly. |
996 | |
997 | element has to coerce to a one element Unicode string. -1 is |
998 | returned in case of an error. */ |
999 | |
1000 | PyAPI_FUNC(int) PyUnicode_Contains( |
1001 | PyObject *container, /* Container string */ |
1002 | PyObject *element /* Element string */ |
1003 | ); |
1004 | |
1005 | /* Check whether s is a valid identifier. */ |
1006 | |
1007 | PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); |
1008 | |
1009 | /* === Characters Type APIs =============================================== */ |
1010 | |
1011 | #ifndef Py_LIMITED_API |
1012 | # define Py_CPYTHON_UNICODEOBJECT_H |
1013 | # include "cpython/unicodeobject.h" |
1014 | # undef Py_CPYTHON_UNICODEOBJECT_H |
1015 | #endif |
1016 | |
1017 | #ifdef __cplusplus |
1018 | } |
1019 | #endif |
1020 | #endif /* !Py_UNICODEOBJECT_H */ |