Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Include/unicodeobject.h
Line
Count
Source
1
#ifndef Py_UNICODEOBJECT_H
2
#define Py_UNICODEOBJECT_H
3
4
#include <stdarg.h>               // va_list
5
6
/*
7
8
Unicode implementation based on original code by Fredrik Lundh,
9
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10
Unicode Integration Proposal. (See
11
http://www.egenix.com/files/python/unicode-proposal.txt).
12
13
Copyright (c) Corporation for National Research Initiatives.
14
15
16
 Original header:
17
 --------------------------------------------------------------------
18
19
 * Yet another Unicode string type for Python.  This type supports the
20
 * 16-bit Basic Multilingual Plane (BMP) only.
21
 *
22
 * Written by Fredrik Lundh, January 1999.
23
 *
24
 * Copyright (c) 1999 by Secret Labs AB.
25
 * Copyright (c) 1999 by Fredrik Lundh.
26
 *
27
 * fredrik@pythonware.com
28
 * http://www.pythonware.com
29
 *
30
 * --------------------------------------------------------------------
31
 * This Unicode String Type is
32
 *
33
 * Copyright (c) 1999 by Secret Labs AB
34
 * Copyright (c) 1999 by Fredrik Lundh
35
 *
36
 * By obtaining, using, and/or copying this software and/or its
37
 * associated documentation, you agree that you have read, understood,
38
 * and will comply with the following terms and conditions:
39
 *
40
 * Permission to use, copy, modify, and distribute this software and its
41
 * associated documentation for any purpose and without fee is hereby
42
 * granted, provided that the above copyright notice appears in all
43
 * copies, and that both that copyright notice and this permission notice
44
 * appear in supporting documentation, and that the name of Secret Labs
45
 * AB or the author not be used in advertising or publicity pertaining to
46
 * distribution of the software without specific, written prior
47
 * permission.
48
 *
49
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56
 * -------------------------------------------------------------------- */
57
58
#include <ctype.h>
59
60
/* === Internal API ======================================================= */
61
62
/* --- Internal Unicode Format -------------------------------------------- */
63
64
/* Python 3.x requires unicode */
65
#define Py_USING_UNICODE
66
67
#ifndef SIZEOF_WCHAR_T
68
#error Must define SIZEOF_WCHAR_T
69
#endif
70
71
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73
/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75
   for UTF-16) */
76
77
#if Py_UNICODE_SIZE >= 4
78
#define Py_UNICODE_WIDE
79
#endif
80
81
/* Set these flags if the platform has "wchar.h" and the
82
   wchar_t type is a 16-bit unsigned type */
83
/* #define HAVE_WCHAR_H */
84
/* #define HAVE_USABLE_WCHAR_T */
85
86
/* If the compiler provides a wchar_t type we try to support it
87
   through the interface functions PyUnicode_FromWideChar(),
88
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89
90
#ifdef HAVE_USABLE_WCHAR_T
91
# ifndef HAVE_WCHAR_H
92
#  define HAVE_WCHAR_H
93
# endif
94
#endif
95
96
#ifdef HAVE_WCHAR_H
97
#  include <wchar.h>
98
#endif
99
100
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
101
   unicode representations. */
102
typedef uint32_t Py_UCS4;
103
typedef uint16_t Py_UCS2;
104
typedef uint8_t Py_UCS1;
105
106
#ifdef __cplusplus
107
extern "C" {
108
#endif
109
110
111
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112
PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
114
#define PyUnicode_Check(op) \
115
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
116
#define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type)
117
118
/* --- Constants ---------------------------------------------------------- */
119
120
/* This Unicode character will be used as replacement character during
121
   decoding if the errors argument is set to "replace". Note: the
122
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123
   Unicode 3.0. */
124
125
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126
127
/* === Public API ========================================================= */
128
129
/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130
PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131
    const char *u,             /* UTF-8 encoded string */
132
    Py_ssize_t size            /* size of buffer */
133
    );
134
135
/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136
   UTF-8 encoded bytes.  The size is determined with strlen(). */
137
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138
    const char *u              /* UTF-8 encoded string */
139
    );
140
141
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
142
PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143
    PyObject *str,
144
    Py_ssize_t start,
145
    Py_ssize_t end);
146
#endif
147
148
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149
/* Copy the string into a UCS4 buffer including the null character if copy_null
150
   is set. Return NULL and raise an exception on error. Raise a SystemError if
151
   the buffer is smaller than the string. Return buffer on success.
152
153
   buflen is the length of the buffer in (Py_UCS4) characters. */
154
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155
    PyObject *unicode,
156
    Py_UCS4* buffer,
157
    Py_ssize_t buflen,
158
    int copy_null);
159
160
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161
   PyMem_Malloc; if this fails, NULL is returned with a memory error
162
   exception set. */
163
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164
#endif
165
166
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167
/* Get the length of the Unicode object. */
168
169
PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170
    PyObject *unicode
171
);
172
#endif
173
174
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
175
/* Read a character from the string. */
176
177
PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
178
    PyObject *unicode,
179
    Py_ssize_t index
180
    );
181
182
/* Write a character to the string. The string must have been created through
183
   PyUnicode_New, must not be shared, and must not have been hashed yet.
184
185
   Return 0 on success, -1 on error. */
186
187
PyAPI_FUNC(int) PyUnicode_WriteChar(
188
    PyObject *unicode,
189
    Py_ssize_t index,
190
    Py_UCS4 character
191
    );
192
#endif
193
194
/* Resize a Unicode object. The length is the number of codepoints.
195
196
   *unicode is modified to point to the new (resized) object and 0
197
   returned on success.
198
199
   Try to resize the string in place (which is usually faster than allocating
200
   a new string and copy characters), or create a new string.
201
202
   Error handling is implemented as follows: an exception is set, -1
203
   is returned and *unicode left untouched.
204
205
   WARNING: The function doesn't check string content, the result may not be a
206
            string in canonical representation. */
207
208
PyAPI_FUNC(int) PyUnicode_Resize(
209
    PyObject **unicode,         /* Pointer to the Unicode object */
210
    Py_ssize_t length           /* New length */
211
    );
212
213
/* Decode obj to a Unicode object.
214
215
   bytes, bytearray and other bytes-like objects are decoded according to the
216
   given encoding and error handler. The encoding and error handler can be
217
   NULL to have the interface use UTF-8 and "strict".
218
219
   All other objects (including Unicode objects) raise an exception.
220
221
   The API returns NULL in case of an error. The caller is responsible
222
   for decref'ing the returned objects.
223
224
*/
225
226
PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
227
    PyObject *obj,              /* Object */
228
    const char *encoding,       /* encoding */
229
    const char *errors          /* error handling */
230
    );
231
232
/* Copy an instance of a Unicode subtype to a new true Unicode object if
233
   necessary. If obj is already a true Unicode object (not a subtype), return
234
   the reference with *incremented* refcount.
235
236
   The API returns NULL in case of an error. The caller is responsible
237
   for decref'ing the returned objects.
238
239
*/
240
241
PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
242
    PyObject *obj      /* Object */
243
    );
244
245
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
246
    const char *format,   /* ASCII-encoded string  */
247
    va_list vargs
248
    );
249
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
250
    const char *format,   /* ASCII-encoded string  */
251
    ...
252
    );
253
254
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
255
PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
256
    const char *u              /* UTF-8 encoded string */
257
    );
258
259
/* --- wchar_t support for platforms which support it --------------------- */
260
261
#ifdef HAVE_WCHAR_H
262
263
/* Create a Unicode Object from the wchar_t buffer w of the given
264
   size.
265
266
   The buffer is copied into the new object. */
267
268
PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
269
    const wchar_t *w,           /* wchar_t buffer */
270
    Py_ssize_t size             /* size of buffer */
271
    );
272
273
/* Copies the Unicode Object contents into the wchar_t buffer w.  At
274
   most size wchar_t characters are copied.
275
276
   Note that the resulting wchar_t string may or may not be
277
   0-terminated.  It is the responsibility of the caller to make sure
278
   that the wchar_t string is 0-terminated in case this is required by
279
   the application.
280
281
   Returns the number of wchar_t characters copied (excluding a
282
   possibly trailing 0-termination character) or -1 in case of an
283
   error. */
284
285
PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
286
    PyObject *unicode,          /* Unicode object */
287
    wchar_t *w,                 /* wchar_t buffer */
288
    Py_ssize_t size             /* size of buffer */
289
    );
290
291
/* Convert the Unicode object to a wide character string. The output string
292
   always ends with a nul character. If size is not NULL, write the number of
293
   wide characters (excluding the null character) into *size.
294
295
   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
296
   on success. On error, returns NULL, *size is undefined and raises a
297
   MemoryError. */
298
299
PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
300
    PyObject *unicode,          /* Unicode object */
301
    Py_ssize_t *size            /* number of characters of the result */
302
    );
303
304
#endif
305
306
/* --- Unicode ordinals --------------------------------------------------- */
307
308
/* Create a Unicode Object from the given Unicode code point ordinal.
309
310
   The ordinal must be in range(0x110000). A ValueError is
311
   raised in case it is not.
312
313
*/
314
315
PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
316
317
/* === Builtin Codecs =====================================================
318
319
   Many of these APIs take two arguments encoding and errors. These
320
   parameters encoding and errors have the same semantics as the ones
321
   of the builtin str() API.
322
323
   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
324
325
   Error handling is set by errors which may also be set to NULL
326
   meaning to use the default handling defined for the codec. Default
327
   error handling for all builtin codecs is "strict" (ValueErrors are
328
   raised).
329
330
   The codecs all use a similar interface. Only deviations from the
331
   generic ones are documented.
332
333
*/
334
335
/* --- Manage the default encoding ---------------------------------------- */
336
337
/* Returns "utf-8".  */
338
PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
339
340
/* --- Generic Codecs ----------------------------------------------------- */
341
342
/* Create a Unicode object by decoding the encoded string s of the
343
   given size. */
344
345
PyAPI_FUNC(PyObject*) PyUnicode_Decode(
346
    const char *s,              /* encoded string */
347
    Py_ssize_t size,            /* size of buffer */
348
    const char *encoding,       /* encoding */
349
    const char *errors          /* error handling */
350
    );
351
352
/* Decode a Unicode object unicode and return the result as Python
353
   object.
354
355
   This API is DEPRECATED. The only supported standard encoding is rot13.
356
   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
357
   that decode from str. */
358
359
Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
360
    PyObject *unicode,          /* Unicode object */
361
    const char *encoding,       /* encoding */
362
    const char *errors          /* error handling */
363
    );
364
365
/* Decode a Unicode object unicode and return the result as Unicode
366
   object.
367
368
   This API is DEPRECATED. The only supported standard encoding is rot13.
369
   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
370
   that decode from str to str. */
371
372
Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
373
    PyObject *unicode,          /* Unicode object */
374
    const char *encoding,       /* encoding */
375
    const char *errors          /* error handling */
376
    );
377
378
/* Encodes a Unicode object and returns the result as Python
379
   object.
380
381
   This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
382
   since all standard encodings (except rot13) encode str to bytes.
383
   Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
384
   that encode from str to non-bytes. */
385
386
Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
387
    PyObject *unicode,          /* Unicode object */
388
    const char *encoding,       /* encoding */
389
    const char *errors          /* error handling */
390
    );
391
392
/* Encodes a Unicode object and returns the result as Python string
393
   object. */
394
395
PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
396
    PyObject *unicode,          /* Unicode object */
397
    const char *encoding,       /* encoding */
398
    const char *errors          /* error handling */
399
    );
400
401
/* Encodes a Unicode object and returns the result as Unicode
402
   object.
403
404
   This API is DEPRECATED.  The only supported standard encoding is rot13.
405
   Use PyCodec_Encode() to encode with rot13 and non-standard codecs
406
   that encode from str to str. */
407
408
Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
409
    PyObject *unicode,          /* Unicode object */
410
    const char *encoding,       /* encoding */
411
    const char *errors          /* error handling */
412
    );
413
414
/* Build an encoding map. */
415
416
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
417
    PyObject* string            /* 256 character map */
418
   );
419
420
/* --- UTF-7 Codecs ------------------------------------------------------- */
421
422
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
423
    const char *string,         /* UTF-7 encoded string */
424
    Py_ssize_t length,          /* size of string */
425
    const char *errors          /* error handling */
426
    );
427
428
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
429
    const char *string,         /* UTF-7 encoded string */
430
    Py_ssize_t length,          /* size of string */
431
    const char *errors,         /* error handling */
432
    Py_ssize_t *consumed        /* bytes consumed */
433
    );
434
435
/* --- UTF-8 Codecs ------------------------------------------------------- */
436
437
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
438
    const char *string,         /* UTF-8 encoded string */
439
    Py_ssize_t length,          /* size of string */
440
    const char *errors          /* error handling */
441
    );
442
443
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
444
    const char *string,         /* UTF-8 encoded string */
445
    Py_ssize_t length,          /* size of string */
446
    const char *errors,         /* error handling */
447
    Py_ssize_t *consumed        /* bytes consumed */
448
    );
449
450
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
451
    PyObject *unicode           /* Unicode object */
452
    );
453
454
/* Returns a pointer to the default encoding (UTF-8) of the
455
   Unicode object unicode and the size of the encoded representation
456
   in bytes stored in *size.
457
458
   In case of an error, *size is not set.
459
460
   This function caches the UTF-8 encoded string in the unicodeobject
461
   and subsequent calls will return the same string.  The memory is released
462
   when the unicodeobject is deallocated.
463
*/
464
465
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
466
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
467
    PyObject *unicode,
468
    Py_ssize_t *size);
469
#endif
470
471
/* --- UTF-32 Codecs ------------------------------------------------------ */
472
473
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
474
   the corresponding Unicode object.
475
476
   errors (if non-NULL) defines the error handling. It defaults
477
   to "strict".
478
479
   If byteorder is non-NULL, the decoder starts decoding using the
480
   given byte order:
481
482
    *byteorder == -1: little endian
483
    *byteorder == 0:  native order
484
    *byteorder == 1:  big endian
485
486
   In native mode, the first four bytes of the stream are checked for a
487
   BOM mark. If found, the BOM mark is analysed, the byte order
488
   adjusted and the BOM skipped.  In the other modes, no BOM mark
489
   interpretation is done. After completion, *byteorder is set to the
490
   current byte order at the end of input data.
491
492
   If byteorder is NULL, the codec starts in native order mode.
493
494
*/
495
496
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
497
    const char *string,         /* UTF-32 encoded string */
498
    Py_ssize_t length,          /* size of string */
499
    const char *errors,         /* error handling */
500
    int *byteorder              /* pointer to byteorder to use
501
                                   0=native;-1=LE,1=BE; updated on
502
                                   exit */
503
    );
504
505
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
506
    const char *string,         /* UTF-32 encoded string */
507
    Py_ssize_t length,          /* size of string */
508
    const char *errors,         /* error handling */
509
    int *byteorder,             /* pointer to byteorder to use
510
                                   0=native;-1=LE,1=BE; updated on
511
                                   exit */
512
    Py_ssize_t *consumed        /* bytes consumed */
513
    );
514
515
/* Returns a Python string using the UTF-32 encoding in native byte
516
   order. The string always starts with a BOM mark.  */
517
518
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
519
    PyObject *unicode           /* Unicode object */
520
    );
521
522
/* Returns a Python string object holding the UTF-32 encoded value of
523
   the Unicode data.
524
525
   If byteorder is not 0, output is written according to the following
526
   byte order:
527
528
   byteorder == -1: little endian
529
   byteorder == 0:  native byte order (writes a BOM mark)
530
   byteorder == 1:  big endian
531
532
   If byteorder is 0, the output string will always start with the
533
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
534
   prepended.
535
536
*/
537
538
/* --- UTF-16 Codecs ------------------------------------------------------ */
539
540
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
541
   the corresponding Unicode object.
542
543
   errors (if non-NULL) defines the error handling. It defaults
544
   to "strict".
545
546
   If byteorder is non-NULL, the decoder starts decoding using the
547
   given byte order:
548
549
    *byteorder == -1: little endian
550
    *byteorder == 0:  native order
551
    *byteorder == 1:  big endian
552
553
   In native mode, the first two bytes of the stream are checked for a
554
   BOM mark. If found, the BOM mark is analysed, the byte order
555
   adjusted and the BOM skipped.  In the other modes, no BOM mark
556
   interpretation is done. After completion, *byteorder is set to the
557
   current byte order at the end of input data.
558
559
   If byteorder is NULL, the codec starts in native order mode.
560
561
*/
562
563
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
564
    const char *string,         /* UTF-16 encoded string */
565
    Py_ssize_t length,          /* size of string */
566
    const char *errors,         /* error handling */
567
    int *byteorder              /* pointer to byteorder to use
568
                                   0=native;-1=LE,1=BE; updated on
569
                                   exit */
570
    );
571
572
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
573
    const char *string,         /* UTF-16 encoded string */
574
    Py_ssize_t length,          /* size of string */
575
    const char *errors,         /* error handling */
576
    int *byteorder,             /* pointer to byteorder to use
577
                                   0=native;-1=LE,1=BE; updated on
578
                                   exit */
579
    Py_ssize_t *consumed        /* bytes consumed */
580
    );
581
582
/* Returns a Python string using the UTF-16 encoding in native byte
583
   order. The string always starts with a BOM mark.  */
584
585
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
586
    PyObject *unicode           /* Unicode object */
587
    );
588
589
/* --- Unicode-Escape Codecs ---------------------------------------------- */
590
591
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
592
    const char *string,         /* Unicode-Escape encoded string */
593
    Py_ssize_t length,          /* size of string */
594
    const char *errors          /* error handling */
595
    );
596
597
PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
598
    PyObject *unicode           /* Unicode object */
599
    );
600
601
/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
602
603
PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
604
    const char *string,         /* Raw-Unicode-Escape encoded string */
605
    Py_ssize_t length,          /* size of string */
606
    const char *errors          /* error handling */
607
    );
608
609
PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
610
    PyObject *unicode           /* Unicode object */
611
    );
612
613
/* --- Latin-1 Codecs -----------------------------------------------------
614
615
   Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
616
617
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
618
    const char *string,         /* Latin-1 encoded string */
619
    Py_ssize_t length,          /* size of string */
620
    const char *errors          /* error handling */
621
    );
622
623
PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
624
    PyObject *unicode           /* Unicode object */
625
    );
626
627
/* --- ASCII Codecs -------------------------------------------------------
628
629
   Only 7-bit ASCII data is accepted. All other codes generate errors.
630
631
*/
632
633
PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
634
    const char *string,         /* ASCII encoded string */
635
    Py_ssize_t length,          /* size of string */
636
    const char *errors          /* error handling */
637
    );
638
639
PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
640
    PyObject *unicode           /* Unicode object */
641
    );
642
643
/* --- Character Map Codecs -----------------------------------------------
644
645
   This codec uses mappings to encode and decode characters.
646
647
   Decoding mappings must map byte ordinals (integers in the range from 0 to
648
   255) to Unicode strings, integers (which are then interpreted as Unicode
649
   ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
650
   as well as those mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
651
   mapping" and cause an error.
652
653
   Encoding mappings must map Unicode ordinal integers to bytes objects,
654
   integers in the range from 0 to 255 or None.  Unmapped character
655
   ordinals (ones which cause a LookupError) as well as those mapped to
656
   None are treated as "undefined mapping" and cause an error.
657
658
*/
659
660
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
661
    const char *string,         /* Encoded string */
662
    Py_ssize_t length,          /* size of string */
663
    PyObject *mapping,          /* decoding mapping */
664
    const char *errors          /* error handling */
665
    );
666
667
PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
668
    PyObject *unicode,          /* Unicode object */
669
    PyObject *mapping           /* encoding mapping */
670
    );
671
672
/* --- MBCS codecs for Windows -------------------------------------------- */
673
674
#ifdef MS_WINDOWS
675
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
676
    const char *string,         /* MBCS encoded string */
677
    Py_ssize_t length,          /* size of string */
678
    const char *errors          /* error handling */
679
    );
680
681
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
682
    const char *string,         /* MBCS encoded string */
683
    Py_ssize_t length,          /* size of string */
684
    const char *errors,         /* error handling */
685
    Py_ssize_t *consumed        /* bytes consumed */
686
    );
687
688
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
689
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
690
    int code_page,              /* code page number */
691
    const char *string,         /* encoded string */
692
    Py_ssize_t length,          /* size of string */
693
    const char *errors,         /* error handling */
694
    Py_ssize_t *consumed        /* bytes consumed */
695
    );
696
#endif
697
698
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
699
    PyObject *unicode           /* Unicode object */
700
    );
701
702
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
703
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
704
    int code_page,              /* code page number */
705
    PyObject *unicode,          /* Unicode object */
706
    const char *errors          /* error handling */
707
    );
708
#endif
709
710
#endif /* MS_WINDOWS */
711
712
/* --- Locale encoding --------------------------------------------------- */
713
714
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
715
/* Decode a string from the current locale encoding. The decoder is strict if
716
   *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
717
   error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
718
   be decoded as a surrogate character and *surrogateescape* is not equal to
719
   zero, the byte sequence is escaped using the 'surrogateescape' error handler
720
   instead of being decoded. *str* must end with a null character but cannot
721
   contain embedded null characters. */
722
723
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
724
    const char *str,
725
    Py_ssize_t len,
726
    const char *errors);
727
728
/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
729
   length using strlen(). */
730
731
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
732
    const char *str,
733
    const char *errors);
734
735
/* Encode a Unicode object to the current locale encoding. The encoder is
736
   strict if *surrogateescape* is equal to zero, otherwise the
737
   "surrogateescape" error handler is used. Return a bytes object. The string
738
   cannot contain embedded null characters. */
739
740
PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
741
    PyObject *unicode,
742
    const char *errors
743
    );
744
#endif
745
746
/* --- File system encoding ---------------------------------------------- */
747
748
/* ParseTuple converter: encode str objects to bytes using
749
   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
750
751
PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
752
753
/* ParseTuple converter: decode bytes objects to unicode using
754
   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
755
756
PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
757
758
/* Decode a null-terminated string from the Python filesystem encoding
759
   and error handler.
760
761
   If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */
762
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
763
    const char *s               /* encoded string */
764
    );
765
766
/* Decode a string from the Python filesystem encoding and error handler. */
767
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
768
    const char *s,               /* encoded string */
769
    Py_ssize_t size              /* size */
770
    );
771
772
/* Encode a Unicode object to the Python filesystem encoding and error handler.
773
   Return bytes. */
774
PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
775
    PyObject *unicode           /* Unicode object */
776
    );
777
778
/* --- Methods & Slots ----------------------------------------------------
779
780
   These are capable of handling Unicode objects and strings on input
781
   (we refer to them as strings in the descriptions) and return
782
   Unicode objects or integers as appropriate. */
783
784
/* Concat two strings giving a new Unicode string. */
785
786
PyAPI_FUNC(PyObject*) PyUnicode_Concat(
787
    PyObject *left,             /* Left string */
788
    PyObject *right             /* Right string */
789
    );
790
791
/* Concat two strings and put the result in *pleft
792
   (sets *pleft to NULL on error) */
793
794
PyAPI_FUNC(void) PyUnicode_Append(
795
    PyObject **pleft,           /* Pointer to left string */
796
    PyObject *right             /* Right string */
797
    );
798
799
/* Concat two strings, put the result in *pleft and drop the right object
800
   (sets *pleft to NULL on error) */
801
802
PyAPI_FUNC(void) PyUnicode_AppendAndDel(
803
    PyObject **pleft,           /* Pointer to left string */
804
    PyObject *right             /* Right string */
805
    );
806
807
/* Split a string giving a list of Unicode strings.
808
809
   If sep is NULL, splitting will be done at all whitespace
810
   substrings. Otherwise, splits occur at the given separator.
811
812
   At most maxsplit splits will be done. If maxsplit is negative, no limit is set.
813
814
   Separators are not included in the resulting list.
815
816
*/
817
818
PyAPI_FUNC(PyObject*) PyUnicode_Split(
819
    PyObject *s,                /* String to split */
820
    PyObject *sep,              /* String separator */
821
    Py_ssize_t maxsplit         /* Maxsplit count */
822
    );
823
824
/* Split a string at line breaks, giving a list of Unicode strings.
825
826
   CRLF is considered to be one line break. Line breaks are not
827
   included in the resulting list unless keepends is true. */
828
829
PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
830
    PyObject *s,                /* String to split */
831
    int keepends                /* If true, line end markers are included */
832
    );
833
834
/* Partition a string using a given separator. */
835
836
PyAPI_FUNC(PyObject*) PyUnicode_Partition(
837
    PyObject *s,                /* String to partition */
838
    PyObject *sep               /* String separator */
839
    );
840
841
/* Partition a string using a given separator, searching from the end of the
842
   string. */
843
844
PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
845
    PyObject *s,                /* String to partition */
846
    PyObject *sep               /* String separator */
847
    );
848
849
/* Split a string giving a list of Unicode strings.
850
851
   If sep is NULL, splitting will be done at all whitespace
852
   substrings. Otherwise, splits occur at the given separator.
853
854
   At most maxsplit splits will be done; if maxsplit is negative, no
855
   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
856
   the end of the string.
857
858
   Separators are not included in the resulting list.
859
860
*/
861
862
PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
863
    PyObject *s,                /* String to split */
864
    PyObject *sep,              /* String separator */
865
    Py_ssize_t maxsplit         /* Maxsplit count */
866
    );
867
868
/* Translate a string by applying a character mapping table to it and
869
   return the resulting Unicode object.
870
871
   The mapping table must map Unicode ordinal integers to Unicode strings,
872
   Unicode ordinal integers or None (causing deletion of the character).
873
874
   Mapping tables may be dictionaries or sequences. Unmapped character
875
   ordinals (ones which cause a LookupError) are left untouched and
876
   are copied as-is.
877
878
*/
879
880
PyAPI_FUNC(PyObject *) PyUnicode_Translate(
881
    PyObject *str,              /* String */
882
    PyObject *table,            /* Translate table */
883
    const char *errors          /* error handling */
884
    );
885
886
/* Join a sequence of strings using the given separator and return
887
   the resulting Unicode string. */
888
889
PyAPI_FUNC(PyObject*) PyUnicode_Join(
890
    PyObject *separator,        /* Separator string */
891
    PyObject *seq               /* Sequence object */
892
    );
893
894
/* Return 1 if substr matches str[start:end] at the given tail end, 0
895
   otherwise. Return -1 if an error occurred. */
896
897
PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
898
    PyObject *str,              /* String */
899
    PyObject *substr,           /* Prefix or Suffix string */
900
    Py_ssize_t start,           /* Start index */
901
    Py_ssize_t end,             /* Stop index */
902
    int direction               /* Tail end: -1 prefix, +1 suffix */
903
    );
904
905
/* Return the first position of substr in str[start:end] using the
906
   given search direction or -1 if not found. -2 is returned in case
907
   an error occurred and an exception is set. */
908
909
PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
910
    PyObject *str,              /* String */
911
    PyObject *substr,           /* Substring to find */
912
    Py_ssize_t start,           /* Start index */
913
    Py_ssize_t end,             /* Stop index */
914
    int direction               /* Find direction: +1 forward, -1 backward */
915
    );
916
917
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
918
/* Like PyUnicode_Find, but search for single character only. */
919
PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
920
    PyObject *str,              /* String */
921
    Py_UCS4 ch,                 /* Character to find */
922
    Py_ssize_t start,           /* Start index */
923
    Py_ssize_t end,             /* Stop index */
924
    int direction               /* Find direction, as for PyUnicode_Find: +1 forward, -1 backward */
925
    );
926
#endif
927
928
/* Count the number of occurrences of substr in str[start:end]. */
929
930
PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
931
    PyObject *str,              /* String */
932
    PyObject *substr,           /* Substring to count */
933
    Py_ssize_t start,           /* Start index */
934
    Py_ssize_t end              /* Stop index */
935
    );
936
937
/* Replace at most maxcount occurrences of substr in str with replstr
938
   and return the resulting Unicode object. */
939
940
PyAPI_FUNC(PyObject *) PyUnicode_Replace(
941
    PyObject *str,              /* String */
942
    PyObject *substr,           /* Substring to find */
943
    PyObject *replstr,          /* Substring to replace */
944
    Py_ssize_t maxcount         /* Max. number of replacements to apply;
945
                                   -1 = all */
946
    );
947
948
/* Compare two strings and return -1, 0, 1 for less than, equal,
949
   greater than, respectively.
950
   Raise an exception and return -1 on error. */
951
952
PyAPI_FUNC(int) PyUnicode_Compare(
953
    PyObject *left,             /* Left string */
954
    PyObject *right             /* Right string */
955
    );
956
957
/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
958
   equal, and greater than, respectively.  It is best to pass only
959
   ASCII-encoded strings, but the function interprets the input string as
960
   ISO-8859-1 if it contains non-ASCII characters.
961
   This function does not raise exceptions. */
962
963
PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
964
    PyObject *left,             /* Unicode object */
965
    const char *right           /* ASCII-encoded string */
966
    );
967
968
/* Rich compare two strings and return one of the following:
969
970
   - NULL in case an exception was raised
971
   - Py_True or Py_False for successful comparisons
972
   - Py_NotImplemented in case the type combination is unknown
973
974
   Possible values for op:
975
976
     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
977
978
*/
979
980
PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
981
    PyObject *left,             /* Left string */
982
    PyObject *right,            /* Right string */
983
    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
984
    );
985
986
/* Apply an argument tuple or dictionary to a format string and return
987
   the resulting Unicode string. */
988
989
PyAPI_FUNC(PyObject *) PyUnicode_Format(
990
    PyObject *format,           /* Format string */
991
    PyObject *args              /* Argument tuple or dictionary */
992
    );
993
994
/* Check whether element is contained in container and return 1/0
995
   accordingly.
996
997
   element has to coerce to a one element Unicode string. -1 is
998
   returned in case of an error. */
999
1000
PyAPI_FUNC(int) PyUnicode_Contains(
1001
    PyObject *container,        /* Container string */
1002
    PyObject *element           /* Element string */
1003
    );
1004
1005
/* Checks whether argument is a valid identifier. */
1006
1007
PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1008
1009
/* === Characters Type APIs =============================================== */
1010
1011
#ifndef Py_LIMITED_API
1012
#  define Py_CPYTHON_UNICODEOBJECT_H
1013
#  include "cpython/unicodeobject.h"
1014
#  undef Py_CPYTHON_UNICODEOBJECT_H
1015
#endif
1016
1017
#ifdef __cplusplus
1018
}
1019
#endif
1020
#endif /* !Py_UNICODEOBJECT_H */