LCOV - CPython lcov report - Modules/unicodedata.c

LCOV - code coverage report

Current view:	top level - Modules - unicodedata.c (source / functions)		Hit	Total	Coverage
Test:	CPython lcov report	Lines:	561	635	88.3 %
Date:	2022-07-07 18:19:46	Functions:	35	35	100.0 %

          Line data    Source code

       1             : /* ------------------------------------------------------------------------
       2             : 
       3             :    unicodedata -- Provides access to the Unicode database.
       4             : 
       5             :    The current version number is reported in the unidata_version constant.
       6             : 
       7             :    Written by Marc-Andre Lemburg (mal@lemburg.com).
       8             :    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       9             :    Modified by Martin v. Löwis (martin@v.loewis.de)
      10             : 
      11             :    Copyright (c) Corporation for National Research Initiatives.
      12             : 
      13             :    ------------------------------------------------------------------------ */
      14             : 
      15             : #ifndef Py_BUILD_CORE_BUILTIN
      16             : #  define Py_BUILD_CORE_MODULE 1
      17             : #endif
      18             : 
      19             : #define PY_SSIZE_T_CLEAN
      20             : 
      21             : #include "Python.h"
      22             : #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
      23             : #include "structmember.h"         // PyMemberDef
      24             : 
      25             : #include <stdbool.h>
      26             : 
      27             : /*[clinic input]
      28             : module unicodedata
      29             : class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
      30             : [clinic start generated code]*/
      31             : /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
      32             : 
      33             : /* character properties */
      34             : 
      35             : typedef struct { /* per-character property record; element type of _PyUnicode_Database_Records */
      36             :     const unsigned char category;       /* index into
      37             :                                            _PyUnicode_CategoryNames */
      38             :     const unsigned char combining;      /* combining class value 0 - 255 */
      39             :     const unsigned char bidirectional;  /* index into
      40             :                                            _PyUnicode_BidirectionalNames */
      41             :     const unsigned char mirrored;       /* true if mirrored in bidir mode */
      42             :     const unsigned char east_asian_width;       /* index into
      43             :                                                    _PyUnicode_EastAsianWidthNames */
      44             :     const unsigned char normalization_quick_check; /* see is_normalized() */
      45             : } _PyUnicode_DatabaseRecord;
      46             : 
      47             : typedef struct change_record { /* per-code-point differences of a previous database version */
      48             :     /* sequence of fields should be the same as in merge_old_version */
      49             :     const unsigned char bidir_changed;            /* old index into _PyUnicode_BidirectionalNames; 0xFF = unchanged */
      50             :     const unsigned char category_changed;         /* old index into _PyUnicode_CategoryNames; 0xFF = unchanged, 0 is treated as unassigned */
      51             :     const unsigned char decimal_changed;          /* old decimal value; 0xFF = unchanged */
      52             :     const unsigned char mirrored_changed;         /* old mirrored flag; 0xFF = unchanged */
      53             :     const unsigned char east_asian_width_changed; /* old index into _PyUnicode_EastAsianWidthNames; 0xFF = unchanged */
      54             :     const double numeric_changed;                 /* NOTE(review): not read in the visible part of this file */
      55             : } change_record;
      56             : 
      57             : /* data file generated by Tools/unicode/makeunicodedata.py */
      58             : #include "unicodedata_db.h"
      59             : 
      60             : static const _PyUnicode_DatabaseRecord*
      61    12644700 : _getrecord_ex(Py_UCS4 code)
      62             : { /* Return the property record for code, found through the two-level index1/index2 tables. */
      63             :     int index;
      64    12644700 :     if (code >= 0x110000)
      65           0 :         index = 0; /* out of range: fall back to record 0 */
      66             :     else {
      67    12644700 :         index = index1[(code>>SHIFT)]; /* high bits select a block */
      68    12644700 :         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; /* block plus low SHIFT bits select the record */
      69             :     }
      70             : 
      71    12644700 :     return &_PyUnicode_Database_Records[index];
      72             : }
      73             : 
      74             : /* ------------- Previous-version API ------------------------------------- */
      75             : typedef struct previous_version { /* Python object giving access to a previous database version */
      76             :     PyObject_HEAD
      77             :     const char *name;                            /* exposed read-only as the "unidata_version" member */
      78             :     const change_record* (*getrecord)(Py_UCS4);  /* returns the change record of a code point */
      79             :     Py_UCS4 (*normalization)(Py_UCS4);           /* returns a code point to decompose instead, or 0 for none (see nfd_nfkd) */
      80             : } PreviousDBVersion;
      81             : 
      82             : #include "clinic/unicodedata.c.h"
      83             : 
      84             : #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v)) /* self is cast unchecked: callers test UCD_Check(self) first */
      85             : 
      86             : static PyMemberDef DB_members[] = { /* member table exposing PreviousDBVersion fields */
      87             :         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
      88             :         {NULL} /* sentinel entry */
      89             : };
      90             : 
      91             : // Check whether self is a unicodedata.UCD instance.
      92             : // Evaluates to 0 if self is NULL (when the PyCapsule C API is used) or is a module object.
      93             : // PyModule_Check() is used to avoid having to retrieve the ucd_type.
      94             : // See the unicodedata_functions comment for the rationale of this macro.
      95             : #define UCD_Check(self) (self != NULL && !PyModule_Check(self))
      96             : 
      97             : static PyObject*
      98          81 : new_previous_version(PyTypeObject *ucd_type,
      99             :                      const char*name, const change_record* (*getrecord)(Py_UCS4),
     100             :                      Py_UCS4 (*normalization)(Py_UCS4))
     101             : { /* Create a GC-tracked PreviousDBVersion of type ucd_type; returns NULL if the allocation fails. */
     102             :     PreviousDBVersion *self;
     103          81 :     self = PyObject_GC_New(PreviousDBVersion, ucd_type);
     104          81 :     if (self == NULL)
     105           0 :         return NULL;
     106          81 :     self->name = name; /* the pointer is stored as-is, the string is not copied */
     107          81 :     self->getrecord = getrecord;
     108          81 :     self->normalization = normalization;
     109          81 :     PyObject_GC_Track(self); /* start tracking only once every field is set */
     110          81 :     return (PyObject*)self;
     111             : }
     112             : 
     113             : 
     114             : /* --- Module API --------------------------------------------------------- */
     115             : 
     116             : /*[clinic input]
     117             : unicodedata.UCD.decimal
     118             : 
     119             :     self: self
     120             :     chr: int(accept={str})
     121             :     default: object=NULL
     122             :     /
     123             : 
     124             : Converts a Unicode character into its equivalent decimal value.
     125             : 
     126             : Returns the decimal value assigned to the character chr as integer.
     127             : If no such value is defined, default is returned, or, if not given,
     128             : ValueError is raised.
     129             : [clinic start generated code]*/
     130             : 
     131             : static PyObject *
     132     1179660 : unicodedata_UCD_decimal_impl(PyObject *self, int chr,
     133             :                              PyObject *default_value)
     134             : /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
     135             : { /* Return the decimal value of chr as an int; else default_value if given; else raise ValueError. */
     136     1179660 :     int have_old = 0; /* set when the previous-version record decides the result */
     137             :     long rc;
     138     1179660 :     Py_UCS4 c = (Py_UCS4)chr;
     139             : /* A UCD (previous-version) object may override the current database. */
     140     1179660 :     if (UCD_Check(self)) {
     141           0 :         const change_record *old = get_old_record(self, c);
     142           0 :         if (old->category_changed == 0) {
     143             :             /* unassigned */
     144           0 :             have_old = 1;
     145           0 :             rc = -1;
     146             :         }
     147           0 :         else if (old->decimal_changed != 0xFF) { /* 0xFF means unchanged */
     148           0 :             have_old = 1;
     149           0 :             rc = old->decimal_changed;
     150             :         }
     151             :     }
     152             : /* Otherwise fall back to the current database. */
     153     1179660 :     if (!have_old)
     154     1179660 :         rc = Py_UNICODE_TODECIMAL(c);
     155     1179660 :     if (rc < 0) { /* negative: no decimal value */
     156     1178620 :         if (default_value == NULL) {
     157           1 :             PyErr_SetString(PyExc_ValueError,
     158             :                             "not a decimal");
     159           1 :             return NULL;
     160             :         }
     161             :         else {
     162     1178620 :             Py_INCREF(default_value); /* hand back a new reference */
     163     1178620 :             return default_value;
     164             :         }
     165             :     }
     166        1032 :     return PyLong_FromLong(rc);
     167             : }
     168             : 
     169             : /*[clinic input]
     170             : unicodedata.UCD.digit
     171             : 
     172             :     self: self
     173             :     chr: int(accept={str})
     174             :     default: object=NULL
     175             :     /
     176             : 
     177             : Converts a Unicode character into its equivalent digit value.
     178             : 
     179             : Returns the digit value assigned to the character chr as integer.
     180             : If no such value is defined, default is returned, or, if not given,
     181             : ValueError is raised.
     182             : [clinic start generated code]*/
     183             : 
     184             : static PyObject *
     185     1179660 : unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
     186             : /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
     187             : { /* Return the digit value of chr as an int; else default_value if given; else raise ValueError. */
     188             :     long rc;
     189     1179660 :     Py_UCS4 c = (Py_UCS4)chr;
     190     1179660 :     rc = Py_UNICODE_TODIGIT(c); /* NOTE(review): self is never consulted here, unlike decimal() and numeric() - confirm intended */
     191     1179660 :     if (rc < 0) { /* negative: no digit value */
     192     1178400 :         if (default_value == NULL) {
     193           1 :             PyErr_SetString(PyExc_ValueError, "not a digit");
     194           1 :             return NULL;
     195             :         }
     196             :         else {
     197     1178400 :             Py_INCREF(default_value); /* hand back a new reference */
     198     1178400 :             return default_value;
     199             :         }
     200             :     }
     201        1256 :     return PyLong_FromLong(rc);
     202             : }
     203             : 
     204             : /*[clinic input]
     205             : unicodedata.UCD.numeric
     206             : 
     207             :     self: self
     208             :     chr: int(accept={str})
     209             :     default: object=NULL
     210             :     /
     211             : 
     212             : Converts a Unicode character into its equivalent numeric value.
     213             : 
     214             : Returns the numeric value assigned to the character chr as float.
     215             : If no such value is defined, default is returned, or, if not given,
     216             : ValueError is raised.
     217             : [clinic start generated code]*/
     218             : 
     219             : static PyObject *
     220     1114960 : unicodedata_UCD_numeric_impl(PyObject *self, int chr,
     221             :                              PyObject *default_value)
     222             : /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
     223             : { /* Return the numeric value of chr as a float; else default_value if given; else raise ValueError. */
     224     1114960 :     int have_old = 0; /* set when the previous-version record decides the result */
     225             :     double rc;
     226     1114960 :     Py_UCS4 c = (Py_UCS4)chr;
     227             : /* A UCD (previous-version) object may override the current database. */
     228     1114960 :     if (UCD_Check(self)) {
     229           0 :         const change_record *old = get_old_record(self, c);
     230           0 :         if (old->category_changed == 0) {
     231             :             /* unassigned */
     232           0 :             have_old = 1;
     233           0 :             rc = -1.0;
     234             :         }
     235           0 :         else if (old->decimal_changed != 0xFF) { /* NOTE(review): reads decimal_changed although change_record has numeric_changed - confirm intended */
     236           0 :             have_old = 1;
     237           0 :             rc = old->decimal_changed;
     238             :         }
     239             :     }
     240             : /* Otherwise fall back to the current database. */
     241     1114960 :     if (!have_old)
     242     1114960 :         rc = Py_UNICODE_TONUMERIC(c);
     243     1114960 :     if (rc == -1.0) { /* -1.0 is treated as "no numeric value" */
     244     1112240 :         if (default_value == NULL) {
     245           1 :             PyErr_SetString(PyExc_ValueError, "not a numeric character");
     246           1 :             return NULL;
     247             :         }
     248             :         else {
     249     1112240 :             Py_INCREF(default_value); /* hand back a new reference */
     250     1112240 :             return default_value;
     251             :         }
     252             :     }
     253        2712 :     return PyFloat_FromDouble(rc);
     254             : }
     255             : 
     256             : /*[clinic input]
     257             : unicodedata.UCD.category
     258             : 
     259             :     self: self
     260             :     chr: int(accept={str})
     261             :     /
     262             : 
     263             : Returns the general category assigned to the character chr as string.
     264             : [clinic start generated code]*/
     265             : 
     266             : static PyObject *
     267     2229040 : unicodedata_UCD_category_impl(PyObject *self, int chr)
     268             : /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
     269             : { /* Return the general category name of chr as a str, honouring a previous-version override. */
     270             :     int index;
     271     2229040 :     Py_UCS4 c = (Py_UCS4)chr;
     272     2229040 :     index = (int) _getrecord_ex(c)->category; /* value from the current database */
     273     2229040 :     if (UCD_Check(self)) {
     274         810 :         const change_record *old = get_old_record(self, c);
     275         810 :         if (old->category_changed != 0xFF) /* 0xFF means unchanged */
     276           5 :             index = old->category_changed;
     277             :     }
     278     2229040 :     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
     279             : }
     280             : 
     281             : /*[clinic input]
     282             : unicodedata.UCD.bidirectional
     283             : 
     284             :     self: self
     285             :     chr: int(accept={str})
     286             :     /
     287             : 
     288             : Returns the bidirectional class assigned to the character chr as string.
     289             : 
     290             : If no such value is defined, an empty string is returned.
     291             : [clinic start generated code]*/
     292             : 
     293             : static PyObject *
     294     2228460 : unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
     295             : /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
     296             : { /* Return the bidirectional class name of chr as a str, honouring a previous-version override. */
     297             :     int index;
     298     2228460 :     Py_UCS4 c = (Py_UCS4)chr;
     299     2228460 :     index = (int) _getrecord_ex(c)->bidirectional; /* value from the current database */
     300     2228460 :     if (UCD_Check(self)) {
     301         236 :         const change_record *old = get_old_record(self, c);
     302         236 :         if (old->category_changed == 0)
     303           0 :             index = 0; /* unassigned */
     304         236 :         else if (old->bidir_changed != 0xFF) /* 0xFF means unchanged */
     305           0 :             index = old->bidir_changed;
     306             :     }
     307     2228460 :     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
     308             : }
     309             : 
     310             : /*[clinic input]
     311             : unicodedata.UCD.combining -> int
     312             : 
     313             :     self: self
     314             :     chr: int(accept={str})
     315             :     /
     316             : 
     317             : Returns the canonical combining class assigned to the character chr as integer.
     318             : 
     319             : Returns 0 if no combining class is defined.
     320             : [clinic start generated code]*/
     321             : 
     322             : static int
     323     1114120 : unicodedata_UCD_combining_impl(PyObject *self, int chr)
     324             : /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
     325             : { /* Return the combining class of chr; 0 if the previous version marks chr as unassigned. */
     326             :     int index;
     327     1114120 :     Py_UCS4 c = (Py_UCS4)chr;
     328     1114120 :     index = (int) _getrecord_ex(c)->combining; /* value from the current database */
     329     1114120 :     if (UCD_Check(self)) {
     330           0 :         const change_record *old = get_old_record(self, c);
     331           0 :         if (old->category_changed == 0)
     332           0 :             index = 0; /* unassigned */
     333             :     }
     334     1114120 :     return index;
     335             : }
     336             : 
     337             : /*[clinic input]
     338             : unicodedata.UCD.mirrored -> int
     339             : 
     340             :     self: self
     341             :     chr: int(accept={str})
     342             :     /
     343             : 
     344             : Returns the mirrored property assigned to the character chr as integer.
     345             : 
     346             : Returns 1 if the character has been identified as a "mirrored"
     347             : character in bidirectional text, 0 otherwise.
     348             : [clinic start generated code]*/
     349             : 
     350             : static int
     351     1114120 : unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
     352             : /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
     353             : { /* Return the mirrored flag of chr, honouring a previous-version override. */
     354             :     int index;
     355     1114120 :     Py_UCS4 c = (Py_UCS4)chr;
     356     1114120 :     index = (int) _getrecord_ex(c)->mirrored; /* value from the current database */
     357     1114120 :     if (UCD_Check(self)) {
     358           1 :         const change_record *old = get_old_record(self, c);
     359           1 :         if (old->category_changed == 0)
     360           0 :             index = 0; /* unassigned */
     361           1 :         else if (old->mirrored_changed != 0xFF) /* 0xFF means unchanged */
     362           1 :             index = old->mirrored_changed;
     363             :     }
     364     1114120 :     return index;
     365             : }
     366             : 
     367             : /*[clinic input]
     368             : unicodedata.UCD.east_asian_width
     369             : 
     370             :     self: self
     371             :     chr: int(accept={str})
     372             :     /
     373             : 
     374             : Returns the east asian width assigned to the character chr as string.
     375             : [clinic start generated code]*/
     376             : 
     377             : static PyObject *
     378           9 : unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
     379             : /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
     380             : { /* Return the east asian width name of chr as a str, honouring a previous-version override. */
     381             :     int index;
     382           9 :     Py_UCS4 c = (Py_UCS4)chr;
     383           9 :     index = (int) _getrecord_ex(c)->east_asian_width; /* value from the current database */
     384           9 :     if (UCD_Check(self)) {
     385           1 :         const change_record *old = get_old_record(self, c);
     386           1 :         if (old->category_changed == 0)
     387           0 :             index = 0; /* unassigned */
     388           1 :         else if (old->east_asian_width_changed != 0xFF) /* 0xFF means unchanged */
     389           1 :             index = old->east_asian_width_changed;
     390             :     }
     391           9 :     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
     392             : }
     393             : 
     394             : /*[clinic input]
     395             : unicodedata.UCD.decomposition
     396             : 
     397             :     self: self
     398             :     chr: int(accept={str})
     399             :     /
     400             : 
     401             : Returns the character decomposition mapping assigned to the character chr as string.
     402             : 
     403             : An empty string is returned in case no such mapping is defined.
     404             : [clinic start generated code]*/
     405             : 
     406             : static PyObject *
     407     2233890 : unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
     408             : /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
     409             : { /* Return the decomposition mapping of chr as a str: an optional prefix followed by space-separated hex values. */
     410             :     char decomp[256]; /* scratch buffer in which the result text is assembled */
     411             :     int code, index, count;
     412             :     size_t i; /* number of bytes written to decomp so far */
     413             :     unsigned int prefix_index;
     414     2233890 :     Py_UCS4 c = (Py_UCS4)chr;
     415             : 
     416     2233890 :     code = (int)c;
     417             : 
     418     2233890 :     if (UCD_Check(self)) {
     419           0 :         const change_record *old = get_old_record(self, c);
     420           0 :         if (old->category_changed == 0)
     421           0 :             return PyUnicode_FromString(""); /* unassigned */
     422             :     }
     423             : 
     424     2233890 :     if (code < 0 || code >= 0x110000)
     425           0 :         index = 0; /* out of range: use entry 0 of decomp_data */
     426             :     else {
     427     2233890 :         index = decomp_index1[(code>>DECOMP_SHIFT)];
     428     2233890 :         index = decomp_index2[(index<<DECOMP_SHIFT)+
     429     2233890 :                              (code&((1<<DECOMP_SHIFT)-1))];
     430             :     }
     431             : 
     432             :     /* high byte is the number of decomp_data entries that follow (usually
     433             :        one or two), low byte is the prefix code (index into decomp_prefix) */
     434     2233890 :     count = decomp_data[index] >> 8;
     435             : 
     436             :     /* XXX: could allocate the PyString up front instead
     437             :        (strlen(prefix) + 5 * count + 1 bytes) */
     438             : 
     439             :     /* Based on how index is calculated above and decomp_data is generated
     440             :        from Tools/unicode/makeunicodedata.py, it should not be possible
     441             :        to overflow decomp_prefix. */
     442     2233890 :     prefix_index = decomp_data[index] & 255;
     443     2233890 :     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
     444             : 
     445             :     /* copy prefix */
     446     2233890 :     i = strlen(decomp_prefix[prefix_index]);
     447     2233890 :     memcpy(decomp, decomp_prefix[prefix_index], i); /* the terminating NUL is not copied; i tracks the length */
     448             : 
     449     2259700 :     while (count-- > 0) {
     450       25806 :         if (i)
     451       19623 :             decomp[i++] = ' '; /* separator after the prefix or the previous value */
     452       25806 :         assert(i < sizeof(decomp));
     453       25806 :         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
     454             :                       decomp_data[++index]);
     455       25806 :         i += strlen(decomp + i); /* advance past the digits just written */
     456             :     }
     457     2233890 :     return PyUnicode_FromStringAndSize(decomp, i);
     458             : }
     459             : 
     460             : static void
     461      375095 : get_decomp_record(PyObject *self, Py_UCS4 code,
     462             :                   int *index, int *prefix, int *count)
     463             : { /* Store the decomposition record of code in *index (first value in decomp_data), *prefix and *count. */
     464      375095 :     if (code >= 0x110000) {
     465           0 :         *index = 0; /* out of range: use entry 0 of decomp_data */
     466             :     }
     467      375095 :     else if (UCD_Check(self)
     468         891 :              && get_old_record(self, code)->category_changed==0) {
     469             :         /* unassigned in old version */
     470           0 :         *index = 0;
     471             :     }
     472             :     else {
     473      375095 :         *index = decomp_index1[(code>>DECOMP_SHIFT)];
     474      375095 :         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
     475      375095 :                                (code&((1<<DECOMP_SHIFT)-1))];
     476             :     }
     477             : 
     478             :     /* high byte is the number of decomp_data entries that follow (usually
     479             :        one or two), low byte is the prefix code */
     480      375095 :     *count = decomp_data[*index] >> 8;
     481      375095 :     *prefix = decomp_data[*index] & 255;
     482             : 
     483      375095 :     (*index)++; /* step past the header entry to the first decomposition value */
     484      375095 : }
     485             : 
     486             : #define SBase   0xAC00 /* first code point of the range nfd_nfkd() decomposes arithmetically (Hangul) */
     487             : #define LBase   0x1100 /* base of the first output code point (L) */
     488             : #define VBase   0x1161 /* base of the second output code point (V) */
     489             : #define TBase   0x11A7 /* base of the optional third output code point (T); T == TBase means none */
     490             : #define LCount  19     /* number of distinct L values */
     491             : #define VCount  21     /* number of distinct V values */
     492             : #define TCount  28     /* number of distinct T values, including "none" */
     493             : #define NCount  (VCount*TCount) /* code points per L value */
     494             : #define SCount  (LCount*NCount) /* size of the range starting at SBase */
     495             : 
     496             : static PyObject*
     497      173824 : nfd_nfkd(PyObject *self, PyObject *input, int k)
     498             : { /* Return input fully decomposed and canonically ordered as a new str; compatibility mappings are applied only if k is non-zero. NULL on failure. */
     499             :     PyObject *result;
     500             :     Py_UCS4 *output;         /* heap buffer receiving the decomposed code points */
     501             :     Py_ssize_t i, o, osize;  /* i: next input index, o: next output index, osize: allocated elements */
     502             :     int kind;
     503             :     const void *data;
     504             :     /* Longest decomposition in Unicode 3.2: U+FDFA */
     505             :     Py_UCS4 stack[20];       /* code points still waiting to be decomposed */
     506             :     Py_ssize_t space, isize; /* space: free elements left in output, isize: input length */
     507             :     int index, prefix, count, stackptr;
     508             :     unsigned char prev, cur; /* combining classes of neighbouring characters */
     509             : 
     510      173824 :     stackptr = 0;
     511      173824 :     isize = PyUnicode_GET_LENGTH(input);
     512      173824 :     space = isize;
     513             :     /* Overallocate at most 10 characters. */
     514      173824 :     if (space > 10) {
     515         319 :         if (space <= PY_SSIZE_T_MAX - 10) /* skip the headroom if adding it would overflow */
     516         319 :             space += 10;
     517             :     }
     518             :     else {
     519      173505 :         space *= 2;
     520             :     }
     521      173824 :     osize = space;
     522      173824 :     output = PyMem_NEW(Py_UCS4, space);
     523      173824 :     if (!output) {
     524           0 :         PyErr_NoMemory();
     525           0 :         return NULL;
     526             :     }
     527      173824 :     i = o = 0;
     528      173824 :     kind = PyUnicode_KIND(input);
     529      173824 :     data = PyUnicode_DATA(input);
     530             : /* Pass 1: decompose every input character through the work stack. */
     531      553713 :     while (i < isize) {
     532      379889 :         stack[stackptr++] = PyUnicode_READ(kind, data, i++); /* seed the stack with the next input character */
     533      823495 :         while(stackptr) {
     534      443606 :             Py_UCS4 code = stack[--stackptr];
     535             :             /* Hangul Decomposition adds three characters in
     536             :                a single step, so we need at least that much room. */
     537      443606 :             if (space < 3) {
     538             :                 Py_UCS4 *new_output;
     539      101096 :                 osize += 10; /* grow the buffer by ten elements */
     540      101096 :                 space += 10;
     541      101096 :                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
     542      101096 :                 if (new_output == NULL) {
     543           0 :                     PyMem_Free(output); /* the old buffer is still owned here when realloc fails */
     544           0 :                     PyErr_NoMemory();
     545           0 :                     return NULL;
     546             :                 }
     547      101096 :                 output = new_output;
     548             :             }
     549             :             /* Hangul Decomposition. */
     550      443606 :             if (SBase <= code && code < (SBase+SCount)) {
     551       68511 :                 int SIndex = code - SBase;
     552       68511 :                 int L = LBase + SIndex / NCount;
     553       68511 :                 int V = VBase + (SIndex % NCount) / TCount;
     554       68511 :                 int T = TBase + SIndex % TCount;
     555       68511 :                 output[o++] = L;
     556       68511 :                 output[o++] = V;
     557       68511 :                 space -= 2;
     558       68511 :                 if (T != TBase) { /* T == TBase: there is no third code point */
     559       64657 :                     output[o++] = T;
     560       64657 :                     space --;
     561             :                 }
     562       68511 :                 continue;
     563             :             }
     564             :             /* normalization changes */
     565      375095 :             if (UCD_Check(self)) {
     566         891 :                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
     567         891 :                 if (value != 0) {
     568           0 :                     stack[stackptr++] = value; /* decompose the replacement instead of code */
     569           0 :                     continue;
     570             :                 }
     571             :             }
     572             : 
     573             :             /* Other decompositions. */
     574      375095 :             get_decomp_record(self, code, &index, &prefix, &count);
     575             : 
     576             :             /* Copy character if it is not decomposable, or has a
     577             :                compatibility decomposition, but we do NFD. */
     578      375095 :             if (!count || (prefix && !k)) {
     579      334643 :                 output[o++] = code;
     580      334643 :                 space--;
     581      334643 :                 continue;
     582             :             }
     583             :             /* Copy decomposition onto the stack, in reverse
     584             :                order.  */
     585      104169 :             while(count) {
     586       63717 :                 code = decomp_data[index + (--count)];
     587       63717 :                 stack[stackptr++] = code; /* popped later, so the first value is processed first */
     588             :             }
     589             :         }
     590             :     }
     591             : /* Build the result string; the scratch buffer is no longer needed afterwards. */
     592      173824 :     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
     593             :                                        output, o);
     594      173824 :     PyMem_Free(output);
     595      173824 :     if (!result)
     596           0 :         return NULL;
     597             :     /* result is guaranteed to be ready, as it is compact. */
     598      173824 :     kind = PyUnicode_KIND(result);
     599      173824 :     data = PyUnicode_DATA(result);
     600             : 
     601             :     /* Sort canonically. */
     602      173824 :     i = 0;
     603      173824 :     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
     604      536322 :     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
     605      362498 :         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
     606      362498 :         if (prev == 0 || cur == 0 || prev <= cur) { /* already ordered; class-0 characters are never swapped */
     607      341913 :             prev = cur;
     608      341913 :             continue;
     609             :         }
     610             :         /* Non-canonical order. Need to switch *i with previous. */
     611       20585 :         o = i - 1; /* o is reused as the index of the left element of each swap */
     612       10580 :         while (1) {
     613       31165 :             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
     614       31165 :             PyUnicode_WRITE(kind, data, o+1,
     615             :                             PyUnicode_READ(kind, data, o));
     616       31165 :             PyUnicode_WRITE(kind, data, o, tmp);
     617       31165 :             o--;
     618       31165 :             if (o < 0) /* reached the start of the string */
     619          12 :                 break;
     620       31153 :             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
     621       31153 :             if (prev == 0 || prev <= cur)
     622             :                 break;
     623             :         }
     624       20585 :         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; /* re-read: position i holds a different character after the swaps */
     625             :     }
     626      173824 :     return result;
     627             : }
     628             : 
     629             : static int
     630      177057 : find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
     631             : {
     632             :     unsigned int index;
     633     8360920 :     for (index = 0; nfc[index].start; index++) {
     634     8355210 :         unsigned int start = nfc[index].start;
     635     8355210 :         if (code < start)
     636      114064 :             return -1;
     637     8241150 :         if (code <= start + nfc[index].count) {
     638       57291 :             unsigned int delta = code - start;
     639       57291 :             return nfc[index].index + delta;
     640             :         }
     641             :     }
     642        5702 :     return -1;
     643             : }
     644             : 
/* Compose `input` after decomposing it.
 *
 * The string is first decomposed with nfd_nfkd(self, input, k); `k` is
 * passed through unchanged and selects the compatibility variant there.
 * The decomposed string is then scanned once and combinable sequences
 * (Hangul jamo and table-driven pairs) are merged.
 *
 * Returns a new reference: the decomposed string itself when the scan
 * composed nothing, otherwise a freshly built string.  Returns NULL when
 * the decomposition or an allocation fails.
 */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    /* i: read position in the decomposed string, i1: look-ahead position,
       o: write position in `output`. */
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions ahead of `i` that were already merged into an earlier
       character and must not be emitted again. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        /* 0 is the null pointer here, i.e. the same as returning NULL. */
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              /* Removal overwrites the slot with the last entry, so the
                 list is unordered. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      code = PyUnicode_READ(kind, data, i);
      if (LBase <= code && code < (LBase+LCount) &&
          i + 1 < len &&
          VBase <= PyUnicode_READ(kind, data, i+1) &&
          PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
             and V character is a modern vowel (0x1161 ~ 0x1175). */
          int LIndex, VIndex;
          LIndex = code - LBase;
          VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
          /* Syllable for the L+V pair without a trailing consonant. */
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          /* The comparison against TBase is strict: TBase itself is not
             accepted as a trailing consonant. */
          if (i < len &&
              TBase < PyUnicode_READ(kind, data, i) &&
              PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
              /* check T character is a modern trailing consonant
                 (0x11A8 ~ 0x11C2). */
              code += PyUnicode_READ(kind, data, i)-TBase;
              i++;
          }
          output[o++] = code;
          continue;
      }

      /* code is still input[i] here */
      /* f == -1: this character is not listed in nfc_first, so it cannot
         start a table-driven composition and is copied unchanged. */
      f = find_nfc_index(nfc_first, code);
      if (f == -1) {
          output[o++] = code;
          i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      /* comb: combining class of the most recent character that was
         passed over without being merged; 0 while there is none. */
      comb = 0;
      /* output base character for now; might be updated later. */
      output[o] = PyUnicode_READ(kind, data, i);
      while (i1 < len) {
          Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
          int comb1 = _getrecord_ex(code1)->combining;
          if (comb) {
              /* Something was already passed over: a class-0 character
                 ends the search, and a character whose class is not
                 higher than `comb` is stepped over. */
              if (comb1 == 0)
                  break;
              if (comb >= comb1) {
                  /* Character is blocked. */
                  i1++;
                  continue;
              }
          }
          l = find_nfc_index(nfc_last, code1);
          /* i1 cannot be combined with i. If i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            /* Also entered by goto below when the (f, l) pair has no
               composite in the table. */
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          /* Two-level lookup of the composite for the pair (f, l);
             a zero entry means the pair does not compose. */
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          output[o] = code;
          /* Mark the second character unused. */
          /* NOTE(review): assert() is compiled out when NDEBUG is defined,
             so this bound is unchecked in such builds -- confirm that no
             input can queue more than 20 pending positions. */
          assert(cskipped < 20);
          skipped[cskipped++] = i1;
          i1++;
          /* Keep scanning only if the composite can itself start another
             composition. */
          f = find_nfc_index(nfc_first, output[o]);
          if (f == -1)
              break;
      }
      /* Output character was already written.
         Just advance the indices. */
      o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    /* Something was composed: drop the decomposed string and build the
       result from the first `o` entries of the output buffer. */
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
     783             : 
     784             : // This needs to match the logic in makeunicodedata.py
     785             : // which constructs the quickcheck data.
     786             : typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
     787             : 
     788             : /* Run the Unicode normalization "quickcheck" algorithm.
     789             :  *
     790             :  * Return YES or NO if quickcheck determines the input is certainly
     791             :  * normalized or certainly not, and MAYBE if quickcheck is unable to
     792             :  * tell.
     793             :  *
     794             :  * If `yes_only` is true, then return MAYBE as soon as we determine
     795             :  * the answer is not YES.
     796             :  *
     797             :  * For background and details on the algorithm, see UAX #15:
     798             :  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
     799             :  */
     800             : static QuickcheckResult
     801     4884420 : is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
     802             :                          bool yes_only)
     803             : {
     804             :     /* UCD 3.2.0 is requested, quickchecks must be disabled. */
     805     4884420 :     if (UCD_Check(self)) {
     806         490 :         return NO;
     807             :     }
     808             : 
     809     4883930 :     if (PyUnicode_IS_ASCII(input)) {
     810       17450 :         return YES;
     811             :     }
     812             : 
     813             :     Py_ssize_t i, len;
     814             :     int kind;
     815             :     const void *data;
     816     4866480 :     unsigned char prev_combining = 0;
     817             : 
     818             :     /* The two quickcheck bits at this shift have type QuickcheckResult. */
     819     4866480 :     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
     820             : 
     821     4866480 :     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
     822             : 
     823     4866480 :     i = 0;
     824     4866480 :     kind = PyUnicode_KIND(input);
     825     4866480 :     data = PyUnicode_DATA(input);
     826     4866480 :     len = PyUnicode_GET_LENGTH(input);
     827     9995350 :     while (i < len) {
     828     5299100 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
     829     5299100 :         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
     830             : 
     831     5299100 :         unsigned char combining = record->combining;
     832     5299100 :         if (combining && prev_combining > combining)
     833        7052 :             return NO; /* non-canonical sort order, not normalized */
     834     5292050 :         prev_combining = combining;
     835             : 
     836     5292050 :         unsigned char quickcheck_whole = record->normalization_quick_check;
     837     5292050 :         if (yes_only) {
     838     5052660 :             if (quickcheck_whole & (3 << quickcheck_shift))
     839      163182 :                 return MAYBE;
     840             :         } else {
     841      239385 :             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
     842           0 :             case NO:
     843           0 :               return NO;
     844        3384 :             case MAYBE:
     845        3384 :               result = MAYBE; /* this string might need normalization */
     846             :             }
     847             :         }
     848             :     }
     849     4696250 :     return result;
     850             : }
     851             : 
     852             : /*[clinic input]
     853             : unicodedata.UCD.is_normalized
     854             : 
     855             :     self: self
     856             :     form: unicode
     857             :     unistr as input: unicode
     858             :     /
     859             : 
     860             : Return whether the Unicode string unistr is in the normal form 'form'.
     861             : 
     862             : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
     863             : [clinic start generated code]*/
     864             : 
     865             : static PyObject *
     866      113952 : unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
     867             :                                    PyObject *input)
     868             : /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
     869             : {
     870      113952 :     if (PyUnicode_READY(input) == -1) {
     871           0 :         return NULL;
     872             :     }
     873             : 
     874      113952 :     if (PyUnicode_GET_LENGTH(input) == 0) {
     875             :         /* special case empty input strings. */
     876           0 :         Py_RETURN_TRUE;
     877             :     }
     878             : 
     879             :     PyObject *result;
     880      113952 :     bool nfc = false;
     881      113952 :     bool k = false;
     882             :     QuickcheckResult m;
     883             : 
     884             :     PyObject *cmp;
     885      113952 :     int match = 0;
     886             : 
     887      113952 :     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
     888       37984 :         nfc = true;
     889             :     }
     890       75968 :     else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
     891       18992 :         nfc = true;
     892       18992 :         k = true;
     893             :     }
     894       56976 :     else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
     895             :         /* matches default values for `nfc` and `k` */
     896             :     }
     897       18992 :     else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
     898       18992 :         k = true;
     899             :     }
     900             :     else {
     901           0 :         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     902           0 :         return NULL;
     903             :     }
     904             : 
     905      113952 :     m = is_normalized_quickcheck(self, input, nfc, k, false);
     906             : 
     907      113952 :     if (m == MAYBE) {
     908        3100 :         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
     909        3100 :         if (cmp == NULL) {
     910           0 :             return NULL;
     911             :         }
     912        3100 :         match = PyUnicode_Compare(input, cmp);
     913        3100 :         Py_DECREF(cmp);
     914        3100 :         result = (match == 0) ? Py_True : Py_False;
     915             :     }
     916             :     else {
     917      110852 :         result = (m == YES) ? Py_True : Py_False;
     918             :     }
     919             : 
     920      113952 :     Py_INCREF(result);
     921      113952 :     return result;
     922             : }
     923             : 
     924             : 
     925             : /*[clinic input]
     926             : unicodedata.UCD.normalize
     927             : 
     928             :     self: self
     929             :     form: unicode
     930             :     unistr as input: unicode
     931             :     /
     932             : 
     933             : Return the normal form 'form' for the Unicode string unistr.
     934             : 
     935             : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
     936             : [clinic start generated code]*/
     937             : 
     938             : static PyObject *
     939     4770480 : unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     940             :                                PyObject *input)
     941             : /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
     942             : {
     943     4770480 :     if (PyUnicode_GET_LENGTH(input) == 0) {
     944             :         /* Special case empty input strings, since resizing
     945             :            them  later would cause internal errors. */
     946           3 :         Py_INCREF(input);
     947           3 :         return input;
     948             :     }
     949             : 
     950     4770470 :     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
     951     1192320 :         if (is_normalized_quickcheck(self, input,
     952             :                                      true,  false, true) == YES) {
     953     1159500 :             Py_INCREF(input);
     954     1159500 :             return input;
     955             :         }
     956       32819 :         return nfc_nfkc(self, input, 0);
     957             :     }
     958     3578150 :     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
     959     1193510 :         if (is_normalized_quickcheck(self, input,
     960             :                                      true,  true,  true) == YES) {
     961     1148820 :             Py_INCREF(input);
     962     1148820 :             return input;
     963             :         }
     964       44693 :         return nfc_nfkc(self, input, 1);
     965             :     }
     966     2384640 :     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
     967     1192330 :         if (is_normalized_quickcheck(self, input,
     968             :                                      false, false, true) == YES) {
     969     1151340 :             Py_INCREF(input);
     970     1151340 :             return input;
     971             :         }
     972       40986 :         return nfd_nfkd(self, input, 0);
     973             :     }
     974     1192320 :     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
     975     1192310 :         if (is_normalized_quickcheck(self, input,
     976             :                                      false, true,  true) == YES) {
     977     1140090 :             Py_INCREF(input);
     978     1140090 :             return input;
     979             :         }
     980       52226 :         return nfd_nfkd(self, input, 1);
     981             :     }
     982           1 :     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     983           1 :     return NULL;
     984             : }
     985             : 
     986             : /* -------------------------------------------------------------------- */
     987             : /* unicode character name tables */
     988             : 
     989             : /* data file generated by Tools/unicode/makeunicodedata.py */
     990             : #include "unicodename_db.h"
     991             : 
     992             : /* -------------------------------------------------------------------- */
     993             : /* database code (cut and pasted from the unidb package) */
     994             : 
     995             : static unsigned long
     996       18410 : _gethash(const char *s, int len, int scale)
     997             : {
     998             :     int i;
     999       18410 :     unsigned long h = 0;
    1000             :     unsigned long ix;
    1001      588990 :     for (i = 0; i < len; i++) {
    1002      570580 :         h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
    1003      570580 :         ix = h & 0xff000000;
    1004      570580 :         if (ix)
    1005      488798 :             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    1006             :     }
    1007       18410 :     return h;
    1008             : }
    1009             : 
    1010             : static const char * const hangul_syllables[][3] = {
    1011             :     { "G",  "A",   ""   },
    1012             :     { "GG", "AE",  "G"  },
    1013             :     { "N",  "YA",  "GG" },
    1014             :     { "D",  "YAE", "GS" },
    1015             :     { "DD", "EO",  "N", },
    1016             :     { "R",  "E",   "NJ" },
    1017             :     { "M",  "YEO", "NH" },
    1018             :     { "B",  "YE",  "D"  },
    1019             :     { "BB", "O",   "L"  },
    1020             :     { "S",  "WA",  "LG" },
    1021             :     { "SS", "WAE", "LM" },
    1022             :     { "",   "OE",  "LB" },
    1023             :     { "J",  "YO",  "LS" },
    1024             :     { "JJ", "U",   "LT" },
    1025             :     { "C",  "WEO", "LP" },
    1026             :     { "K",  "WE",  "LH" },
    1027             :     { "T",  "WI",  "M"  },
    1028             :     { "P",  "YU",  "B"  },
    1029             :     { "H",  "EU",  "BS" },
    1030             :     { 0,    "YI",  "S"  },
    1031             :     { 0,    "I",   "SS" },
    1032             :     { 0,    0,     "NG" },
    1033             :     { 0,    0,     "J"  },
    1034             :     { 0,    0,     "C"  },
    1035             :     { 0,    0,     "K"  },
    1036             :     { 0,    0,     "T"  },
    1037             :     { 0,    0,     "P"  },
    1038             :     { 0,    0,     "H"  }
    1039             : };
    1040             : 
    1041             : /* These ranges need to match makeunicodedata.py:cjk_ranges. */
    1042             : static int
    1043      127288 : is_unified_ideograph(Py_UCS4 code)
    1044             : {
    1045             :     return
    1046      127288 :         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
    1047      114102 :         (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
    1048       72107 :         (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
    1049       72105 :         (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
    1050       72103 :         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
    1051       72101 :         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
    1052      258757 :         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
    1053        4181 :         (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
    1054             : }
    1055             : 
    1056             : /* macros used to determine if the given code point is in the PUA range that
    1057             :  * we are using to store aliases and named sequences */
    1058             : #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
    1059             : #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
    1060             :                           (cp < named_sequences_end))
    1061             : 
    1062             : static int
    1063      112952 : _getucname(PyObject *self,
    1064             :            Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
    1065             : {
    1066             :     /* Find the name associated with the given code point.
    1067             :      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
    1068             :      * that we are using for aliases and named sequences. */
    1069             :     int offset;
    1070             :     int i;
    1071             :     int word;
    1072             :     const unsigned char* w;
    1073             : 
    1074      112952 :     if (code >= 0x110000)
    1075           0 :         return 0;
    1076             : 
    1077             :     /* XXX should we just skip all the code points in the PUAs here? */
    1078      112952 :     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
    1079         931 :         return 0;
    1080             : 
    1081      112021 :     if (UCD_Check(self)) {
    1082             :         /* in 3.2.0 there are no aliases and named sequences */
    1083             :         const change_record *old;
    1084        1580 :         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
    1085         524 :             return 0;
    1086        1056 :         old = get_old_record(self, code);
    1087        1056 :         if (old->category_changed == 0) {
    1088             :             /* unassigned */
    1089         632 :             return 0;
    1090             :         }
    1091             :     }
    1092             : 
    1093      110865 :     if (SBase <= code && code < SBase+SCount) {
    1094             :         /* Hangul syllable. */
    1095       11172 :         int SIndex = code - SBase;
    1096       11172 :         int L = SIndex / NCount;
    1097       11172 :         int V = (SIndex % NCount) / TCount;
    1098       11172 :         int T = SIndex % TCount;
    1099             : 
    1100       11172 :         if (buflen < 27)
    1101             :             /* Worst case: HANGUL SYLLABLE <10chars>. */
    1102           0 :             return 0;
    1103       11172 :         strcpy(buffer, "HANGUL SYLLABLE ");
    1104       11172 :         buffer += 16;
    1105       11172 :         strcpy(buffer, hangul_syllables[L][0]);
    1106       11172 :         buffer += strlen(hangul_syllables[L][0]);
    1107       11172 :         strcpy(buffer, hangul_syllables[V][1]);
    1108       11172 :         buffer += strlen(hangul_syllables[V][1]);
    1109       11172 :         strcpy(buffer, hangul_syllables[T][2]);
    1110       11172 :         buffer += strlen(hangul_syllables[T][2]);
    1111       11172 :         *buffer = '\0';
    1112       11172 :         return 1;
    1113             :     }
    1114             : 
    1115       99693 :     if (is_unified_ideograph(code)) {
    1116       27593 :         if (buflen < 28)
    1117             :             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
    1118           0 :             return 0;
    1119       27593 :         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
    1120       27593 :         return 1;
    1121             :     }
    1122             : 
    1123             :     /* get offset into phrasebook */
    1124       72100 :     offset = phrasebook_offset1[(code>>phrasebook_shift)];
    1125       72100 :     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
    1126       72100 :                                (code&((1<<phrasebook_shift)-1))];
    1127       72100 :     if (!offset)
    1128       13168 :         return 0;
    1129             : 
    1130       58932 :     i = 0;
    1131             : 
    1132             :     for (;;) {
    1133             :         /* get word index */
    1134      229772 :         word = phrasebook[offset] - phrasebook_short;
    1135      229772 :         if (word >= 0) {
    1136       80778 :             word = (word << 8) + phrasebook[offset+1];
    1137       80778 :             offset += 2;
    1138             :         } else
    1139      148994 :             word = phrasebook[offset++];
    1140      229772 :         if (i) {
    1141      170840 :             if (i > buflen)
    1142           0 :                 return 0; /* buffer overflow */
    1143      170840 :             buffer[i++] = ' ';
    1144             :         }
    1145             :         /* copy word string from lexicon.  the last character in the
    1146             :            word has bit 7 set.  the last word in a string ends with
    1147             :            0x80 */
    1148      229772 :         w = lexicon + lexicon_offset[word];
    1149     1297320 :         while (*w < 128) {
    1150     1067550 :             if (i >= buflen)
    1151           0 :                 return 0; /* buffer overflow */
    1152     1067550 :             buffer[i++] = *w++;
    1153             :         }
    1154      229772 :         if (i >= buflen)
    1155           0 :             return 0; /* buffer overflow */
    1156      229772 :         buffer[i++] = *w & 127;
    1157      229772 :         if (*w == 128)
    1158       58932 :             break; /* end of word */
    1159             :     }
    1160             : 
    1161       58932 :     return 1;
    1162             : }
    1163             : 
    1164             : static int
    1165       20060 : capi_getucname(Py_UCS4 code,
    1166             :                char* buffer, int buflen,
    1167             :                int with_alias_and_seq)
    1168             : {
    1169       20060 :     return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
    1170             : 
    1171             : }
    1172             : 
    1173             : static int
    1174       23215 : _cmpname(PyObject *self, int code, const char* name, int namelen)
    1175             : {
    1176             :     /* check if code corresponds to the given name */
    1177             :     int i;
    1178             :     char buffer[NAME_MAXLEN+1];
    1179       23215 :     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
    1180        1156 :         return 0;
    1181      483111 :     for (i = 0; i < namelen; i++) {
    1182      465185 :         if (Py_TOUPPER(name[i]) != buffer[i])
    1183        4133 :             return 0;
    1184             :     }
    1185       17926 :     return buffer[namelen] == '\0';
    1186             : }
    1187             : 
    1188             : static void
    1189       33555 : find_syllable(const char *str, int *len, int *pos, int count, int column)
    1190             : {
    1191             :     int i, len1;
    1192       33555 :     *len = -1;
    1193      794135 :     for (i = 0; i < count; i++) {
    1194      760580 :         const char *s = hangul_syllables[i][column];
    1195      760580 :         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
    1196      760580 :         if (len1 <= *len)
    1197      282823 :             continue;
    1198      477757 :         if (strncmp(str, s, len1) == 0) {
    1199       59655 :             *len = len1;
    1200       59655 :             *pos = i;
    1201             :         }
    1202             :     }
    1203       33555 :     if (*len == -1) {
    1204           0 :         *len = 0;
    1205             :     }
    1206       33555 : }
    1207             : 
    1208             : static int
    1209       17926 : _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
    1210             : {
                           /* Post-process a code point found in the name hash table.
                            *
                            * Returns 0 (leaving *code untouched) when cp denotes a named
                            * sequence and with_named_seq is 0.  Otherwise stores the final
                            * value in *code and returns 1: an alias entry is replaced by the
                            * code point recorded in name_aliases[], anything else (including
                            * an allowed named-sequence value) is stored unchanged. */
    1211             :     /* check if named sequences are allowed */
    1212       17926 :     if (!with_named_seq && IS_NAMED_SEQ(cp))
    1213         466 :         return 0;
    1214             :     /* if the code point is in the PUA range that we use for aliases,
    1215             :      * convert it to obtain the right code point */
    1216       17460 :     if (IS_ALIAS(cp))
    1217          22 :         *code = name_aliases[cp-aliases_start];
    1218             :     else
    1219       17438 :         *code = cp;
    1220       17460 :     return 1;
    1221             : }
    1222             : 
    1223             : static int
    1224       57190 : _getcode(PyObject* self,
    1225             :          const char* name, int namelen, Py_UCS4* code, int with_named_seq)
    1226             : {
    1227             :     /* Return the code point associated with the given name.
    1228             :      * Named aliases are resolved too (unless self != NULL (i.e. we are using
    1229             :      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
    1230             :      * using for the named sequence, and the caller must then convert it. */
                           /* Returns 1 and stores the result in *code on success, 0 when the
                            * name is not recognised (in which case *code is not written).
                            * The name is tried against three sources, in this order:
                            *   1. algorithmic "HANGUL SYLLABLE " names,
                            *   2. algorithmic "CJK UNIFIED IDEOGRAPH-" names,
                            *   3. the code_hash[] table.
                            * A name carrying one of the two prefixes never falls through to
                            * the hash table: those branches always return. */
    1231             :     unsigned int h, v;
                           /* presumably code_size is a power of two, making this a bit mask
                            * for table indices -- confirm against the generated tables */
    1232       57190 :     unsigned int mask = code_size-1;
    1233             :     unsigned int i, incr;
    1234             : 
    1235             :     /* Check for hangul syllables. */
    1236       57190 :     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
                               /* L, V and T start at -1; find_syllable() only writes the
                                * index when it finds a match, so -1 afterwards means that
                                * part of the name matched no entry in its column. */
    1237       11185 :         int len, L = -1, V = -1, T = -1;
    1238       11185 :         const char *pos = name + 16;
                               /* Consume the three parts in sequence (columns 0, 1, 2 of
                                * hangul_syllables), advancing by each matched length. */
    1239       11185 :         find_syllable(pos, &len, &L, LCount, 0);
    1240       11185 :         pos += len;
    1241       11185 :         find_syllable(pos, &len, &V, VCount, 1);
    1242       11185 :         pos += len;
    1243       11185 :         find_syllable(pos, &len, &T, TCount, 2);
    1244       11185 :         pos += len;
                               /* All three parts must match and together consume exactly
                                * namelen bytes (no trailing characters). */
    1245       11185 :         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
    1246       11185 :             *code = SBase + (L*VCount+V)*TCount + T;
    1247       11185 :             return 1;
    1248             :         }
    1249             :         /* Otherwise, it's an illegal syllable name. */
    1250           0 :         return 0;
    1251             :     }
    1252             : 
    1253             :     /* Check for unified ideographs. */
    1254       46005 :     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
    1255             :         /* Four or five hexdigits must follow. */
    1256       27595 :         v = 0;
    1257       27595 :         name += 22;
    1258       27595 :         namelen -= 22;
    1259       27595 :         if (namelen != 4 && namelen != 5)
    1260           0 :             return 0;
                               /* Accumulate the hex value in v; only '0'-'9' and upper-case
                                * 'A'-'F' are accepted, any other byte rejects the name. */
    1261      137982 :         while (namelen--) {
    1262      110387 :             v *= 16;
    1263      110387 :             if (*name >= '0' && *name <= '9')
    1264       79037 :                 v += *name - '0';
    1265       31350 :             else if (*name >= 'A' && *name <= 'F')
    1266       31350 :                 v += *name - 'A' + 10;
    1267             :             else
    1268           0 :                 return 0;
    1269      110387 :             name++;
    1270             :         }
                               /* A well-formed number is still rejected unless it denotes a
                                * unified ideograph. */
    1271       27595 :         if (!is_unified_ideograph(v))
    1272           0 :             return 0;
    1273       27595 :         *code = v;
    1274       27595 :         return 1;
    1275             :     }
    1276             : 
    1277             :     /* the following is the same as python's dictionary lookup, with
    1278             :        only minor changes.  see the makeunicodedata script for more
    1279             :        details */
    1280             : 
                           /* First probe: slot derived from the inverted hash.  A zero entry
                            * in code_hash[] marks an empty slot, i.e. the name is absent. */
    1281       18410 :     h = (unsigned int) _gethash(name, namelen, code_magic);
    1282       18410 :     i = (~h) & mask;
    1283       18410 :     v = code_hash[i];
    1284       18410 :     if (!v)
    1285           2 :         return 0;
                           /* The slot holds a candidate code point; _cmpname() regenerates
                            * its name to confirm that it really is the requested one. */
    1286       18408 :     if (_cmpname(self, v, name, namelen)) {
    1287       15274 :         return _check_alias_and_seq(v, code, with_named_seq);
    1288             :     }
                           /* Collision: derive a non-zero step from the hash and keep
                            * probing. */
    1289        3134 :     incr = (h ^ (h >> 3)) & mask;
    1290        3134 :     if (!incr)
    1291           0 :         incr = mask;
                           /* The loop has no exit condition of its own; it ends only by
                            * returning on an empty slot (miss) or on a confirmed match. */
    1292             :     for (;;) {
    1293        5289 :         i = (i + incr) & mask;
    1294        5289 :         v = code_hash[i];
    1295        5289 :         if (!v)
    1296         482 :             return 0;
    1297        4807 :         if (_cmpname(self, v, name, namelen)) {
    1298        2652 :             return _check_alias_and_seq(v, code, with_named_seq);
    1299             :         }
                               /* Update the step for the next probe: double it and, once it
                                * exceeds the mask, fold it back by XOR-ing with code_poly. */
    1300        2155 :         incr = incr << 1;
    1301        2155 :         if (incr > mask)
    1302        1139 :             incr = incr ^ code_poly;
    1303             :     }
    1304             : }
    1305             : 
    1306             : static int
    1307         620 : capi_getcode(const char* name, int namelen, Py_UCS4* code,
    1308             :              int with_named_seq)
    1309             : {
                           /* Name -> code point lookup exported through the capsule's
                            * `getcode` slot (see unicodedata_create_capi()).  Forwards to
                            * _getcode() with self == NULL and returns its 1/0 result. */
    1310         620 :     return _getcode(NULL, name, namelen, code, with_named_seq);
    1311             : 
    1312             : }
    1313             : 
    1314             : static void
    1315          81 : unicodedata_destroy_capi(PyObject *capsule)
    1316             : {
                           /* Capsule destructor registered by unicodedata_create_capi():
                            * releases the _PyUnicode_Name_CAPI block that was allocated with
                            * PyMem_Malloc() and stored in the capsule under
                            * PyUnicodeData_CAPSULE_NAME. */
    1317          81 :     void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
    1318          81 :     PyMem_Free(capi);
    1319          81 : }
    1320             : 
    1321             : static PyObject *
    1322          81 : unicodedata_create_capi(void)
    1323             : {
                           /* Build the capsule that exposes the name database to other C
                            * code: a heap-allocated _PyUnicode_Name_CAPI whose `getname` and
                            * `getcode` slots point at capi_getucname() and capi_getcode().
                            *
                            * Returns a new reference to the capsule, or NULL with an
                            * exception set.  Once the capsule exists it owns the struct and
                            * frees it through unicodedata_destroy_capi(). */
    1324          81 :     _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
    1325          81 :     if (capi == NULL) {
    1326           0 :         PyErr_NoMemory();
    1327           0 :         return NULL;
    1328             :     }
    1329          81 :     capi->getname = capi_getucname;
    1330          81 :     capi->getcode = capi_getcode;
    1331             : 
    1332          81 :     PyObject *capsule = PyCapsule_New(capi,
    1333             :                                       PyUnicodeData_CAPSULE_NAME,
    1334             :                                       unicodedata_destroy_capi);
                           /* No capsule was created, so the destructor will never run:
                            * free the struct here to avoid leaking it. */
    1335          81 :     if (capsule == NULL) {
    1336           0 :         PyMem_Free(capi);
    1337             :     }
    1338          81 :     return capsule;
    1339             : }
    1340             : 
    1341             : 
    1342             : /* -------------------------------------------------------------------- */
    1343             : /* Python bindings */
    1344             : 
    1345             : /*[clinic input]
    1346             : unicodedata.UCD.name
    1347             : 
    1348             :     self: self
    1349             :     chr: int(accept={str})
    1350             :     default: object=NULL
    1351             :     /
    1352             : 
    1353             : Returns the name assigned to the character chr as a string.
    1354             : 
    1355             : If no name is defined, default is returned, or, if not given,
    1356             : ValueError is raised.
    1357             : [clinic start generated code]*/
    1358             : 
    1359             : static PyObject *
    1360       69677 : unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
    1361             : /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
    1362             : {
                           /* Implementation of name(chr[, default]).
                            *
                            * Returns a new str holding the name _getucname() produces for
                            * chr.  When _getucname() returns 0: raises ValueError if no
                            * default was supplied (default_value == NULL), otherwise returns
                            * a new reference to default_value. */
    1363             :     char name[NAME_MAXLEN+1];
    1364       69677 :     Py_UCS4 c = (Py_UCS4)chr;
    1365             : 
                           /* The trailing 0 is presumably _getucname()'s with_alias_and_seq
                            * flag (capi_getucname() forwards its flag in that position). */
    1366       69677 :     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
    1367       14065 :         if (default_value == NULL) {
    1368        4096 :             PyErr_SetString(PyExc_ValueError, "no such name");
    1369        4096 :             return NULL;
    1370             :         }
    1371             :         else {
                                   /* The caller receives its own reference to the default. */
    1372        9969 :             Py_INCREF(default_value);
    1373        9969 :             return default_value;
    1374             :         }
    1375             :     }
    1376             : 
    1377       55612 :     return PyUnicode_FromString(name);
    1378             : }
    1379             : 
    1380             : /*[clinic input]
    1381             : unicodedata.UCD.lookup
    1382             : 
    1383             :     self: self
    1384             :     name: str(accept={str, robuffer}, zeroes=True)
    1385             :     /
    1386             : 
    1387             : Look up character by name.
    1388             : 
    1389             : If a character with the given name is found, return the
    1390             : corresponding character.  If not found, KeyError is raised.
    1391             : [clinic start generated code]*/
    1392             : 
    1393             : static PyObject *
    1394       56570 : unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
    1395             :                             Py_ssize_t name_length)
    1396             : /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
    1397             : {
                           /* Implementation of lookup(name).
                            *
                            * Returns a new str: the single character found for `name`, or the
                            * whole sequence when the name denotes a named sequence.  Raises
                            * KeyError when the name is longer than NAME_MAXLEN or is not
                            * found by _getcode(). */
    1398             :     Py_UCS4 code;
    1399             :     unsigned int index;
                           /* This bound also makes the (int) cast of name_length below safe
                            * and keeps _cmpname()'s index into its NAME_MAXLEN+1 buffer in
                            * range. */
    1400       56570 :     if (name_length > NAME_MAXLEN) {
    1401           0 :         PyErr_SetString(PyExc_KeyError, "name too long");
    1402           0 :         return NULL;
    1403             :     }
    1404             : 
                           /* with_named_seq == 1: named sequences are accepted here and come
                            * back as a value in the range tested by IS_NAMED_SEQ() below. */
    1405       56570 :     if (!_getcode(self, name, (int)name_length, &code, 1)) {
                               /* NOTE(review): '%s' stops at the first NUL byte, while the
                                * clinic declaration for `name` uses zeroes=True; a name with
                                * an embedded NUL would be shown truncated in the message. */
    1406         481 :         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
    1407         481 :         return NULL;
    1408             :     }
    1409             :     /* check if code is in the PUA range that we use for named sequences
    1410             :        and convert it */
    1411       56089 :     if (IS_NAMED_SEQ(code)) {
                               /* Build the result from the stored sequence, which is handed
                                * to PyUnicode_FromKindAndData() as 2-byte units. */
    1412         468 :         index = code-named_sequences_start;
    1413         468 :         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
    1414         468 :                                          named_sequences[index].seq,
    1415         468 :                                          named_sequences[index].seqlen);
    1416             :     }
    1417       55621 :     return PyUnicode_FromOrdinal(code);
    1418             : }
    1419             : 
    1420             : // List of functions used to define module functions *AND* unicodedata.UCD
    1421             : // methods. For module functions, self is the module. For UCD methods, self
    1422             : // is a UCD instance. The UCD_Check() macro is used to check if self is
    1423             : // a UCD instance.
                       // The table is referenced twice in this file: by ucd_type_slots
                       // (Py_tp_methods) and by unicodedata_module (.m_methods).
    1424             : static PyMethodDef unicodedata_functions[] = {
    1425             :     UNICODEDATA_UCD_DECIMAL_METHODDEF
    1426             :     UNICODEDATA_UCD_DIGIT_METHODDEF
    1427             :     UNICODEDATA_UCD_NUMERIC_METHODDEF
    1428             :     UNICODEDATA_UCD_CATEGORY_METHODDEF
    1429             :     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    1430             :     UNICODEDATA_UCD_COMBINING_METHODDEF
    1431             :     UNICODEDATA_UCD_MIRRORED_METHODDEF
    1432             :     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    1433             :     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    1434             :     UNICODEDATA_UCD_NAME_METHODDEF
    1435             :     UNICODEDATA_UCD_LOOKUP_METHODDEF
    1436             :     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    1437             :     UNICODEDATA_UCD_NORMALIZE_METHODDEF
    1438             :     {NULL, NULL}                /* sentinel */
    1439             : };
    1440             : 
    1441             : static int
    1442        9947 : ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
    1443             : {
                           /* tp_traverse slot for the UCD type.  The only object reported to
                            * the garbage collector is the instance's type, the same object
                            * whose reference ucd_dealloc() releases. */
    1444        9947 :     Py_VISIT(Py_TYPE(self));
    1445        9947 :     return 0;
    1446             : }
    1447             : 
    1448             : static void
    1449          81 : ucd_dealloc(PreviousDBVersion *self)
    1450             : {
                           /* tp_dealloc slot for the UCD type.  The type pointer is read
                            * before the instance memory is released, because the reference
                            * to the type is dropped only after the instance is freed. */
    1451          81 :     PyTypeObject *tp = Py_TYPE(self);
    1452          81 :     PyObject_GC_UnTrack(self);
    1453          81 :     PyObject_GC_Del(self);
    1454          81 :     Py_DECREF(tp);
    1455          81 : }
    1456             : 
    1457             : static PyType_Slot ucd_type_slots[] = {
                           /* Slot table consumed by ucd_type_spec.  Methods come from the
                            * same unicodedata_functions table the module itself uses; the
                            * {0, 0} entry terminates the list. */
    1458             :     {Py_tp_dealloc, ucd_dealloc},
    1459             :     {Py_tp_traverse, ucd_traverse},
    1460             :     {Py_tp_getattro, PyObject_GenericGetAttr},
    1461             :     {Py_tp_methods, unicodedata_functions},
    1462             :     {Py_tp_members, DB_members},
    1463             :     {0, 0}
    1464             : };
    1465             : 
    1466             : static PyType_Spec ucd_type_spec = {
                           /* Specification passed to PyType_FromSpec() in unicodedata_exec()
                            * to create the unicodedata.UCD type.  Instances are
                            * PreviousDBVersion structs.  The flags mark the type as
                            * GC-enabled, immutable, and not instantiable by calling the
                            * type; unicodedata_exec() creates its instance through
                            * new_previous_version(). */
    1467             :     .name = "unicodedata.UCD",
    1468             :     .basicsize = sizeof(PreviousDBVersion),
    1469             :     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
    1470             :               Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
    1471             :     .slots = ucd_type_slots
    1472             : };
    1473             : 
    1474             : PyDoc_STRVAR(unicodedata_docstring, /* module docstring, used as .m_doc of unicodedata_module; UNIDATA_VERSION is spliced in by string-literal concatenation */
    1475             : "This module provides access to the Unicode Character Database which\n\
    1476             : defines character properties for all Unicode characters. The data in\n\
    1477             : this database is based on the UnicodeData.txt file version\n\
    1478             : " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
    1479             : \n\
    1480             : The module uses the same names and symbols as defined by the\n\
    1481             : UnicodeData File Format " UNIDATA_VERSION ".");
    1482             : 
    1483             : static int
    1484          81 : unicodedata_exec(PyObject *module)
    1485             : {
                           /* Py_mod_exec slot: populate a freshly created module object.
                            *
                            * Adds, in order: the `unidata_version` string constant, the UCD
                            * type, the `ucd_3_2_0` instance and the `_ucnhash_CAPI` capsule.
                            * Returns 0 on success and -1 on the first failure; every local
                            * reference still held at that point is released before
                            * returning. */
    1486          81 :     if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
    1487           0 :         return -1;
    1488             :     }
    1489             : 
    1490          81 :     PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
    1491          81 :     if (ucd_type == NULL) {
    1492           0 :         return -1;
    1493             :     }
    1494             : 
                           /* PyModule_AddType() does not take over our reference: it is
                            * released here on failure, or below once the type has been used
                            * to build the 3.2.0 instance. */
    1495          81 :     if (PyModule_AddType(module, ucd_type) < 0) {
    1496           0 :         Py_DECREF(ucd_type);
    1497           0 :         return -1;
    1498             :     }
    1499             : 
    1500             :     // Unicode database version 3.2.0 used by the IDNA encoding
    1501             :     PyObject *v;
    1502          81 :     v = new_previous_version(ucd_type, "3.2.0",
    1503             :                              get_change_3_2_0, normalization_3_2_0);
    1504          81 :     Py_DECREF(ucd_type);
    1505          81 :     if (v == NULL) {
    1506           0 :         return -1;
    1507             :     }
                           /* PyModule_AddObject() takes over the reference to v only when it
                            * succeeds, hence the explicit Py_DECREF on the failure path. */
    1508          81 :     if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
    1509           0 :         Py_DECREF(v);
    1510           0 :         return -1;
    1511             :     }
    1512             : 
    1513             :     /* Export C API */
    1514          81 :     PyObject *capsule = unicodedata_create_capi();
    1515          81 :     if (capsule == NULL) {
    1516           0 :         return -1;
    1517             :     }
                           /* PyModule_AddObjectRef() never takes over the caller's
                            * reference, so the capsule is released on success and failure
                            * alike. */
    1518          81 :     int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
    1519          81 :     Py_DECREF(capsule);
    1520          81 :     if (rc < 0) {
    1521           0 :         return -1;
    1522             :     }
    1523          81 :     return 0;
    1524             : }
    1525             : 
    1526             : static PyModuleDef_Slot unicodedata_slots[] = {
                           /* Module slots referenced by unicodedata_module.m_slots: the
                            * only entry runs unicodedata_exec() on the module object;
                            * {0, NULL} terminates the list. */
    1527             :     {Py_mod_exec, unicodedata_exec},
    1528             :     {0, NULL}
    1529             : };
    1530             : 
    1531             : static struct PyModuleDef unicodedata_module = {
                           /* Module definition handed to PyModuleDef_Init() by
                            * PyInit_unicodedata().  m_size is 0 (no per-module state is
                            * requested); the module-level functions come from the same
                            * unicodedata_functions table that provides the UCD methods. */
    1532             :     PyModuleDef_HEAD_INIT,
    1533             :     .m_name = "unicodedata",
    1534             :     .m_doc = unicodedata_docstring,
    1535             :     .m_size = 0,
    1536             :     .m_methods = unicodedata_functions,
    1537             :     .m_slots = unicodedata_slots,
    1538             : };
    1539             : 
    1540             : PyMODINIT_FUNC
    1541          81 : PyInit_unicodedata(void)
    1542             : {
                           /* Module entry point: returns the result of PyModuleDef_Init()
                            * on the module definition.  The module contents are added by
                            * unicodedata_exec(), which is registered in unicodedata_slots. */
    1543          81 :     return PyModuleDef_Init(&unicodedata_module);
    1544             : }
    1545             : 
    1546             : 
    1547             : /*
    1548             : Local variables:
    1549             : c-basic-offset: 4
    1550             : indent-tabs-mode: nil
    1551             : End:
    1552             : */

Generated by: LCOV version 1.14