Line data Source code
1 : /* ------------------------------------------------------------------------
2 :
3 : unicodedata -- Provides access to the Unicode database.
4 :
5 : The current version number is reported in the unidata_version constant.
6 :
7 : Written by Marc-Andre Lemburg (mal@lemburg.com).
8 : Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 : Modified by Martin v. Löwis (martin@v.loewis.de)
10 :
11 : Copyright (c) Corporation for National Research Initiatives.
12 :
13 : ------------------------------------------------------------------------ */
14 :
15 : #ifndef Py_BUILD_CORE_BUILTIN
16 : # define Py_BUILD_CORE_MODULE 1
17 : #endif
18 :
19 : #define PY_SSIZE_T_CLEAN
20 :
21 : #include "Python.h"
22 : #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
23 : #include "structmember.h" // PyMemberDef
24 :
25 : #include <stdbool.h>
26 :
27 : /*[clinic input]
28 : module unicodedata
29 : class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30 : [clinic start generated code]*/
31 : /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32 :
33 : /* character properties */
34 :
/* One row of per-character properties.  Rows live in
   _PyUnicode_Database_Records and are located with _getrecord_ex().
   The field order is part of the generated data layout. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* packed 2-bit quickcheck
                                                      fields; see
                                                      is_normalized_quickcheck() */
} _PyUnicode_DatabaseRecord;
46 :
/* Per-code-point differences between the current database and a previous
   database version, as returned by PreviousDBVersion.getrecord.

   The readers in this file treat 0xFF in a *_changed byte as "no change",
   and category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* NOTE(review): not read anywhere in the part of the file reviewed
       here -- confirm where (or whether) it is consumed. */
    const double numeric_changed;
} change_record;
56 :
57 : /* data file generated by Tools/unicode/makeunicodedata.py */
58 : #include "unicodedata_db.h"
59 :
60 : static const _PyUnicode_DatabaseRecord*
61 12644700 : _getrecord_ex(Py_UCS4 code)
62 : {
63 : int index;
64 12644700 : if (code >= 0x110000)
65 0 : index = 0;
66 : else {
67 12644700 : index = index1[(code>>SHIFT)];
68 12644700 : index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69 : }
70 :
71 12644700 : return &_PyUnicode_Database_Records[index];
72 : }
73 :
74 : /* ------------- Previous-version API ------------------------------------- */
/* Instance layout of an object that answers queries against an earlier
   version of the Unicode database. */
typedef struct previous_version {
    PyObject_HEAD
    /* Exposed read-only as the "unidata_version" attribute (see DB_members). */
    const char *name;
    /* Returns the change_record describing how a code point differed in
       this database version. */
    const change_record* (*getrecord)(Py_UCS4);
    /* Consulted by nfd_nfkd(): a nonzero result is decomposed in place of
       the original code point. */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
81 :
82 : #include "clinic/unicodedata.c.h"
83 :
/* Look up the change_record for code point `v` through the getrecord
   callback of a PreviousDBVersion object.  `self` must point to a
   PreviousDBVersion; callers establish that with UCD_Check() first.
   The argument is parenthesized so the cast applies to the whole
   expression passed in, not just its first operand. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
85 :
/* Member table: publishes PreviousDBVersion.name as the read-only string
   attribute "unidata_version". */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}  /* sentinel */
};
90 :
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// The argument is evaluated more than once, so it must not have side effects.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
96 :
97 : static PyObject*
98 81 : new_previous_version(PyTypeObject *ucd_type,
99 : const char*name, const change_record* (*getrecord)(Py_UCS4),
100 : Py_UCS4 (*normalization)(Py_UCS4))
101 : {
102 : PreviousDBVersion *self;
103 81 : self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104 81 : if (self == NULL)
105 0 : return NULL;
106 81 : self->name = name;
107 81 : self->getrecord = getrecord;
108 81 : self->normalization = normalization;
109 81 : PyObject_GC_Track(self);
110 81 : return (PyObject*)self;
111 : }
112 :
113 :
114 : /* --- Module API --------------------------------------------------------- */
115 :
116 : /*[clinic input]
117 : unicodedata.UCD.decimal
118 :
119 : self: self
120 : chr: int(accept={str})
121 : default: object=NULL
122 : /
123 :
124 : Converts a Unicode character into its equivalent decimal value.
125 :
126 : Returns the decimal value assigned to the character chr as integer.
127 : If no such value is defined, default is returned, or, if not given,
128 : ValueError is raised.
129 : [clinic start generated code]*/
130 :
131 : static PyObject *
132 1179660 : unicodedata_UCD_decimal_impl(PyObject *self, int chr,
133 : PyObject *default_value)
134 : /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135 : {
136 1179660 : int have_old = 0;
137 : long rc;
138 1179660 : Py_UCS4 c = (Py_UCS4)chr;
139 :
140 1179660 : if (UCD_Check(self)) {
141 0 : const change_record *old = get_old_record(self, c);
142 0 : if (old->category_changed == 0) {
143 : /* unassigned */
144 0 : have_old = 1;
145 0 : rc = -1;
146 : }
147 0 : else if (old->decimal_changed != 0xFF) {
148 0 : have_old = 1;
149 0 : rc = old->decimal_changed;
150 : }
151 : }
152 :
153 1179660 : if (!have_old)
154 1179660 : rc = Py_UNICODE_TODECIMAL(c);
155 1179660 : if (rc < 0) {
156 1178620 : if (default_value == NULL) {
157 1 : PyErr_SetString(PyExc_ValueError,
158 : "not a decimal");
159 1 : return NULL;
160 : }
161 : else {
162 1178620 : Py_INCREF(default_value);
163 1178620 : return default_value;
164 : }
165 : }
166 1032 : return PyLong_FromLong(rc);
167 : }
168 :
169 : /*[clinic input]
170 : unicodedata.UCD.digit
171 :
172 : self: self
173 : chr: int(accept={str})
174 : default: object=NULL
175 : /
176 :
177 : Converts a Unicode character into its equivalent digit value.
178 :
179 : Returns the digit value assigned to the character chr as integer.
180 : If no such value is defined, default is returned, or, if not given,
181 : ValueError is raised.
182 : [clinic start generated code]*/
183 :
184 : static PyObject *
185 1179660 : unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
186 : /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
187 : {
188 : long rc;
189 1179660 : Py_UCS4 c = (Py_UCS4)chr;
190 1179660 : rc = Py_UNICODE_TODIGIT(c);
191 1179660 : if (rc < 0) {
192 1178400 : if (default_value == NULL) {
193 1 : PyErr_SetString(PyExc_ValueError, "not a digit");
194 1 : return NULL;
195 : }
196 : else {
197 1178400 : Py_INCREF(default_value);
198 1178400 : return default_value;
199 : }
200 : }
201 1256 : return PyLong_FromLong(rc);
202 : }
203 :
204 : /*[clinic input]
205 : unicodedata.UCD.numeric
206 :
207 : self: self
208 : chr: int(accept={str})
209 : default: object=NULL
210 : /
211 :
212 : Converts a Unicode character into its equivalent numeric value.
213 :
214 : Returns the numeric value assigned to the character chr as float.
215 : If no such value is defined, default is returned, or, if not given,
216 : ValueError is raised.
217 : [clinic start generated code]*/
218 :
219 : static PyObject *
220 1114960 : unicodedata_UCD_numeric_impl(PyObject *self, int chr,
221 : PyObject *default_value)
222 : /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
223 : {
224 1114960 : int have_old = 0;
225 : double rc;
226 1114960 : Py_UCS4 c = (Py_UCS4)chr;
227 :
228 1114960 : if (UCD_Check(self)) {
229 0 : const change_record *old = get_old_record(self, c);
230 0 : if (old->category_changed == 0) {
231 : /* unassigned */
232 0 : have_old = 1;
233 0 : rc = -1.0;
234 : }
235 0 : else if (old->decimal_changed != 0xFF) {
236 0 : have_old = 1;
237 0 : rc = old->decimal_changed;
238 : }
239 : }
240 :
241 1114960 : if (!have_old)
242 1114960 : rc = Py_UNICODE_TONUMERIC(c);
243 1114960 : if (rc == -1.0) {
244 1112240 : if (default_value == NULL) {
245 1 : PyErr_SetString(PyExc_ValueError, "not a numeric character");
246 1 : return NULL;
247 : }
248 : else {
249 1112240 : Py_INCREF(default_value);
250 1112240 : return default_value;
251 : }
252 : }
253 2712 : return PyFloat_FromDouble(rc);
254 : }
255 :
256 : /*[clinic input]
257 : unicodedata.UCD.category
258 :
259 : self: self
260 : chr: int(accept={str})
261 : /
262 :
263 : Returns the general category assigned to the character chr as string.
264 : [clinic start generated code]*/
265 :
266 : static PyObject *
267 2229040 : unicodedata_UCD_category_impl(PyObject *self, int chr)
268 : /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
269 : {
270 : int index;
271 2229040 : Py_UCS4 c = (Py_UCS4)chr;
272 2229040 : index = (int) _getrecord_ex(c)->category;
273 2229040 : if (UCD_Check(self)) {
274 810 : const change_record *old = get_old_record(self, c);
275 810 : if (old->category_changed != 0xFF)
276 5 : index = old->category_changed;
277 : }
278 2229040 : return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
279 : }
280 :
281 : /*[clinic input]
282 : unicodedata.UCD.bidirectional
283 :
284 : self: self
285 : chr: int(accept={str})
286 : /
287 :
288 : Returns the bidirectional class assigned to the character chr as string.
289 :
290 : If no such value is defined, an empty string is returned.
291 : [clinic start generated code]*/
292 :
293 : static PyObject *
294 2228460 : unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
295 : /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
296 : {
297 : int index;
298 2228460 : Py_UCS4 c = (Py_UCS4)chr;
299 2228460 : index = (int) _getrecord_ex(c)->bidirectional;
300 2228460 : if (UCD_Check(self)) {
301 236 : const change_record *old = get_old_record(self, c);
302 236 : if (old->category_changed == 0)
303 0 : index = 0; /* unassigned */
304 236 : else if (old->bidir_changed != 0xFF)
305 0 : index = old->bidir_changed;
306 : }
307 2228460 : return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
308 : }
309 :
310 : /*[clinic input]
311 : unicodedata.UCD.combining -> int
312 :
313 : self: self
314 : chr: int(accept={str})
315 : /
316 :
317 : Returns the canonical combining class assigned to the character chr as integer.
318 :
319 : Returns 0 if no combining class is defined.
320 : [clinic start generated code]*/
321 :
322 : static int
323 1114120 : unicodedata_UCD_combining_impl(PyObject *self, int chr)
324 : /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
325 : {
326 : int index;
327 1114120 : Py_UCS4 c = (Py_UCS4)chr;
328 1114120 : index = (int) _getrecord_ex(c)->combining;
329 1114120 : if (UCD_Check(self)) {
330 0 : const change_record *old = get_old_record(self, c);
331 0 : if (old->category_changed == 0)
332 0 : index = 0; /* unassigned */
333 : }
334 1114120 : return index;
335 : }
336 :
337 : /*[clinic input]
338 : unicodedata.UCD.mirrored -> int
339 :
340 : self: self
341 : chr: int(accept={str})
342 : /
343 :
344 : Returns the mirrored property assigned to the character chr as integer.
345 :
346 : Returns 1 if the character has been identified as a "mirrored"
347 : character in bidirectional text, 0 otherwise.
348 : [clinic start generated code]*/
349 :
350 : static int
351 1114120 : unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
352 : /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
353 : {
354 : int index;
355 1114120 : Py_UCS4 c = (Py_UCS4)chr;
356 1114120 : index = (int) _getrecord_ex(c)->mirrored;
357 1114120 : if (UCD_Check(self)) {
358 1 : const change_record *old = get_old_record(self, c);
359 1 : if (old->category_changed == 0)
360 0 : index = 0; /* unassigned */
361 1 : else if (old->mirrored_changed != 0xFF)
362 1 : index = old->mirrored_changed;
363 : }
364 1114120 : return index;
365 : }
366 :
367 : /*[clinic input]
368 : unicodedata.UCD.east_asian_width
369 :
370 : self: self
371 : chr: int(accept={str})
372 : /
373 :
374 : Returns the east asian width assigned to the character chr as string.
375 : [clinic start generated code]*/
376 :
377 : static PyObject *
378 9 : unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
379 : /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
380 : {
381 : int index;
382 9 : Py_UCS4 c = (Py_UCS4)chr;
383 9 : index = (int) _getrecord_ex(c)->east_asian_width;
384 9 : if (UCD_Check(self)) {
385 1 : const change_record *old = get_old_record(self, c);
386 1 : if (old->category_changed == 0)
387 0 : index = 0; /* unassigned */
388 1 : else if (old->east_asian_width_changed != 0xFF)
389 1 : index = old->east_asian_width_changed;
390 : }
391 9 : return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
392 : }
393 :
394 : /*[clinic input]
395 : unicodedata.UCD.decomposition
396 :
397 : self: self
398 : chr: int(accept={str})
399 : /
400 :
401 : Returns the character decomposition mapping assigned to the character chr as string.
402 :
403 : An empty string is returned in case no such mapping is defined.
404 : [clinic start generated code]*/
405 :
406 : static PyObject *
407 2233890 : unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
408 : /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
409 : {
410 : char decomp[256];
411 : int code, index, count;
412 : size_t i;
413 : unsigned int prefix_index;
414 2233890 : Py_UCS4 c = (Py_UCS4)chr;
415 :
416 2233890 : code = (int)c;
417 :
418 2233890 : if (UCD_Check(self)) {
419 0 : const change_record *old = get_old_record(self, c);
420 0 : if (old->category_changed == 0)
421 0 : return PyUnicode_FromString(""); /* unassigned */
422 : }
423 :
424 2233890 : if (code < 0 || code >= 0x110000)
425 0 : index = 0;
426 : else {
427 2233890 : index = decomp_index1[(code>>DECOMP_SHIFT)];
428 2233890 : index = decomp_index2[(index<<DECOMP_SHIFT)+
429 2233890 : (code&((1<<DECOMP_SHIFT)-1))];
430 : }
431 :
432 : /* high byte is number of hex bytes (usually one or two), low byte
433 : is prefix code (from*/
434 2233890 : count = decomp_data[index] >> 8;
435 :
436 : /* XXX: could allocate the PyString up front instead
437 : (strlen(prefix) + 5 * count + 1 bytes) */
438 :
439 : /* Based on how index is calculated above and decomp_data is generated
440 : from Tools/unicode/makeunicodedata.py, it should not be possible
441 : to overflow decomp_prefix. */
442 2233890 : prefix_index = decomp_data[index] & 255;
443 2233890 : assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
444 :
445 : /* copy prefix */
446 2233890 : i = strlen(decomp_prefix[prefix_index]);
447 2233890 : memcpy(decomp, decomp_prefix[prefix_index], i);
448 :
449 2259700 : while (count-- > 0) {
450 25806 : if (i)
451 19623 : decomp[i++] = ' ';
452 25806 : assert(i < sizeof(decomp));
453 25806 : PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 : decomp_data[++index]);
455 25806 : i += strlen(decomp + i);
456 : }
457 2233890 : return PyUnicode_FromStringAndSize(decomp, i);
458 : }
459 :
460 : static void
461 375095 : get_decomp_record(PyObject *self, Py_UCS4 code,
462 : int *index, int *prefix, int *count)
463 : {
464 375095 : if (code >= 0x110000) {
465 0 : *index = 0;
466 : }
467 375095 : else if (UCD_Check(self)
468 891 : && get_old_record(self, code)->category_changed==0) {
469 : /* unassigned in old version */
470 0 : *index = 0;
471 : }
472 : else {
473 375095 : *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 375095 : *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 375095 : (code&((1<<DECOMP_SHIFT)-1))];
476 : }
477 :
478 : /* high byte is number of hex bytes (usually one or two), low byte
479 : is prefix code (from*/
480 375095 : *count = decomp_data[*index] >> 8;
481 375095 : *prefix = decomp_data[*index] & 255;
482 :
483 375095 : (*index)++;
484 375095 : }
485 :
/* Constants for the arithmetic Hangul syllable decomposition in nfd_nfkd()
   and composition in nfc_nfkc(). */
#define SBase   0xAC00  /* first code point handled by Hangul decomposition */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one below the first trailing consonant (0x11A8) */
#define LCount  19      /* number of leading consonants */
#define VCount  21      /* number of vowels */
#define TCount  28      /* trailing slots, including "no trailing consonant" */
#define NCount  (VCount*TCount)   /* syllables per leading consonant */
#define SCount  (LCount*NCount)   /* size of the range starting at SBase */
495 :
496 : static PyObject*
497 173824 : nfd_nfkd(PyObject *self, PyObject *input, int k)
498 : {
499 : PyObject *result;
500 : Py_UCS4 *output;
501 : Py_ssize_t i, o, osize;
502 : int kind;
503 : const void *data;
504 : /* Longest decomposition in Unicode 3.2: U+FDFA */
505 : Py_UCS4 stack[20];
506 : Py_ssize_t space, isize;
507 : int index, prefix, count, stackptr;
508 : unsigned char prev, cur;
509 :
510 173824 : stackptr = 0;
511 173824 : isize = PyUnicode_GET_LENGTH(input);
512 173824 : space = isize;
513 : /* Overallocate at most 10 characters. */
514 173824 : if (space > 10) {
515 319 : if (space <= PY_SSIZE_T_MAX - 10)
516 319 : space += 10;
517 : }
518 : else {
519 173505 : space *= 2;
520 : }
521 173824 : osize = space;
522 173824 : output = PyMem_NEW(Py_UCS4, space);
523 173824 : if (!output) {
524 0 : PyErr_NoMemory();
525 0 : return NULL;
526 : }
527 173824 : i = o = 0;
528 173824 : kind = PyUnicode_KIND(input);
529 173824 : data = PyUnicode_DATA(input);
530 :
531 553713 : while (i < isize) {
532 379889 : stack[stackptr++] = PyUnicode_READ(kind, data, i++);
533 823495 : while(stackptr) {
534 443606 : Py_UCS4 code = stack[--stackptr];
535 : /* Hangul Decomposition adds three characters in
536 : a single step, so we need at least that much room. */
537 443606 : if (space < 3) {
538 : Py_UCS4 *new_output;
539 101096 : osize += 10;
540 101096 : space += 10;
541 101096 : new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
542 101096 : if (new_output == NULL) {
543 0 : PyMem_Free(output);
544 0 : PyErr_NoMemory();
545 0 : return NULL;
546 : }
547 101096 : output = new_output;
548 : }
549 : /* Hangul Decomposition. */
550 443606 : if (SBase <= code && code < (SBase+SCount)) {
551 68511 : int SIndex = code - SBase;
552 68511 : int L = LBase + SIndex / NCount;
553 68511 : int V = VBase + (SIndex % NCount) / TCount;
554 68511 : int T = TBase + SIndex % TCount;
555 68511 : output[o++] = L;
556 68511 : output[o++] = V;
557 68511 : space -= 2;
558 68511 : if (T != TBase) {
559 64657 : output[o++] = T;
560 64657 : space --;
561 : }
562 68511 : continue;
563 : }
564 : /* normalization changes */
565 375095 : if (UCD_Check(self)) {
566 891 : Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
567 891 : if (value != 0) {
568 0 : stack[stackptr++] = value;
569 0 : continue;
570 : }
571 : }
572 :
573 : /* Other decompositions. */
574 375095 : get_decomp_record(self, code, &index, &prefix, &count);
575 :
576 : /* Copy character if it is not decomposable, or has a
577 : compatibility decomposition, but we do NFD. */
578 375095 : if (!count || (prefix && !k)) {
579 334643 : output[o++] = code;
580 334643 : space--;
581 334643 : continue;
582 : }
583 : /* Copy decomposition onto the stack, in reverse
584 : order. */
585 104169 : while(count) {
586 63717 : code = decomp_data[index + (--count)];
587 63717 : stack[stackptr++] = code;
588 : }
589 : }
590 : }
591 :
592 173824 : result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
593 : output, o);
594 173824 : PyMem_Free(output);
595 173824 : if (!result)
596 0 : return NULL;
597 : /* result is guaranteed to be ready, as it is compact. */
598 173824 : kind = PyUnicode_KIND(result);
599 173824 : data = PyUnicode_DATA(result);
600 :
601 : /* Sort canonically. */
602 173824 : i = 0;
603 173824 : prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
604 536322 : for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
605 362498 : cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606 362498 : if (prev == 0 || cur == 0 || prev <= cur) {
607 341913 : prev = cur;
608 341913 : continue;
609 : }
610 : /* Non-canonical order. Need to switch *i with previous. */
611 20585 : o = i - 1;
612 10580 : while (1) {
613 31165 : Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
614 31165 : PyUnicode_WRITE(kind, data, o+1,
615 : PyUnicode_READ(kind, data, o));
616 31165 : PyUnicode_WRITE(kind, data, o, tmp);
617 31165 : o--;
618 31165 : if (o < 0)
619 12 : break;
620 31153 : prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
621 31153 : if (prev == 0 || prev <= cur)
622 : break;
623 : }
624 20585 : prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
625 : }
626 173824 : return result;
627 : }
628 :
629 : static int
630 177057 : find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
631 : {
632 : unsigned int index;
633 8360920 : for (index = 0; nfc[index].start; index++) {
634 8355210 : unsigned int start = nfc[index].start;
635 8355210 : if (code < start)
636 114064 : return -1;
637 8241150 : if (code <= start + nfc[index].count) {
638 57291 : unsigned int delta = code - start;
639 57291 : return nfc[index].index + delta;
640 : }
641 : }
642 5702 : return -1;
643 : }
644 :
/* Build the composed form of `input`: the string is first decomposed with
   nfd_nfkd(self, input, k) and the decomposed characters are then
   recombined, using arithmetic composition for Hangul and the
   nfc_first / nfc_last / comp_index / comp_data tables for everything else.

   Returns a new string object (the decomposed string itself when nothing
   could be composed), or NULL with an exception set on failure. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions in the decomposed string that have already been merged into
       an earlier base character and must not be emitted again. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        /* 0 is the null pointer here. */
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            /* The comparison with TBase is strict: TBase itself is not a
               trailing consonant. */
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(nfc_first, code);
        if (f == -1) {
            /* Cannot start a composition: copy it through unchanged. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        /* Combining class of the last character that was passed over
           without being composed; 0 while nothing has been passed over. */
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Look up the composite for the (first, last) pair; 0 means the
               pair does not compose. */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* The composite may itself be the first half of another pair. */
            f = find_nfc_index(nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return the decomposed string as it is. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
783 :
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
// The numeric values matter: is_normalized_quickcheck() compares them
// directly with the 2-bit fields stored in normalization_quick_check.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
787 :
788 : /* Run the Unicode normalization "quickcheck" algorithm.
789 : *
790 : * Return YES or NO if quickcheck determines the input is certainly
791 : * normalized or certainly not, and MAYBE if quickcheck is unable to
792 : * tell.
793 : *
794 : * If `yes_only` is true, then return MAYBE as soon as we determine
795 : * the answer is not YES.
796 : *
797 : * For background and details on the algorithm, see UAX #15:
798 : * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
799 : */
800 : static QuickcheckResult
801 4884420 : is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
802 : bool yes_only)
803 : {
804 : /* UCD 3.2.0 is requested, quickchecks must be disabled. */
805 4884420 : if (UCD_Check(self)) {
806 490 : return NO;
807 : }
808 :
809 4883930 : if (PyUnicode_IS_ASCII(input)) {
810 17450 : return YES;
811 : }
812 :
813 : Py_ssize_t i, len;
814 : int kind;
815 : const void *data;
816 4866480 : unsigned char prev_combining = 0;
817 :
818 : /* The two quickcheck bits at this shift have type QuickcheckResult. */
819 4866480 : int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
820 :
821 4866480 : QuickcheckResult result = YES; /* certainly normalized, unless we find something */
822 :
823 4866480 : i = 0;
824 4866480 : kind = PyUnicode_KIND(input);
825 4866480 : data = PyUnicode_DATA(input);
826 4866480 : len = PyUnicode_GET_LENGTH(input);
827 9995350 : while (i < len) {
828 5299100 : Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
829 5299100 : const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
830 :
831 5299100 : unsigned char combining = record->combining;
832 5299100 : if (combining && prev_combining > combining)
833 7052 : return NO; /* non-canonical sort order, not normalized */
834 5292050 : prev_combining = combining;
835 :
836 5292050 : unsigned char quickcheck_whole = record->normalization_quick_check;
837 5292050 : if (yes_only) {
838 5052660 : if (quickcheck_whole & (3 << quickcheck_shift))
839 163182 : return MAYBE;
840 : } else {
841 239385 : switch ((quickcheck_whole >> quickcheck_shift) & 3) {
842 0 : case NO:
843 0 : return NO;
844 3384 : case MAYBE:
845 3384 : result = MAYBE; /* this string might need normalization */
846 : }
847 : }
848 : }
849 4696250 : return result;
850 : }
851 :
852 : /*[clinic input]
853 : unicodedata.UCD.is_normalized
854 :
855 : self: self
856 : form: unicode
857 : unistr as input: unicode
858 : /
859 :
860 : Return whether the Unicode string unistr is in the normal form 'form'.
861 :
862 : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
863 : [clinic start generated code]*/
864 :
865 : static PyObject *
866 113952 : unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
867 : PyObject *input)
868 : /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
869 : {
870 113952 : if (PyUnicode_READY(input) == -1) {
871 0 : return NULL;
872 : }
873 :
874 113952 : if (PyUnicode_GET_LENGTH(input) == 0) {
875 : /* special case empty input strings. */
876 0 : Py_RETURN_TRUE;
877 : }
878 :
879 : PyObject *result;
880 113952 : bool nfc = false;
881 113952 : bool k = false;
882 : QuickcheckResult m;
883 :
884 : PyObject *cmp;
885 113952 : int match = 0;
886 :
887 113952 : if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
888 37984 : nfc = true;
889 : }
890 75968 : else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
891 18992 : nfc = true;
892 18992 : k = true;
893 : }
894 56976 : else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
895 : /* matches default values for `nfc` and `k` */
896 : }
897 18992 : else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
898 18992 : k = true;
899 : }
900 : else {
901 0 : PyErr_SetString(PyExc_ValueError, "invalid normalization form");
902 0 : return NULL;
903 : }
904 :
905 113952 : m = is_normalized_quickcheck(self, input, nfc, k, false);
906 :
907 113952 : if (m == MAYBE) {
908 3100 : cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
909 3100 : if (cmp == NULL) {
910 0 : return NULL;
911 : }
912 3100 : match = PyUnicode_Compare(input, cmp);
913 3100 : Py_DECREF(cmp);
914 3100 : result = (match == 0) ? Py_True : Py_False;
915 : }
916 : else {
917 110852 : result = (m == YES) ? Py_True : Py_False;
918 : }
919 :
920 113952 : Py_INCREF(result);
921 113952 : return result;
922 : }
923 :
924 :
925 : /*[clinic input]
926 : unicodedata.UCD.normalize
927 :
928 : self: self
929 : form: unicode
930 : unistr as input: unicode
931 : /
932 :
933 : Return the normal form 'form' for the Unicode string unistr.
934 :
935 : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
936 : [clinic start generated code]*/
937 :
938 : static PyObject *
939 4770480 : unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
940 : PyObject *input)
941 : /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
942 : {
943 4770480 : if (PyUnicode_GET_LENGTH(input) == 0) {
944 : /* Special case empty input strings, since resizing
945 : them later would cause internal errors. */
946 3 : Py_INCREF(input);
947 3 : return input;
948 : }
949 :
950 4770470 : if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
951 1192320 : if (is_normalized_quickcheck(self, input,
952 : true, false, true) == YES) {
953 1159500 : Py_INCREF(input);
954 1159500 : return input;
955 : }
956 32819 : return nfc_nfkc(self, input, 0);
957 : }
958 3578150 : if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
959 1193510 : if (is_normalized_quickcheck(self, input,
960 : true, true, true) == YES) {
961 1148820 : Py_INCREF(input);
962 1148820 : return input;
963 : }
964 44693 : return nfc_nfkc(self, input, 1);
965 : }
966 2384640 : if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
967 1192330 : if (is_normalized_quickcheck(self, input,
968 : false, false, true) == YES) {
969 1151340 : Py_INCREF(input);
970 1151340 : return input;
971 : }
972 40986 : return nfd_nfkd(self, input, 0);
973 : }
974 1192320 : if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
975 1192310 : if (is_normalized_quickcheck(self, input,
976 : false, true, true) == YES) {
977 1140090 : Py_INCREF(input);
978 1140090 : return input;
979 : }
980 52226 : return nfd_nfkd(self, input, 1);
981 : }
982 1 : PyErr_SetString(PyExc_ValueError, "invalid normalization form");
983 1 : return NULL;
984 : }
985 :
986 : /* -------------------------------------------------------------------- */
987 : /* unicode character name tables */
988 :
989 : /* data file generated by Tools/unicode/makeunicodedata.py */
990 : #include "unicodename_db.h"
991 :
992 : /* -------------------------------------------------------------------- */
993 : /* database code (cut and pasted from the unidb package) */
994 :
static unsigned long
_gethash(const char *s, int len, int scale)
{
    /* Case-insensitive multiplicative hash of the first `len` bytes of `s`.
     * Every byte is upper-cased with Py_TOUPPER before being mixed in with
     * the multiplier `scale`.  Whenever bits 24..31 become set they are
     * folded back into the low byte and the value is cut to 24 bits. */
    unsigned long hash = 0;

    for (int pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1009 :
/* Name fragments of Hangul syllables.  Column 0 is indexed by L, column 1
 * by V and column 2 by T (see _getucname() and _getcode()).  Columns that
 * have fewer entries than the table has rows are padded with NULL; those
 * padding entries must never be dereferenced. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1040 :
1041 : /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1042 : static int
1043 127288 : is_unified_ideograph(Py_UCS4 code)
1044 : {
1045 : return
1046 127288 : (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1047 114102 : (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1048 72107 : (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1049 72105 : (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
1050 72103 : (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1051 72101 : (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1052 258757 : (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1053 4181 : (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
1054 : }
1055 :
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized so that an expression such as `x & mask`
 * is compared as a whole.  It is still evaluated twice, so do not pass an
 * expression with side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1061 :
1062 : static int
1063 112952 : _getucname(PyObject *self,
1064 : Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1065 : {
1066 : /* Find the name associated with the given code point.
1067 : * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1068 : * that we are using for aliases and named sequences. */
1069 : int offset;
1070 : int i;
1071 : int word;
1072 : const unsigned char* w;
1073 :
1074 112952 : if (code >= 0x110000)
1075 0 : return 0;
1076 :
1077 : /* XXX should we just skip all the code points in the PUAs here? */
1078 112952 : if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1079 931 : return 0;
1080 :
1081 112021 : if (UCD_Check(self)) {
1082 : /* in 3.2.0 there are no aliases and named sequences */
1083 : const change_record *old;
1084 1580 : if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1085 524 : return 0;
1086 1056 : old = get_old_record(self, code);
1087 1056 : if (old->category_changed == 0) {
1088 : /* unassigned */
1089 632 : return 0;
1090 : }
1091 : }
1092 :
1093 110865 : if (SBase <= code && code < SBase+SCount) {
1094 : /* Hangul syllable. */
1095 11172 : int SIndex = code - SBase;
1096 11172 : int L = SIndex / NCount;
1097 11172 : int V = (SIndex % NCount) / TCount;
1098 11172 : int T = SIndex % TCount;
1099 :
1100 11172 : if (buflen < 27)
1101 : /* Worst case: HANGUL SYLLABLE <10chars>. */
1102 0 : return 0;
1103 11172 : strcpy(buffer, "HANGUL SYLLABLE ");
1104 11172 : buffer += 16;
1105 11172 : strcpy(buffer, hangul_syllables[L][0]);
1106 11172 : buffer += strlen(hangul_syllables[L][0]);
1107 11172 : strcpy(buffer, hangul_syllables[V][1]);
1108 11172 : buffer += strlen(hangul_syllables[V][1]);
1109 11172 : strcpy(buffer, hangul_syllables[T][2]);
1110 11172 : buffer += strlen(hangul_syllables[T][2]);
1111 11172 : *buffer = '\0';
1112 11172 : return 1;
1113 : }
1114 :
1115 99693 : if (is_unified_ideograph(code)) {
1116 27593 : if (buflen < 28)
1117 : /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1118 0 : return 0;
1119 27593 : sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1120 27593 : return 1;
1121 : }
1122 :
1123 : /* get offset into phrasebook */
1124 72100 : offset = phrasebook_offset1[(code>>phrasebook_shift)];
1125 72100 : offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1126 72100 : (code&((1<<phrasebook_shift)-1))];
1127 72100 : if (!offset)
1128 13168 : return 0;
1129 :
1130 58932 : i = 0;
1131 :
1132 : for (;;) {
1133 : /* get word index */
1134 229772 : word = phrasebook[offset] - phrasebook_short;
1135 229772 : if (word >= 0) {
1136 80778 : word = (word << 8) + phrasebook[offset+1];
1137 80778 : offset += 2;
1138 : } else
1139 148994 : word = phrasebook[offset++];
1140 229772 : if (i) {
1141 170840 : if (i > buflen)
1142 0 : return 0; /* buffer overflow */
1143 170840 : buffer[i++] = ' ';
1144 : }
1145 : /* copy word string from lexicon. the last character in the
1146 : word has bit 7 set. the last word in a string ends with
1147 : 0x80 */
1148 229772 : w = lexicon + lexicon_offset[word];
1149 1297320 : while (*w < 128) {
1150 1067550 : if (i >= buflen)
1151 0 : return 0; /* buffer overflow */
1152 1067550 : buffer[i++] = *w++;
1153 : }
1154 229772 : if (i >= buflen)
1155 0 : return 0; /* buffer overflow */
1156 229772 : buffer[i++] = *w & 127;
1157 229772 : if (*w == 128)
1158 58932 : break; /* end of word */
1159 : }
1160 :
1161 58932 : return 1;
1162 : }
1163 :
1164 : static int
1165 20060 : capi_getucname(Py_UCS4 code,
1166 : char* buffer, int buflen,
1167 : int with_alias_and_seq)
1168 : {
1169 20060 : return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1170 :
1171 : }
1172 :
1173 : static int
1174 23215 : _cmpname(PyObject *self, int code, const char* name, int namelen)
1175 : {
1176 : /* check if code corresponds to the given name */
1177 : int i;
1178 : char buffer[NAME_MAXLEN+1];
1179 23215 : if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1180 1156 : return 0;
1181 483111 : for (i = 0; i < namelen; i++) {
1182 465185 : if (Py_TOUPPER(name[i]) != buffer[i])
1183 4133 : return 0;
1184 : }
1185 17926 : return buffer[namelen] == '\0';
1186 : }
1187 :
1188 : static void
1189 33555 : find_syllable(const char *str, int *len, int *pos, int count, int column)
1190 : {
1191 : int i, len1;
1192 33555 : *len = -1;
1193 794135 : for (i = 0; i < count; i++) {
1194 760580 : const char *s = hangul_syllables[i][column];
1195 760580 : len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1196 760580 : if (len1 <= *len)
1197 282823 : continue;
1198 477757 : if (strncmp(str, s, len1) == 0) {
1199 59655 : *len = len1;
1200 59655 : *pos = i;
1201 : }
1202 : }
1203 33555 : if (*len == -1) {
1204 0 : *len = 0;
1205 : }
1206 33555 : }
1207 :
1208 : static int
1209 17926 : _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1210 : {
1211 : /* check if named sequences are allowed */
1212 17926 : if (!with_named_seq && IS_NAMED_SEQ(cp))
1213 466 : return 0;
1214 : /* if the code point is in the PUA range that we use for aliases,
1215 : * convert it to obtain the right code point */
1216 17460 : if (IS_ALIAS(cp))
1217 22 : *code = name_aliases[cp-aliases_start];
1218 : else
1219 17438 : *code = cp;
1220 17460 : return 1;
1221 : }
1222 :
1223 : static int
1224 57190 : _getcode(PyObject* self,
1225 : const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1226 : {
1227 : /* Return the code point associated with the given name.
1228 : * Named aliases are resolved too (unless self != NULL (i.e. we are using
1229 : * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1230 : * using for the named sequence, and the caller must then convert it. */
1231 : unsigned int h, v;
1232 57190 : unsigned int mask = code_size-1;
1233 : unsigned int i, incr;
1234 :
1235 : /* Check for hangul syllables. */
1236 57190 : if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1237 11185 : int len, L = -1, V = -1, T = -1;
1238 11185 : const char *pos = name + 16;
1239 11185 : find_syllable(pos, &len, &L, LCount, 0);
1240 11185 : pos += len;
1241 11185 : find_syllable(pos, &len, &V, VCount, 1);
1242 11185 : pos += len;
1243 11185 : find_syllable(pos, &len, &T, TCount, 2);
1244 11185 : pos += len;
1245 11185 : if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1246 11185 : *code = SBase + (L*VCount+V)*TCount + T;
1247 11185 : return 1;
1248 : }
1249 : /* Otherwise, it's an illegal syllable name. */
1250 0 : return 0;
1251 : }
1252 :
1253 : /* Check for unified ideographs. */
1254 46005 : if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1255 : /* Four or five hexdigits must follow. */
1256 27595 : v = 0;
1257 27595 : name += 22;
1258 27595 : namelen -= 22;
1259 27595 : if (namelen != 4 && namelen != 5)
1260 0 : return 0;
1261 137982 : while (namelen--) {
1262 110387 : v *= 16;
1263 110387 : if (*name >= '0' && *name <= '9')
1264 79037 : v += *name - '0';
1265 31350 : else if (*name >= 'A' && *name <= 'F')
1266 31350 : v += *name - 'A' + 10;
1267 : else
1268 0 : return 0;
1269 110387 : name++;
1270 : }
1271 27595 : if (!is_unified_ideograph(v))
1272 0 : return 0;
1273 27595 : *code = v;
1274 27595 : return 1;
1275 : }
1276 :
1277 : /* the following is the same as python's dictionary lookup, with
1278 : only minor changes. see the makeunicodedata script for more
1279 : details */
1280 :
1281 18410 : h = (unsigned int) _gethash(name, namelen, code_magic);
1282 18410 : i = (~h) & mask;
1283 18410 : v = code_hash[i];
1284 18410 : if (!v)
1285 2 : return 0;
1286 18408 : if (_cmpname(self, v, name, namelen)) {
1287 15274 : return _check_alias_and_seq(v, code, with_named_seq);
1288 : }
1289 3134 : incr = (h ^ (h >> 3)) & mask;
1290 3134 : if (!incr)
1291 0 : incr = mask;
1292 : for (;;) {
1293 5289 : i = (i + incr) & mask;
1294 5289 : v = code_hash[i];
1295 5289 : if (!v)
1296 482 : return 0;
1297 4807 : if (_cmpname(self, v, name, namelen)) {
1298 2652 : return _check_alias_and_seq(v, code, with_named_seq);
1299 : }
1300 2155 : incr = incr << 1;
1301 2155 : if (incr > mask)
1302 1139 : incr = incr ^ code_poly;
1303 : }
1304 : }
1305 :
1306 : static int
1307 620 : capi_getcode(const char* name, int namelen, Py_UCS4* code,
1308 : int with_named_seq)
1309 : {
1310 620 : return _getcode(NULL, name, namelen, code, with_named_seq);
1311 :
1312 : }
1313 :
1314 : static void
1315 81 : unicodedata_destroy_capi(PyObject *capsule)
1316 : {
1317 81 : void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1318 81 : PyMem_Free(capi);
1319 81 : }
1320 :
1321 : static PyObject *
1322 81 : unicodedata_create_capi(void)
1323 : {
1324 81 : _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1325 81 : if (capi == NULL) {
1326 0 : PyErr_NoMemory();
1327 0 : return NULL;
1328 : }
1329 81 : capi->getname = capi_getucname;
1330 81 : capi->getcode = capi_getcode;
1331 :
1332 81 : PyObject *capsule = PyCapsule_New(capi,
1333 : PyUnicodeData_CAPSULE_NAME,
1334 : unicodedata_destroy_capi);
1335 81 : if (capsule == NULL) {
1336 0 : PyMem_Free(capi);
1337 : }
1338 81 : return capsule;
1339 : };
1340 :
1341 :
1342 : /* -------------------------------------------------------------------- */
1343 : /* Python bindings */
1344 :
1345 : /*[clinic input]
1346 : unicodedata.UCD.name
1347 :
1348 : self: self
1349 : chr: int(accept={str})
1350 : default: object=NULL
1351 : /
1352 :
1353 : Returns the name assigned to the character chr as a string.
1354 :
1355 : If no name is defined, default is returned, or, if not given,
1356 : ValueError is raised.
1357 : [clinic start generated code]*/
1358 :
1359 : static PyObject *
1360 69677 : unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1361 : /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1362 : {
1363 : char name[NAME_MAXLEN+1];
1364 69677 : Py_UCS4 c = (Py_UCS4)chr;
1365 :
1366 69677 : if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1367 14065 : if (default_value == NULL) {
1368 4096 : PyErr_SetString(PyExc_ValueError, "no such name");
1369 4096 : return NULL;
1370 : }
1371 : else {
1372 9969 : Py_INCREF(default_value);
1373 9969 : return default_value;
1374 : }
1375 : }
1376 :
1377 55612 : return PyUnicode_FromString(name);
1378 : }
1379 :
1380 : /*[clinic input]
1381 : unicodedata.UCD.lookup
1382 :
1383 : self: self
1384 : name: str(accept={str, robuffer}, zeroes=True)
1385 : /
1386 :
1387 : Look up character by name.
1388 :
1389 : If a character with the given name is found, return the
1390 : corresponding character. If not found, KeyError is raised.
1391 : [clinic start generated code]*/
1392 :
1393 : static PyObject *
1394 56570 : unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1395 : Py_ssize_t name_length)
1396 : /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1397 : {
1398 : Py_UCS4 code;
1399 : unsigned int index;
1400 56570 : if (name_length > NAME_MAXLEN) {
1401 0 : PyErr_SetString(PyExc_KeyError, "name too long");
1402 0 : return NULL;
1403 : }
1404 :
1405 56570 : if (!_getcode(self, name, (int)name_length, &code, 1)) {
1406 481 : PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1407 481 : return NULL;
1408 : }
1409 : /* check if code is in the PUA range that we use for named sequences
1410 : and convert it */
1411 56089 : if (IS_NAMED_SEQ(code)) {
1412 468 : index = code-named_sequences_start;
1413 468 : return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1414 468 : named_sequences[index].seq,
1415 468 : named_sequences[index].seqlen);
1416 : }
1417 55621 : return PyUnicode_FromOrdinal(code);
1418 : }
1419 :
1420 : // List of functions used to define module functions *AND* unicodedata.UCD
1421 : // methods. For module functions, self is the module. For UCD methods, self
1422 : // is an UCD instance. The UCD_Check() macro is used to check if self is
1423 : // an UCD instance.
1424 : static PyMethodDef unicodedata_functions[] = {
1425 : UNICODEDATA_UCD_DECIMAL_METHODDEF
1426 : UNICODEDATA_UCD_DIGIT_METHODDEF
1427 : UNICODEDATA_UCD_NUMERIC_METHODDEF
1428 : UNICODEDATA_UCD_CATEGORY_METHODDEF
1429 : UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1430 : UNICODEDATA_UCD_COMBINING_METHODDEF
1431 : UNICODEDATA_UCD_MIRRORED_METHODDEF
1432 : UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1433 : UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1434 : UNICODEDATA_UCD_NAME_METHODDEF
1435 : UNICODEDATA_UCD_LOOKUP_METHODDEF
1436 : UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1437 : UNICODEDATA_UCD_NORMALIZE_METHODDEF
1438 : {NULL, NULL} /* sentinel */
1439 : };
1440 :
1441 : static int
1442 9947 : ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1443 : {
1444 9947 : Py_VISIT(Py_TYPE(self));
1445 9947 : return 0;
1446 : }
1447 :
1448 : static void
1449 81 : ucd_dealloc(PreviousDBVersion *self)
1450 : {
1451 81 : PyTypeObject *tp = Py_TYPE(self);
1452 81 : PyObject_GC_UnTrack(self);
1453 81 : PyObject_GC_Del(self);
1454 81 : Py_DECREF(tp);
1455 81 : }
1456 :
1457 : static PyType_Slot ucd_type_slots[] = {
1458 : {Py_tp_dealloc, ucd_dealloc},
1459 : {Py_tp_traverse, ucd_traverse},
1460 : {Py_tp_getattro, PyObject_GenericGetAttr},
1461 : {Py_tp_methods, unicodedata_functions},
1462 : {Py_tp_members, DB_members},
1463 : {0, 0}
1464 : };
1465 :
1466 : static PyType_Spec ucd_type_spec = {
1467 : .name = "unicodedata.UCD",
1468 : .basicsize = sizeof(PreviousDBVersion),
1469 : .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1470 : Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1471 : .slots = ucd_type_slots
1472 : };
1473 :
1474 : PyDoc_STRVAR(unicodedata_docstring,
1475 : "This module provides access to the Unicode Character Database which\n\
1476 : defines character properties for all Unicode characters. The data in\n\
1477 : this database is based on the UnicodeData.txt file version\n\
1478 : " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1479 : \n\
1480 : The module uses the same names and symbols as defined by the\n\
1481 : UnicodeData File Format " UNIDATA_VERSION ".");
1482 :
1483 : static int
1484 81 : unicodedata_exec(PyObject *module)
1485 : {
1486 81 : if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1487 0 : return -1;
1488 : }
1489 :
1490 81 : PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1491 81 : if (ucd_type == NULL) {
1492 0 : return -1;
1493 : }
1494 :
1495 81 : if (PyModule_AddType(module, ucd_type) < 0) {
1496 0 : Py_DECREF(ucd_type);
1497 0 : return -1;
1498 : }
1499 :
1500 : // Unicode database version 3.2.0 used by the IDNA encoding
1501 : PyObject *v;
1502 81 : v = new_previous_version(ucd_type, "3.2.0",
1503 : get_change_3_2_0, normalization_3_2_0);
1504 81 : Py_DECREF(ucd_type);
1505 81 : if (v == NULL) {
1506 0 : return -1;
1507 : }
1508 81 : if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1509 0 : Py_DECREF(v);
1510 0 : return -1;
1511 : }
1512 :
1513 : /* Export C API */
1514 81 : PyObject *capsule = unicodedata_create_capi();
1515 81 : if (capsule == NULL) {
1516 0 : return -1;
1517 : }
1518 81 : int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1519 81 : Py_DECREF(capsule);
1520 81 : if (rc < 0) {
1521 0 : return -1;
1522 : }
1523 81 : return 0;
1524 : }
1525 :
1526 : static PyModuleDef_Slot unicodedata_slots[] = {
1527 : {Py_mod_exec, unicodedata_exec},
1528 : {0, NULL}
1529 : };
1530 :
1531 : static struct PyModuleDef unicodedata_module = {
1532 : PyModuleDef_HEAD_INIT,
1533 : .m_name = "unicodedata",
1534 : .m_doc = unicodedata_docstring,
1535 : .m_size = 0,
1536 : .m_methods = unicodedata_functions,
1537 : .m_slots = unicodedata_slots,
1538 : };
1539 :
1540 : PyMODINIT_FUNC
1541 81 : PyInit_unicodedata(void)
1542 : {
1543 81 : return PyModuleDef_Init(&unicodedata_module);
1544 : }
1545 :
1546 :
1547 : /*
1548 : Local variables:
1549 : c-basic-offset: 4
1550 : indent-tabs-mode: nil
1551 : End:
1552 : */
|