LCOV - code coverage report
Current view: top level - Objects - unicodectype.c (source / functions) Hit Total Coverage
Test: CPython lcov report Lines: 98 104 94.2 %
Date: 2022-07-07 18:19:46 Functions: 21 22 95.5 %

          Line data    Source code
       1             : /*
       2             :    Unicode character type helpers.
       3             : 
       4             :    Written by Marc-Andre Lemburg (mal@lemburg.com).
       5             :    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       6             : 
       7             :    Copyright (c) Corporation for National Research Initiatives.
       8             : 
       9             : */
      10             : 
      11             : #include "Python.h"
      12             : 
      13             : #define ALPHA_MASK 0x01
      14             : #define DECIMAL_MASK 0x02
      15             : #define DIGIT_MASK 0x04
      16             : #define LOWER_MASK 0x08
      17             : #define TITLE_MASK 0x40
      18             : #define UPPER_MASK 0x80
      19             : #define XID_START_MASK 0x100
      20             : #define XID_CONTINUE_MASK 0x200
      21             : #define PRINTABLE_MASK 0x400
      22             : #define NUMERIC_MASK 0x800
      23             : #define CASE_IGNORABLE_MASK 0x1000
      24             : #define CASED_MASK 0x2000
      25             : #define EXTENDED_CASE_MASK 0x4000
      26             : 
      27             : typedef struct {
      28             :     /*
      29             :        These are either deltas to the character or offsets in
      30             :        _PyUnicode_ExtendedCase.
      31             :     */
      32             :     const int upper;
      33             :     const int lower;
      34             :     const int title;
      35             :     /* Note if more flag space is needed, decimal and digit could be unified. */
      36             :     const unsigned char decimal;
      37             :     const unsigned char digit;
      38             :     const unsigned short flags;
      39             : } _PyUnicode_TypeRecord;
      40             : 
      41             : #include "unicodetype_db.h"
      42             : 
      43             : static const _PyUnicode_TypeRecord *
      44   537046000 : gettyperecord(Py_UCS4 code)
      45             : {
      46             :     int index;
      47             : 
      48   537046000 :     if (code >= 0x110000)
      49           0 :         index = 0;
      50             :     else
      51             :     {
      52   537046000 :         index = index1[(code>>SHIFT)];
      53   537046000 :         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
      54             :     }
      55             : 
      56   537046000 :     return &_PyUnicode_TypeRecords[index];
      57             : }
      58             : 
      59             : /* Returns the titlecase Unicode character corresponding to ch or just
      60             :    ch if no titlecase mapping is known. */
      61             : 
      62           0 : Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
      63             : {
      64           0 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      65             : 
      66           0 :     if (ctype->flags & EXTENDED_CASE_MASK)
      67           0 :         return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
      68           0 :     return ch + ctype->title;
      69             : }
      70             : 
      71             : /* Returns 1 for Unicode characters having the category 'Lt', 0
      72             :    otherwise. */
      73             : 
      74    27093900 : int _PyUnicode_IsTitlecase(Py_UCS4 ch)
      75             : {
      76    27093900 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      77             : 
      78    27093900 :     return (ctype->flags & TITLE_MASK) != 0;
      79             : }
      80             : 
      81             : /* Returns 1 for Unicode characters having the XID_Start property, 0
      82             :    otherwise. */
      83             : 
      84     6121190 : int _PyUnicode_IsXidStart(Py_UCS4 ch)
      85             : {
      86     6121190 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      87             : 
      88     6121190 :     return (ctype->flags & XID_START_MASK) != 0;
      89             : }
      90             : 
      91             : /* Returns 1 for Unicode characters having the XID_Continue property,
      92             :    0 otherwise. */
      93             : 
      94     2231550 : int _PyUnicode_IsXidContinue(Py_UCS4 ch)
      95             : {
      96     2231550 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      97             : 
      98     2231550 :     return (ctype->flags & XID_CONTINUE_MASK) != 0;
      99             : }
     100             : 
     101             : /* Returns the integer decimal (0-9) for Unicode characters having
     102             :    this property, -1 otherwise. */
     103             : 
     104   111923000 : int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
     105             : {
     106   111923000 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     107             : 
     108   111923000 :     return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
     109             : }
     110             : 
     111    96189800 : int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
     112             : {
     113    96189800 :     if (_PyUnicode_ToDecimalDigit(ch) < 0)
     114    67894800 :         return 0;
     115    28295000 :     return 1;
     116             : }
     117             : 
     118             : /* Returns the integer digit (0-9) for Unicode characters having
     119             :    this property, -1 otherwise. */
     120             : 
     121    67015600 : int _PyUnicode_ToDigit(Py_UCS4 ch)
     122             : {
     123    67015600 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     124             : 
     125    67015600 :     return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
     126             : }
     127             : 
     128    65835900 : int _PyUnicode_IsDigit(Py_UCS4 ch)
     129             : {
     130    65835900 :     if (_PyUnicode_ToDigit(ch) < 0)
     131    65808800 :         return 0;
     132       27119 :     return 1;
     133             : }
     134             : 
     135             : /* Returns 1 for Unicode characters having the numeric property,
     136             :    0 otherwise. */
     137             : 
     138    65803200 : int _PyUnicode_IsNumeric(Py_UCS4 ch)
     139             : {
     140    65803200 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     141             : 
     142    65803200 :     return (ctype->flags & NUMERIC_MASK) != 0;
     143             : }
     144             : 
     145             : /* Returns 1 for printable Unicode characters (those not hex-escaped
     146             :    when repr()ed), 0 otherwise.
     147             :    All characters except those characters defined in the Unicode character
     148             :    database as following categories are considered printable.
     149             :       * Cc (Other, Control)
     150             :       * Cf (Other, Format)
     151             :       * Cs (Other, Surrogate)
     152             :       * Co (Other, Private Use)
     153             :       * Cn (Other, Not Assigned)
     154             :       * Zl Separator, Line ('\u2028', LINE SEPARATOR)
     155             :       * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
     156             :       * Zs (Separator, Space) other than ASCII space('\x20').
     157             : */
     158    16287100 : int _PyUnicode_IsPrintable(Py_UCS4 ch)
     159             : {
     160    16287100 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     161             : 
     162    16287100 :     return (ctype->flags & PRINTABLE_MASK) != 0;
     163             : }
     164             : 
     165             : /* Returns 1 for Unicode characters having the category 'Ll', 0
     166             :    otherwise. */
     167             : 
     168    25065200 : int _PyUnicode_IsLowercase(Py_UCS4 ch)
     169             : {
     170    25065200 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     171             : 
     172    25065200 :     return (ctype->flags & LOWER_MASK) != 0;
     173             : }
     174             : 
     175             : /* Returns 1 for Unicode characters having the category 'Lu', 0
     176             :    otherwise. */
     177             : 
     178    12442000 : int _PyUnicode_IsUppercase(Py_UCS4 ch)
     179             : {
     180    12442000 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     181             : 
     182    12442000 :     return (ctype->flags & UPPER_MASK) != 0;
     183             : }
     184             : 
     185             : /* Returns the uppercase Unicode character corresponding to ch or just
     186             :    ch if no uppercase mapping is known. */
     187             : 
     188       58626 : Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
     189             : {
     190       58626 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     191             : 
     192       58626 :     if (ctype->flags & EXTENDED_CASE_MASK)
     193          43 :         return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
     194       58583 :     return ch + ctype->upper;
     195             : }
     196             : 
     197             : /* Returns the lowercase Unicode character corresponding to ch or just
     198             :    ch if no lowercase mapping is known. */
     199             : 
     200      309442 : Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
     201             : {
     202      309442 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     203             : 
     204      309442 :     if (ctype->flags & EXTENDED_CASE_MASK)
     205         104 :         return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
     206      309338 :     return ch + ctype->lower;
     207             : }
     208             : 
     209    11368900 : int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
     210             : {
     211    11368900 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     212             : 
     213    11368900 :     if (ctype->flags & EXTENDED_CASE_MASK) {
     214         967 :         int index = ctype->lower & 0xFFFF;
     215         967 :         int n = ctype->lower >> 24;
     216             :         int i;
     217        1943 :         for (i = 0; i < n; i++)
     218         976 :             res[i] = _PyUnicode_ExtendedCase[index + i];
     219         967 :         return n;
     220             :     }
     221    11367900 :     res[0] = ch + ctype->lower;
     222    11367900 :     return 1;
     223             : }
     224             : 
     225     6702170 : int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
     226             : {
     227     6702170 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     228             : 
     229     6702170 :     if (ctype->flags & EXTENDED_CASE_MASK) {
     230        1198 :         int index = ctype->title & 0xFFFF;
     231        1198 :         int n = ctype->title >> 24;
     232             :         int i;
     233        2656 :         for (i = 0; i < n; i++)
     234        1458 :             res[i] = _PyUnicode_ExtendedCase[index + i];
     235        1198 :         return n;
     236             :     }
     237     6700970 :     res[0] = ch + ctype->title;
     238     6700970 :     return 1;
     239             : }
     240             : 
     241     6693650 : int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
     242             : {
     243     6693650 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     244             : 
     245     6693650 :     if (ctype->flags & EXTENDED_CASE_MASK) {
     246         953 :         int index = ctype->upper & 0xFFFF;
     247         953 :         int n = ctype->upper >> 24;
     248             :         int i;
     249        2290 :         for (i = 0; i < n; i++)
     250        1337 :             res[i] = _PyUnicode_ExtendedCase[index + i];
     251         953 :         return n;
     252             :     }
     253     6692700 :     res[0] = ch + ctype->upper;
     254     6692700 :     return 1;
     255             : }
     256             : 
     257           9 : int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
     258             : {
     259           9 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     260             : 
     261           9 :     if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
     262           4 :         int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
     263           4 :         int n = (ctype->lower >> 20) & 7;
     264             :         int i;
     265          10 :         for (i = 0; i < n; i++)
     266           6 :             res[i] = _PyUnicode_ExtendedCase[index + i];
     267           4 :         return n;
     268             :     }
     269           5 :     return _PyUnicode_ToLowerFull(ch, res);
     270             : }
     271             : 
     272    11159100 : int _PyUnicode_IsCased(Py_UCS4 ch)
     273             : {
     274    11159100 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     275             : 
     276    11159100 :     return (ctype->flags & CASED_MASK) != 0;
     277             : }
     278             : 
     279          26 : int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
     280             : {
     281          26 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     282             : 
     283          26 :     return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
     284             : }
     285             : 
     286             : /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
     287             :    'Lo' or 'Lm',  0 otherwise. */
     288             : 
     289   166772000 : int _PyUnicode_IsAlpha(Py_UCS4 ch)
     290             : {
     291   166772000 :     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     292             : 
     293   166772000 :     return (ctype->flags & ALPHA_MASK) != 0;
     294             : }
     295             : 

Generated by: LCOV version 1.14