Line data Source code
1 : /* 2 : Unicode character type helpers. 3 : 4 : Written by Marc-Andre Lemburg (mal@lemburg.com). 5 : Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 6 : 7 : Copyright (c) Corporation for National Research Initiatives. 8 : 9 : */ 10 : 11 : #include "Python.h" 12 : 13 : #define ALPHA_MASK 0x01 14 : #define DECIMAL_MASK 0x02 15 : #define DIGIT_MASK 0x04 16 : #define LOWER_MASK 0x08 17 : #define TITLE_MASK 0x40 18 : #define UPPER_MASK 0x80 19 : #define XID_START_MASK 0x100 20 : #define XID_CONTINUE_MASK 0x200 21 : #define PRINTABLE_MASK 0x400 22 : #define NUMERIC_MASK 0x800 23 : #define CASE_IGNORABLE_MASK 0x1000 24 : #define CASED_MASK 0x2000 25 : #define EXTENDED_CASE_MASK 0x4000 26 : 27 : typedef struct { 28 : /* 29 : These are either deltas to the character or offsets in 30 : _PyUnicode_ExtendedCase. 31 : */ 32 : const int upper; 33 : const int lower; 34 : const int title; 35 : /* Note if more flag space is needed, decimal and digit could be unified. */ 36 : const unsigned char decimal; 37 : const unsigned char digit; 38 : const unsigned short flags; 39 : } _PyUnicode_TypeRecord; 40 : 41 : #include "unicodetype_db.h" 42 : 43 : static const _PyUnicode_TypeRecord * 44 537046000 : gettyperecord(Py_UCS4 code) 45 : { 46 : int index; 47 : 48 537046000 : if (code >= 0x110000) 49 0 : index = 0; 50 : else 51 : { 52 537046000 : index = index1[(code>>SHIFT)]; 53 537046000 : index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 54 : } 55 : 56 537046000 : return &_PyUnicode_TypeRecords[index]; 57 : } 58 : 59 : /* Returns the titlecase Unicode characters corresponding to ch or just 60 : ch if no titlecase mapping is known. */ 61 : 62 0 : Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch) 63 : { 64 0 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 65 : 66 0 : if (ctype->flags & EXTENDED_CASE_MASK) 67 0 : return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; 68 0 : return ch + ctype->title; 69 : } 70 : 71 : /* Returns 1 for Unicode characters having the category 'Lt', 0 72 : otherwise. */ 73 : 74 27093900 : int _PyUnicode_IsTitlecase(Py_UCS4 ch) 75 : { 76 27093900 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 77 : 78 27093900 : return (ctype->flags & TITLE_MASK) != 0; 79 : } 80 : 81 : /* Returns 1 for Unicode characters having the XID_Start property, 0 82 : otherwise. */ 83 : 84 6121190 : int _PyUnicode_IsXidStart(Py_UCS4 ch) 85 : { 86 6121190 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 87 : 88 6121190 : return (ctype->flags & XID_START_MASK) != 0; 89 : } 90 : 91 : /* Returns 1 for Unicode characters having the XID_Continue property, 92 : 0 otherwise. */ 93 : 94 2231550 : int _PyUnicode_IsXidContinue(Py_UCS4 ch) 95 : { 96 2231550 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 97 : 98 2231550 : return (ctype->flags & XID_CONTINUE_MASK) != 0; 99 : } 100 : 101 : /* Returns the integer decimal (0-9) for Unicode characters having 102 : this property, -1 otherwise. */ 103 : 104 111923000 : int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) 105 : { 106 111923000 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 107 : 108 111923000 : return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; 109 : } 110 : 111 96189800 : int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) 112 : { 113 96189800 : if (_PyUnicode_ToDecimalDigit(ch) < 0) 114 67894800 : return 0; 115 28295000 : return 1; 116 : } 117 : 118 : /* Returns the integer digit (0-9) for Unicode characters having 119 : this property, -1 otherwise. */ 120 : 121 67015600 : int _PyUnicode_ToDigit(Py_UCS4 ch) 122 : { 123 67015600 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 124 : 125 67015600 : return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; 126 : } 127 : 128 65835900 : int _PyUnicode_IsDigit(Py_UCS4 ch) 129 : { 130 65835900 : if (_PyUnicode_ToDigit(ch) < 0) 131 65808800 : return 0; 132 27119 : return 1; 133 : } 134 : 135 : /* Returns the numeric value as double for Unicode characters having 136 : this property, -1.0 otherwise. */ 137 : 138 65803200 : int _PyUnicode_IsNumeric(Py_UCS4 ch) 139 : { 140 65803200 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 141 : 142 65803200 : return (ctype->flags & NUMERIC_MASK) != 0; 143 : } 144 : 145 : /* Returns 1 for Unicode characters to be hex-escaped when repr()ed, 146 : 0 otherwise. 147 : All characters except those characters defined in the Unicode character 148 : database as following categories are considered printable. 149 : * Cc (Other, Control) 150 : * Cf (Other, Format) 151 : * Cs (Other, Surrogate) 152 : * Co (Other, Private Use) 153 : * Cn (Other, Not Assigned) 154 : * Zl Separator, Line ('\u2028', LINE SEPARATOR) 155 : * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) 156 : * Zs (Separator, Space) other than ASCII space('\x20'). 157 : */ 158 16287100 : int _PyUnicode_IsPrintable(Py_UCS4 ch) 159 : { 160 16287100 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 161 : 162 16287100 : return (ctype->flags & PRINTABLE_MASK) != 0; 163 : } 164 : 165 : /* Returns 1 for Unicode characters having the category 'Ll', 0 166 : otherwise. */ 167 : 168 25065200 : int _PyUnicode_IsLowercase(Py_UCS4 ch) 169 : { 170 25065200 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 171 : 172 25065200 : return (ctype->flags & LOWER_MASK) != 0; 173 : } 174 : 175 : /* Returns 1 for Unicode characters having the category 'Lu', 0 176 : otherwise. */ 177 : 178 12442000 : int _PyUnicode_IsUppercase(Py_UCS4 ch) 179 : { 180 12442000 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 181 : 182 12442000 : return (ctype->flags & UPPER_MASK) != 0; 183 : } 184 : 185 : /* Returns the uppercase Unicode characters corresponding to ch or just 186 : ch if no uppercase mapping is known. */ 187 : 188 58626 : Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) 189 : { 190 58626 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 191 : 192 58626 : if (ctype->flags & EXTENDED_CASE_MASK) 193 43 : return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; 194 58583 : return ch + ctype->upper; 195 : } 196 : 197 : /* Returns the lowercase Unicode characters corresponding to ch or just 198 : ch if no lowercase mapping is known. */ 199 : 200 309442 : Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) 201 : { 202 309442 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 203 : 204 309442 : if (ctype->flags & EXTENDED_CASE_MASK) 205 104 : return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; 206 309338 : return ch + ctype->lower; 207 : } 208 : 209 11368900 : int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) 210 : { 211 11368900 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 212 : 213 11368900 : if (ctype->flags & EXTENDED_CASE_MASK) { 214 967 : int index = ctype->lower & 0xFFFF; 215 967 : int n = ctype->lower >> 24; 216 : int i; 217 1943 : for (i = 0; i < n; i++) 218 976 : res[i] = _PyUnicode_ExtendedCase[index + i]; 219 967 : return n; 220 : } 221 11367900 : res[0] = ch + ctype->lower; 222 11367900 : return 1; 223 : } 224 : 225 6702170 : int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) 226 : { 227 6702170 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 228 : 229 6702170 : if (ctype->flags & EXTENDED_CASE_MASK) { 230 1198 : int index = ctype->title & 0xFFFF; 231 1198 : int n = ctype->title >> 24; 232 : int i; 233 2656 : for (i = 0; i < n; i++) 234 1458 : res[i] = _PyUnicode_ExtendedCase[index + i]; 235 1198 : return n; 236 : } 237 6700970 : res[0] = ch + ctype->title; 238 6700970 : return 1; 239 : } 240 : 241 6693650 : int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) 242 : { 243 6693650 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 244 : 245 6693650 : if (ctype->flags & EXTENDED_CASE_MASK) { 246 953 : int index = ctype->upper & 0xFFFF; 247 953 : int n = ctype->upper >> 24; 248 : int i; 249 2290 : for (i = 0; i < n; i++) 250 1337 : res[i] = _PyUnicode_ExtendedCase[index + i]; 251 953 : return n; 252 : } 253 6692700 : res[0] = ch + ctype->upper; 254 6692700 : return 1; 255 : } 256 : 257 9 : int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) 258 : { 259 9 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 260 : 261 9 : if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { 262 4 : int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); 263 4 : int n = (ctype->lower >> 20) & 7; 264 : int i; 265 10 : for (i = 0; i < n; i++) 266 6 : res[i] = _PyUnicode_ExtendedCase[index + i]; 267 4 : return n; 268 : } 269 5 : return _PyUnicode_ToLowerFull(ch, res); 270 : } 271 : 272 11159100 : int _PyUnicode_IsCased(Py_UCS4 ch) 273 : { 274 11159100 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 275 : 276 11159100 : return (ctype->flags & CASED_MASK) != 0; 277 : } 278 : 279 26 : int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) 280 : { 281 26 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 282 : 283 26 : return (ctype->flags & CASE_IGNORABLE_MASK) != 0; 284 : } 285 : 286 : /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', 287 : 'Lo' or 'Lm', 0 otherwise. */ 288 : 289 166772000 : int _PyUnicode_IsAlpha(Py_UCS4 ch) 290 : { 291 166772000 : const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 292 : 293 166772000 : return (ctype->flags & ALPHA_MASK) != 0; 294 : } 295 :