/home/mdboom/Work/builds/cpython/Objects/unicodectype.c
Line | Count | Source (jump to first uncovered line) |
1 | /* |
2 | Unicode character type helpers. |
3 | |
4 | Written by Marc-Andre Lemburg (mal@lemburg.com). |
5 | Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) |
6 | |
7 | Copyright (c) Corporation for National Research Initiatives. |
8 | |
9 | */ |
10 | |
11 | #include "Python.h" |
12 | |
/* Bit masks for the `flags` member of a type record.  Each mask is tested
   by the helper named next to it.  The values 0x0010 and 0x0020 are not
   assigned to any mask in this file. */
#define ALPHA_MASK          0x0001  /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK        0x0002  /* _PyUnicode_ToDecimalDigit */
#define DIGIT_MASK          0x0004  /* _PyUnicode_ToDigit */
#define LOWER_MASK          0x0008  /* _PyUnicode_IsLowercase */
#define TITLE_MASK          0x0040  /* _PyUnicode_IsTitlecase */
#define UPPER_MASK          0x0080  /* _PyUnicode_IsUppercase */
#define XID_START_MASK      0x0100  /* _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK   0x0200  /* _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK      0x0400  /* _PyUnicode_IsPrintable */
#define NUMERIC_MASK        0x0800  /* _PyUnicode_IsNumeric */
#define CASE_IGNORABLE_MASK 0x1000  /* _PyUnicode_IsCaseIgnorable */
#define CASED_MASK          0x2000  /* _PyUnicode_IsCased */
#define EXTENDED_CASE_MASK  0x4000  /* case fields index _PyUnicode_ExtendedCase */
26 | |
/* One entry of the character type table consulted by the helpers in this
   file. */
typedef struct {
    /*
       Case-mapping fields.  Without EXTENDED_CASE_MASK in `flags`, each
       value is a delta that is added to the character.  With
       EXTENDED_CASE_MASK, the low 16 bits are an offset into
       _PyUnicode_ExtendedCase and the bits from 24 upwards give the length
       of the full mapping; bits 20-22 of `lower` additionally give the
       length of the case-folded form.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    const unsigned char decimal;    /* returned when DECIMAL_MASK is set */
    const unsigned char digit;      /* returned when DIGIT_MASK is set */
    const unsigned short flags;     /* bitwise OR of the *_MASK values */
} _PyUnicode_TypeRecord;
40 | |
41 | #include "unicodetype_db.h" |
42 | |
43 | static const _PyUnicode_TypeRecord * |
44 | gettyperecord(Py_UCS4 code) |
45 | { |
46 | int index; |
47 | |
48 | if (code >= 0x110000) Branch (48:9): [True: 0, False: 204M]
|
49 | index = 0; |
50 | else |
51 | { |
52 | index = index1[(code>>SHIFT)]; |
53 | index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
54 | } |
55 | |
56 | return &_PyUnicode_TypeRecords[index]; |
57 | } |
58 | |
59 | /* Returns the titlecase Unicode characters corresponding to ch or just |
60 | ch if no titlecase mapping is known. */ |
61 | |
62 | Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch) |
63 | { |
64 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
65 |
|
66 | if (ctype->flags & EXTENDED_CASE_MASK) Branch (66:9): [True: 0, False: 0]
|
67 | return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; |
68 | return ch + ctype->title; |
69 | } |
70 | |
71 | /* Returns 1 for Unicode characters having the category 'Lt', 0 |
72 | otherwise. */ |
73 | |
74 | int _PyUnicode_IsTitlecase(Py_UCS4 ch) |
75 | { |
76 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
77 | |
78 | return (ctype->flags & TITLE_MASK) != 0; |
79 | } |
80 | |
81 | /* Returns 1 for Unicode characters having the XID_Start property, 0 |
82 | otherwise. */ |
83 | |
84 | int _PyUnicode_IsXidStart(Py_UCS4 ch) |
85 | { |
86 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
87 | |
88 | return (ctype->flags & XID_START_MASK) != 0; |
89 | } |
90 | |
91 | /* Returns 1 for Unicode characters having the XID_Continue property, |
92 | 0 otherwise. */ |
93 | |
94 | int _PyUnicode_IsXidContinue(Py_UCS4 ch) |
95 | { |
96 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
97 | |
98 | return (ctype->flags & XID_CONTINUE_MASK) != 0; |
99 | } |
100 | |
101 | /* Returns the integer decimal (0-9) for Unicode characters having |
102 | this property, -1 otherwise. */ |
103 | |
104 | int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) |
105 | { |
106 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
107 | |
108 | return (ctype->flags & DECIMAL_MASK) ? ctype->decimal16.5M : -134.0M ; Branch (108:12): [True: 16.5M, False: 34.0M]
|
109 | } |
110 | |
111 | int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) |
112 | { |
113 | if (_PyUnicode_ToDecimalDigit(ch) < 0) Branch (113:9): [True: 33.7M, False: 16.2M]
|
114 | return 0; |
115 | return 1; |
116 | } |
117 | |
118 | /* Returns the integer digit (0-9) for Unicode characters having |
119 | this property, -1 otherwise. */ |
120 | |
121 | int _PyUnicode_ToDigit(Py_UCS4 ch) |
122 | { |
123 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
124 | |
125 | return (ctype->flags & DIGIT_MASK) ? ctype->digit21.3k : -132.4M ; Branch (125:12): [True: 21.3k, False: 32.4M]
|
126 | } |
127 | |
128 | int _PyUnicode_IsDigit(Py_UCS4 ch) |
129 | { |
130 | if (_PyUnicode_ToDigit(ch) < 0) Branch (130:9): [True: 32.4M, False: 20.9k]
|
131 | return 0; |
132 | return 1; |
133 | } |
134 | |
135 | /* Returns 1 for Unicode characters having the numeric property, 0 |
136 | otherwise. */ |
137 | |
138 | int _PyUnicode_IsNumeric(Py_UCS4 ch) |
139 | { |
140 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
141 | |
142 | return (ctype->flags & NUMERIC_MASK) != 0; |
143 | } |
144 | |
145 | /* Returns 1 for printable Unicode characters, i.e. those that are not |
146 | hex-escaped when repr()ed, 0 otherwise. |
147 | All characters are considered printable except those that the Unicode |
148 | character database assigns to one of the following categories: |
149 | * Cc (Other, Control) |
150 | * Cf (Other, Format) |
151 | * Cs (Other, Surrogate) |
152 | * Co (Other, Private Use) |
153 | * Cn (Other, Not Assigned) |
154 | * Zl Separator, Line ('\u2028', LINE SEPARATOR) |
155 | * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) |
156 | * Zs (Separator, Space) other than ASCII space('\x20'). |
157 | */ |
158 | int _PyUnicode_IsPrintable(Py_UCS4 ch) |
159 | { |
160 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
161 | |
162 | return (ctype->flags & PRINTABLE_MASK) != 0; |
163 | } |
164 | |
165 | /* Returns 1 for Unicode characters having the category 'Ll', 0 |
166 | otherwise. */ |
167 | |
168 | int _PyUnicode_IsLowercase(Py_UCS4 ch) |
169 | { |
170 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
171 | |
172 | return (ctype->flags & LOWER_MASK) != 0; |
173 | } |
174 | |
175 | /* Returns 1 for Unicode characters having the category 'Lu', 0 |
176 | otherwise. */ |
177 | |
178 | int _PyUnicode_IsUppercase(Py_UCS4 ch) |
179 | { |
180 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
181 | |
182 | return (ctype->flags & UPPER_MASK) != 0; |
183 | } |
184 | |
185 | /* Returns the uppercase Unicode characters corresponding to ch or just |
186 | ch if no uppercase mapping is known. */ |
187 | |
188 | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) |
189 | { |
190 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
191 | |
192 | if (ctype->flags & EXTENDED_CASE_MASK) Branch (192:9): [True: 43, False: 16.2k]
|
193 | return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; |
194 | return ch + ctype->upper; |
195 | } |
196 | |
197 | /* Returns the lowercase Unicode characters corresponding to ch or just |
198 | ch if no lowercase mapping is known. */ |
199 | |
200 | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) |
201 | { |
202 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
203 | |
204 | if (ctype->flags & EXTENDED_CASE_MASK) Branch (204:9): [True: 104, False: 95.3k]
|
205 | return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; |
206 | return ch + ctype->lower; |
207 | } |
208 | |
209 | int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) |
210 | { |
211 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
212 | |
213 | if (ctype->flags & EXTENDED_CASE_MASK) { Branch (213:9): [True: 371, False: 1.15M]
|
214 | int index = ctype->lower & 0xFFFF; |
215 | int n = ctype->lower >> 24; |
216 | int i; |
217 | for (i = 0; i < n; i++378 ) Branch (217:21): [True: 378, False: 371]
|
218 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
219 | return n; |
220 | } |
221 | res[0] = ch + ctype->lower; |
222 | return 1; |
223 | } |
224 | |
225 | int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) |
226 | { |
227 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
228 | |
229 | if (ctype->flags & EXTENDED_CASE_MASK) { Branch (229:9): [True: 304, False: 1.11M]
|
230 | int index = ctype->title & 0xFFFF; |
231 | int n = ctype->title >> 24; |
232 | int i; |
233 | for (i = 0; i < n; i++372 ) Branch (233:21): [True: 372, False: 304]
|
234 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
235 | return n; |
236 | } |
237 | res[0] = ch + ctype->title; |
238 | return 1; |
239 | } |
240 | |
241 | int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) |
242 | { |
243 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
244 | |
245 | if (ctype->flags & EXTENDED_CASE_MASK) { Branch (245:9): [True: 357, False: 1.12M]
|
246 | int index = ctype->upper & 0xFFFF; |
247 | int n = ctype->upper >> 24; |
248 | int i; |
249 | for (i = 0; i < n; i++505 ) Branch (249:21): [True: 505, False: 357]
|
250 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
251 | return n; |
252 | } |
253 | res[0] = ch + ctype->upper; |
254 | return 1; |
255 | } |
256 | |
257 | int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) |
258 | { |
259 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
260 | |
261 | if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 74 ) { Branch (261:9): [True: 4, False: 5]
Branch (261:46): [True: 4, False: 0]
|
262 | int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); |
263 | int n = (ctype->lower >> 20) & 7; |
264 | int i; |
265 | for (i = 0; i < n; i++6 ) Branch (265:21): [True: 6, False: 4]
|
266 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
267 | return n; |
268 | } |
269 | return _PyUnicode_ToLowerFull(ch, res); |
270 | } |
271 | |
272 | int _PyUnicode_IsCased(Py_UCS4 ch) |
273 | { |
274 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
275 | |
276 | return (ctype->flags & CASED_MASK) != 0; |
277 | } |
278 | |
279 | int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) |
280 | { |
281 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
282 | |
283 | return (ctype->flags & CASE_IGNORABLE_MASK) != 0; |
284 | } |
285 | |
286 | /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |
287 | 'Lo' or 'Lm', 0 otherwise. */ |
288 | |
289 | int _PyUnicode_IsAlpha(Py_UCS4 ch) |
290 | { |
291 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
292 | |
293 | return (ctype->flags & ALPHA_MASK) != 0; |
294 | } |
295 | |