Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Objects/unicodectype.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   Unicode character type helpers.
3
4
   Written by Marc-Andre Lemburg (mal@lemburg.com).
5
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
6
7
   Copyright (c) Corporation for National Research Initiatives.
8
9
*/
10
11
#include "Python.h"
12
13
/* Bit flags stored in the `flags` member of a type record.  Each mask is
   tested by the helper named next to it.  Bits 0x10 and 0x20 are not
   assigned here. */
#define ALPHA_MASK          0x01    /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK        0x02    /* _PyUnicode_ToDecimalDigit */
#define DIGIT_MASK          0x04    /* _PyUnicode_ToDigit */
#define LOWER_MASK          0x08    /* _PyUnicode_IsLowercase */
#define TITLE_MASK          0x40    /* _PyUnicode_IsTitlecase */
#define UPPER_MASK          0x80    /* _PyUnicode_IsUppercase */
#define XID_START_MASK      0x100   /* _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK   0x200   /* _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK      0x400   /* _PyUnicode_IsPrintable */
#define NUMERIC_MASK        0x800   /* _PyUnicode_IsNumeric */
#define CASE_IGNORABLE_MASK 0x1000  /* _PyUnicode_IsCaseIgnorable */
#define CASED_MASK          0x2000  /* _PyUnicode_IsCased */
#define EXTENDED_CASE_MASK  0x4000  /* case fields index _PyUnicode_ExtendedCase */
26
27
/* One entry of the character type table. */
typedef struct {
    /*
       Case mappings.  Each field is either a delta to add to the character
       or, when EXTENDED_CASE_MASK is set in flags, an offset into
       _PyUnicode_ExtendedCase.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    const unsigned char decimal;    /* meaningful when DECIMAL_MASK is set */
    const unsigned char digit;      /* meaningful when DIGIT_MASK is set */
    const unsigned short flags;     /* bitwise OR of the *_MASK constants */
} _PyUnicode_TypeRecord;
40
41
#include "unicodetype_db.h"
42
43
static const _PyUnicode_TypeRecord *
44
gettyperecord(Py_UCS4 code)
45
{
46
    int index;
47
48
    if (code >= 0x110000)
  Branch (48:9): [True: 0, False: 204M]
49
        index = 0;
50
    else
51
    {
52
        index = index1[(code>>SHIFT)];
53
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54
    }
55
56
    return &_PyUnicode_TypeRecords[index];
57
}
58
59
/* Returns the titlecase Unicode characters corresponding to ch or just
60
   ch if no titlecase mapping is known. */
61
62
Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
63
{
64
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
65
66
    if (ctype->flags & EXTENDED_CASE_MASK)
  Branch (66:9): [True: 0, False: 0]
67
        return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
68
    return ch + ctype->title;
69
}
70
71
/* Returns 1 for Unicode characters having the category 'Lt', 0
72
   otherwise. */
73
74
int _PyUnicode_IsTitlecase(Py_UCS4 ch)
75
{
76
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78
    return (ctype->flags & TITLE_MASK) != 0;
79
}
80
81
/* Returns 1 for Unicode characters having the XID_Start property, 0
82
   otherwise. */
83
84
int _PyUnicode_IsXidStart(Py_UCS4 ch)
85
{
86
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88
    return (ctype->flags & XID_START_MASK) != 0;
89
}
90
91
/* Returns 1 for Unicode characters having the XID_Continue property,
92
   0 otherwise. */
93
94
int _PyUnicode_IsXidContinue(Py_UCS4 ch)
95
{
96
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98
    return (ctype->flags & XID_CONTINUE_MASK) != 0;
99
}
100
101
/* Returns the integer decimal (0-9) for Unicode characters having
102
   this property, -1 otherwise. */
103
104
int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
105
{
106
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107
108
    return (ctype->flags & DECIMAL_MASK) ? 
ctype->decimal16.5M
:
-134.0M
;
  Branch (108:12): [True: 16.5M, False: 34.0M]
109
}
110
111
/* Returns 1 if _PyUnicode_ToDecimalDigit() yields a value for ch (i.e. a
   non-negative result), 0 otherwise. */
int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDecimalDigit(ch) >= 0;
}
117
118
/* Returns the integer digit (0-9) for Unicode characters having
119
   this property, -1 otherwise. */
120
121
int _PyUnicode_ToDigit(Py_UCS4 ch)
122
{
123
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124
125
    return (ctype->flags & DIGIT_MASK) ? 
ctype->digit21.3k
:
-132.4M
;
  Branch (125:12): [True: 21.3k, False: 32.4M]
126
}
127
128
/* Returns 1 if _PyUnicode_ToDigit() yields a value for ch (i.e. a
   non-negative result), 0 otherwise. */
int _PyUnicode_IsDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDigit(ch) >= 0;
}
134
135
/* Returns the numeric value as double for Unicode characters having
136
   this property, -1.0 otherwise. */
137
138
int _PyUnicode_IsNumeric(Py_UCS4 ch)
139
{
140
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
141
142
    return (ctype->flags & NUMERIC_MASK) != 0;
143
}
144
145
/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
146
   0 otherwise.
147
   All characters except those characters defined in the Unicode character
148
   database as following categories are considered printable.
149
      * Cc (Other, Control)
150
      * Cf (Other, Format)
151
      * Cs (Other, Surrogate)
152
      * Co (Other, Private Use)
153
      * Cn (Other, Not Assigned)
154
      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
155
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
156
      * Zs (Separator, Space) other than ASCII space('\x20').
157
*/
158
int _PyUnicode_IsPrintable(Py_UCS4 ch)
159
{
160
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
161
162
    return (ctype->flags & PRINTABLE_MASK) != 0;
163
}
164
165
/* Returns 1 for Unicode characters having the category 'Ll', 0
166
   otherwise. */
167
168
int _PyUnicode_IsLowercase(Py_UCS4 ch)
169
{
170
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
171
172
    return (ctype->flags & LOWER_MASK) != 0;
173
}
174
175
/* Returns 1 for Unicode characters having the category 'Lu', 0
176
   otherwise. */
177
178
int _PyUnicode_IsUppercase(Py_UCS4 ch)
179
{
180
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181
182
    return (ctype->flags & UPPER_MASK) != 0;
183
}
184
185
/* Returns the uppercase Unicode characters corresponding to ch or just
186
   ch if no uppercase mapping is known. */
187
188
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
189
{
190
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
191
192
    if (ctype->flags & EXTENDED_CASE_MASK)
  Branch (192:9): [True: 43, False: 16.2k]
193
        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
194
    return ch + ctype->upper;
195
}
196
197
/* Returns the lowercase Unicode characters corresponding to ch or just
198
   ch if no lowercase mapping is known. */
199
200
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
201
{
202
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
203
204
    if (ctype->flags & EXTENDED_CASE_MASK)
  Branch (204:9): [True: 104, False: 95.3k]
205
        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
206
    return ch + ctype->lower;
207
}
208
209
/* Writes the full lowercase mapping of ch into res and returns the number
   of code points written.  The caller must supply a buffer large enough
   for the longest mapping. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base;
    int count;
    int k;

    /* Plain record: exactly one output, ch shifted by the stored delta. */
    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->lower;
        return 1;
    }

    /* Extended record: low 16 bits are the table offset, the bits from 24
       upwards are the length of the sequence. */
    base = rec->lower & 0xFFFF;
    count = rec->lower >> 24;
    k = 0;
    while (k < count) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
        k++;
    }
    return count;
}
224
225
/* Writes the full titlecase mapping of ch into res and returns the number
   of code points written.  The caller must supply a buffer large enough
   for the longest mapping. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base;
    int count;
    int k;

    /* Plain record: exactly one output, ch shifted by the stored delta. */
    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->title;
        return 1;
    }

    /* Extended record: low 16 bits are the table offset, the bits from 24
       upwards are the length of the sequence. */
    base = rec->title & 0xFFFF;
    count = rec->title >> 24;
    k = 0;
    while (k < count) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
        k++;
    }
    return count;
}
240
241
/* Writes the full uppercase mapping of ch into res and returns the number
   of code points written.  The caller must supply a buffer large enough
   for the longest mapping. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base;
    int count;
    int k;

    /* Plain record: exactly one output, ch shifted by the stored delta. */
    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->upper;
        return 1;
    }

    /* Extended record: low 16 bits are the table offset, the bits from 24
       upwards are the length of the sequence. */
    base = rec->upper & 0xFFFF;
    count = rec->upper >> 24;
    k = 0;
    while (k < count) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
        k++;
    }
    return count;
}
256
257
/* Writes the full case-folded form of ch into res and returns the number
   of code points written.  When the record carries no dedicated folding
   data, the result is whatever _PyUnicode_ToLowerFull() produces. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base;
    int count;
    int k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return _PyUnicode_ToLowerFull(ch, res);
    }

    /* Bits 20..22 of `lower` give the length of the folded sequence; zero
       means there is no separate folding. */
    count = (rec->lower >> 20) & 7;
    if (count == 0) {
        return _PyUnicode_ToLowerFull(ch, res);
    }

    /* The folded sequence is stored right after the lowercase sequence:
       skip (lower >> 24) entries past the lowercase offset. */
    base = (rec->lower & 0xFFFF) + (rec->lower >> 24);
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
    }
    return count;
}
271
272
/* Returns 1 if CASED_MASK is set in the type record of ch, 0 otherwise. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASED_MASK) ? 1 : 0;
}
278
279
/* Returns 1 if CASE_IGNORABLE_MASK is set in the type record of ch,
   0 otherwise. */
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK) ? 1 : 0;
}
285
286
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
287
   'Lo' or 'Lm',  0 otherwise. */
288
289
int _PyUnicode_IsAlpha(Py_UCS4 ch)
290
{
291
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
292
293
    return (ctype->flags & ALPHA_MASK) != 0;
294
}
295