/home/mdboom/Work/builds/cpython/Objects/unicodectype.c

Source (jump to first uncovered line)
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/

#include "Python.h"

/* Bit masks tested against the `flags` field of a _PyUnicode_TypeRecord.
   Each mask is a single bit; the predicates in this file return non-zero
   when the corresponding bit is set in the record looked up for a code
   point.  Bits 0x10 and 0x20 are not defined in this file.
   NOTE(review): the values presumably have to match the generated table in
   unicodetype_db.h -- confirm before renumbering. */
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
#define LOWER_MASK 0x08
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
#define NUMERIC_MASK 0x800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK 0x2000
/* When set, upper/lower/title are not deltas but packed references into
   _PyUnicode_ExtendedCase (see the case-conversion functions in this file). */
#define EXTENDED_CASE_MASK 0x4000

/* Per-character type record.  gettyperecord() returns a pointer to one of
   these for a given code point.
   NOTE(review): instances presumably come from the generated table in
   unicodetype_db.h with positional initializers -- confirm before changing
   field order or types. */
typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.

       Which interpretation applies is selected by EXTENDED_CASE_MASK in
       `flags`.  In the extended form the code in this file reads the low
       16 bits as the table index and the bits from 24 upwards as the number
       of code points in the mapping; for `lower`, bits 20-22 additionally
       give the length of the case-folded form.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    /* Only meaningful when DECIMAL_MASK / DIGIT_MASK is set in `flags`. */
    const unsigned char decimal;
    const unsigned char digit;
    /* Bitwise OR of the *_MASK constants defined above. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;

#include "unicodetype_db.h"

/* Look up the type record for a code point.

   Code points at or above 0x110000 map to record 0.  Everything else goes
   through a two-level table: the high bits of the code point select a block
   via index1, and the low SHIFT bits select the entry inside that block of
   index2. */
static const _PyUnicode_TypeRecord *
gettyperecord(Py_UCS4 code)
{
    int index = 0;

    if (code < 0x110000) {
        int block = index1[(code >> SHIFT)];
        index = index2[(block << SHIFT) + (code & ((1 << SHIFT) - 1))];
    }

    return &_PyUnicode_TypeRecords[index];
}

/* Returns the single-character titlecase mapping of ch.

   For ordinary records the `title` field is a delta added to ch (so ch is
   returned unchanged when the delta is zero).  For records flagged with
   EXTENDED_CASE_MASK the low 16 bits of `title` index
   _PyUnicode_ExtendedCase and the first stored code point is returned. */

Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return ch + rec->title;
    }
    return _PyUnicode_ExtendedCase[rec->title & 0xFFFF];
}

/* Returns 1 if the type record of ch has TITLE_MASK set (documented as
   category 'Lt'), 0 otherwise. */

int _PyUnicode_IsTitlecase(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & TITLE_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has XID_START_MASK set (the XID_Start
   property), 0 otherwise. */

int _PyUnicode_IsXidStart(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & XID_START_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has XID_CONTINUE_MASK set (the
   XID_Continue property), 0 otherwise. */

int _PyUnicode_IsXidContinue(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & XID_CONTINUE_MASK) ? 1 : 0;
}

/* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise.

   The value is read from the `decimal` field of the type record and is only
   used when DECIMAL_MASK is set in its flags; _PyUnicode_IsDecimalDigit()
   relies on the negative return value to detect "no decimal value". */

int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
}

/* Returns 1 if _PyUnicode_ToDecimalDigit() yields a non-negative value for
   ch, 0 otherwise. */
int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDecimalDigit(ch) >= 0;
}

/* Returns the integer digit (0-9) for Unicode characters having
   this property, -1 otherwise.

   The value is read from the `digit` field of the type record and is only
   used when DIGIT_MASK is set in its flags; _PyUnicode_IsDigit() relies on
   the negative return value to detect "no digit value". */

int _PyUnicode_ToDigit(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
}

/* Returns 1 if _PyUnicode_ToDigit() yields a non-negative value for ch,
   0 otherwise. */
int _PyUnicode_IsDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDigit(ch) >= 0;
}

/* Returns 1 if the type record of ch has NUMERIC_MASK set, 0 otherwise.
   This is a predicate only; it does not return the numeric value itself. */

int _PyUnicode_IsNumeric(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & NUMERIC_MASK) ? 1 : 0;
}

/* Returns 1 for printable Unicode characters (type record has
   PRINTABLE_MASK set), 0 for characters that are to be hex-escaped when
   repr()ed.
   All characters except those characters defined in the Unicode character
   database as following categories are considered printable.
      * Cc (Other, Control)
      * Cf (Other, Format)
      * Cs (Other, Surrogate)
      * Co (Other, Private Use)
      * Cn (Other, Not Assigned)
      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space('\x20').
*/
int _PyUnicode_IsPrintable(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & PRINTABLE_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has LOWER_MASK set (documented as
   category 'Ll'), 0 otherwise. */

int _PyUnicode_IsLowercase(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & LOWER_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has UPPER_MASK set (documented as
   category 'Lu'), 0 otherwise. */

int _PyUnicode_IsUppercase(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & UPPER_MASK) ? 1 : 0;
}

/* Returns the single-character uppercase mapping of ch.

   For ordinary records the `upper` field is a delta added to ch (so ch is
   returned unchanged when the delta is zero).  For records flagged with
   EXTENDED_CASE_MASK the low 16 bits of `upper` index
   _PyUnicode_ExtendedCase and the first stored code point is returned. */

Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return ch + rec->upper;
    }
    return _PyUnicode_ExtendedCase[rec->upper & 0xFFFF];
}

/* Returns the single-character lowercase mapping of ch.

   For ordinary records the `lower` field is a delta added to ch (so ch is
   returned unchanged when the delta is zero).  For records flagged with
   EXTENDED_CASE_MASK the low 16 bits of `lower` index
   _PyUnicode_ExtendedCase and the first stored code point is returned. */

Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return ch + rec->lower;
    }
    return _PyUnicode_ExtendedCase[rec->lower & 0xFFFF];
}

/* Writes the full lowercase mapping of ch into res and returns the number
   of code points written.

   For records flagged with EXTENDED_CASE_MASK, the low 16 bits of `lower`
   give the start index in _PyUnicode_ExtendedCase and the bits from 24
   upwards give the number of code points to copy.  Otherwise `lower` is a
   delta and exactly one code point is written.

   NOTE(review): res must be large enough for the longest mapping; the
   required capacity is not visible here -- confirm against the callers. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        int index = ctype->lower & 0xFFFF;
        int n = ctype->lower >> 24;
        int i;
        for (i = 0; i < n; i++) {
            res[i] = _PyUnicode_ExtendedCase[index + i];
        }
        return n;
    }
    res[0] = ch + ctype->lower;
    return 1;
}

/* Writes the full titlecase mapping of ch into res and returns the number
   of code points written.

   For records flagged with EXTENDED_CASE_MASK, the low 16 bits of `title`
   give the start index in _PyUnicode_ExtendedCase and the bits from 24
   upwards give the number of code points to copy.  Otherwise `title` is a
   delta and exactly one code point is written.

   NOTE(review): res must be large enough for the longest mapping; the
   required capacity is not visible here -- confirm against the callers. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        int index = ctype->title & 0xFFFF;
        int n = ctype->title >> 24;
        int i;
        for (i = 0; i < n; i++) {
            res[i] = _PyUnicode_ExtendedCase[index + i];
        }
        return n;
    }
    res[0] = ch + ctype->title;
    return 1;
}

/* Writes the full uppercase mapping of ch into res and returns the number
   of code points written.

   For records flagged with EXTENDED_CASE_MASK, the low 16 bits of `upper`
   give the start index in _PyUnicode_ExtendedCase and the bits from 24
   upwards give the number of code points to copy.  Otherwise `upper` is a
   delta and exactly one code point is written.

   NOTE(review): res must be large enough for the longest mapping; the
   required capacity is not visible here -- confirm against the callers. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        int index = ctype->upper & 0xFFFF;
        int n = ctype->upper >> 24;
        int i;
        for (i = 0; i < n; i++) {
            res[i] = _PyUnicode_ExtendedCase[index + i];
        }
        return n;
    }
    res[0] = ch + ctype->upper;
    return 1;
}

/* Writes the full case-folded form of ch into res and returns the number of
   code points written.

   A dedicated folding is used only when the record is flagged with
   EXTENDED_CASE_MASK and bits 20-22 of `lower` (the fold length) are
   non-zero.  In that case the folded sequence starts in
   _PyUnicode_ExtendedCase right after the lowercase sequence: the lowercase
   start index (low 16 bits) plus the lowercase length (bits 24 and up).
   In every other case the result is whatever _PyUnicode_ToLowerFull()
   produces.

   NOTE(review): res must be large enough for the longest mapping; the
   required capacity is not visible here -- confirm against the callers. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
        int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
        int n = (ctype->lower >> 20) & 7;
        int i;
        for (i = 0; i < n; i++) {
            res[i] = _PyUnicode_ExtendedCase[index + i];
        }
        return n;
    }
    return _PyUnicode_ToLowerFull(ch, res);
}

/* Returns 1 if the type record of ch has CASED_MASK set, 0 otherwise. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & CASED_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has CASE_IGNORABLE_MASK set,
   0 otherwise. */
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & CASE_IGNORABLE_MASK) ? 1 : 0;
}

/* Returns 1 if the type record of ch has ALPHA_MASK set (documented as
   categories 'Ll', 'Lu', 'Lt', 'Lo' or 'Lm'), 0 otherwise. */

int _PyUnicode_IsAlpha(Py_UCS4 ch)
{
    const unsigned short flags = gettyperecord(ch)->flags;

    return (flags & ALPHA_MASK) ? 1 : 0;
}


Coverage Report

Created: 2022-07-08 09:39

Line	Count	Source (jump to first uncovered line)
1	/*
2	Unicode character type helpers.
3
4	Written by Marc-Andre Lemburg (mal@lemburg.com).
5	Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
6
7	Copyright (c) Corporation for National Research Initiatives.
8
9	*/
10
11	#include "Python.h"
12
13	#define ALPHA_MASK 0x01
14	#define DECIMAL_MASK 0x02
15	#define DIGIT_MASK 0x04
16	#define LOWER_MASK 0x08
17	#define TITLE_MASK 0x40
18	#define UPPER_MASK 0x80
19	#define XID_START_MASK 0x100
20	#define XID_CONTINUE_MASK 0x200
21	#define PRINTABLE_MASK 0x400
22	#define NUMERIC_MASK 0x800
23	#define CASE_IGNORABLE_MASK 0x1000
24	#define CASED_MASK 0x2000
25	#define EXTENDED_CASE_MASK 0x4000
26
27	typedef struct {
28	/*
29	These are either deltas to the character or offsets in
30	_PyUnicode_ExtendedCase.
31	*/
32	const int upper;
33	const int lower;
34	const int title;
35	/* Note if more flag space is needed, decimal and digit could be unified. */
36	const unsigned char decimal;
37	const unsigned char digit;
38	const unsigned short flags;
39	} _PyUnicode_TypeRecord;
40
41	#include "unicodetype_db.h"
42
43	static const _PyUnicode_TypeRecord *
44	gettyperecord(Py_UCS4 code)
45	{
46	int index;
47
48	if (code >= 0x110000) Branch (48:9): [True: 0, False: 204M]
49	index = 0;
50	else
51	{
52	index = index1[(code>>SHIFT)];
53	index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54	}
55
56	return &_PyUnicode_TypeRecords[index];
57	}
58
59	/* Returns the titlecase Unicode characters corresponding to ch or just
60	ch if no titlecase mapping is known. */
61
62	Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
63	{
64	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
65
66	if (ctype->flags & EXTENDED_CASE_MASK) Branch (66:9): [True: 0, False: 0]
67	return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
68	return ch + ctype->title;
69	}
70
71	/* Returns 1 for Unicode characters having the category 'Lt', 0
72	otherwise. */
73
74	int _PyUnicode_IsTitlecase(Py_UCS4 ch)
75	{
76	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78	return (ctype->flags & TITLE_MASK) != 0;
79	}
80
81	/* Returns 1 for Unicode characters having the XID_Start property, 0
82	otherwise. */
83
84	int _PyUnicode_IsXidStart(Py_UCS4 ch)
85	{
86	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88	return (ctype->flags & XID_START_MASK) != 0;
89	}
90
91	/* Returns 1 for Unicode characters having the XID_Continue property,
92	0 otherwise. */
93
94	int _PyUnicode_IsXidContinue(Py_UCS4 ch)
95	{
96	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98	return (ctype->flags & XID_CONTINUE_MASK) != 0;
99	}
100
101	/* Returns the integer decimal (0-9) for Unicode characters having
102	this property, -1 otherwise. */
103
104	int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
105	{
106	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107
108	return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; Branch (108:12): [True: 16.5M, False: 34.0M]
109	}
110
111	int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
112	{
113	if (_PyUnicode_ToDecimalDigit(ch) < 0) Branch (113:9): [True: 33.7M, False: 16.2M]
114	return 0;
115	return 1;
116	}
117
118	/* Returns the integer digit (0-9) for Unicode characters having
119	this property, -1 otherwise. */
120
121	int _PyUnicode_ToDigit(Py_UCS4 ch)
122	{
123	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124
125	return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; Branch (125:12): [True: 21.3k, False: 32.4M]
126	}
127
128	int _PyUnicode_IsDigit(Py_UCS4 ch)
129	{
130	if (_PyUnicode_ToDigit(ch) < 0) Branch (130:9): [True: 32.4M, False: 20.9k]
131	return 0;
132	return 1;
133	}
134
135	/* Returns the numeric value as double for Unicode characters having
136	this property, -1.0 otherwise. */
137
138	int _PyUnicode_IsNumeric(Py_UCS4 ch)
139	{
140	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
141
142	return (ctype->flags & NUMERIC_MASK) != 0;
143	}
144
145	/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
146	0 otherwise.
147	All characters except those characters defined in the Unicode character
148	database as following categories are considered printable.
149	* Cc (Other, Control)
150	* Cf (Other, Format)
151	* Cs (Other, Surrogate)
152	* Co (Other, Private Use)
153	* Cn (Other, Not Assigned)
154	* Zl Separator, Line ('\u2028', LINE SEPARATOR)
155	* Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
156	* Zs (Separator, Space) other than ASCII space('\x20').
157	*/
158	int _PyUnicode_IsPrintable(Py_UCS4 ch)
159	{
160	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
161
162	return (ctype->flags & PRINTABLE_MASK) != 0;
163	}
164
165	/* Returns 1 for Unicode characters having the category 'Ll', 0
166	otherwise. */
167
168	int _PyUnicode_IsLowercase(Py_UCS4 ch)
169	{
170	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
171
172	return (ctype->flags & LOWER_MASK) != 0;
173	}
174
175	/* Returns 1 for Unicode characters having the category 'Lu', 0
176	otherwise. */
177
178	int _PyUnicode_IsUppercase(Py_UCS4 ch)
179	{
180	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181
182	return (ctype->flags & UPPER_MASK) != 0;
183	}
184
185	/* Returns the uppercase Unicode characters corresponding to ch or just
186	ch if no uppercase mapping is known. */
187
188	Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
189	{
190	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
191
192	if (ctype->flags & EXTENDED_CASE_MASK) Branch (192:9): [True: 43, False: 16.2k]
193	return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
194	return ch + ctype->upper;
195	}
196
197	/* Returns the lowercase Unicode characters corresponding to ch or just
198	ch if no lowercase mapping is known. */
199
200	Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
201	{
202	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
203
204	if (ctype->flags & EXTENDED_CASE_MASK) Branch (204:9): [True: 104, False: 95.3k]
205	return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
206	return ch + ctype->lower;
207	}
208
209	int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
210	{
211	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
212
213	if (ctype->flags & EXTENDED_CASE_MASK) { Branch (213:9): [True: 371, False: 1.15M]
214	int index = ctype->lower & 0xFFFF;
215	int n = ctype->lower >> 24;
216	int i;
217	for (i = 0; i < n; i++) Branch (217:21): [True: 378, False: 371]
218	res[i] = _PyUnicode_ExtendedCase[index + i];
219	return n;
220	}
221	res[0] = ch + ctype->lower;
222	return 1;
223	}
224
225	int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
226	{
227	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
228
229	if (ctype->flags & EXTENDED_CASE_MASK) { Branch (229:9): [True: 304, False: 1.11M]
230	int index = ctype->title & 0xFFFF;
231	int n = ctype->title >> 24;
232	int i;
233	for (i = 0; i < n; i++) Branch (233:21): [True: 372, False: 304]
234	res[i] = _PyUnicode_ExtendedCase[index + i];
235	return n;
236	}
237	res[0] = ch + ctype->title;
238	return 1;
239	}
240
241	int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
242	{
243	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
244
245	if (ctype->flags & EXTENDED_CASE_MASK) { Branch (245:9): [True: 357, False: 1.12M]
246	int index = ctype->upper & 0xFFFF;
247	int n = ctype->upper >> 24;
248	int i;
249	for (i = 0; i < n; i++) Branch (249:21): [True: 505, False: 357]
250	res[i] = _PyUnicode_ExtendedCase[index + i];
251	return n;
252	}
253	res[0] = ch + ctype->upper;
254	return 1;
255	}
256
257	int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
258	{
259	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
260
261	if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { Branch (261:9): [True: 4, False: 5] Branch (261:46): [True: 4, False: 0]
262	int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
263	int n = (ctype->lower >> 20) & 7;
264	int i;
265	for (i = 0; i < n; i++) Branch (265:21): [True: 6, False: 4]
266	res[i] = _PyUnicode_ExtendedCase[index + i];
267	return n;
268	}
269	return _PyUnicode_ToLowerFull(ch, res);
270	}
271
272	int _PyUnicode_IsCased(Py_UCS4 ch)
273	{
274	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
275
276	return (ctype->flags & CASED_MASK) != 0;
277	}
278
279	int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
280	{
281	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
282
283	return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
284	}
285
286	/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
287	'Lo' or 'Lm', 0 otherwise. */
288
289	int _PyUnicode_IsAlpha(Py_UCS4 ch)
290	{
291	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
292
293	return (ctype->flags & ALPHA_MASK) != 0;
294	}
295