/home/mdboom/Work/builds/cpython/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | /* stringlib: codec implementations */ |
2 | |
3 | #if !STRINGLIB_IS_UNICODE |
4 | # error "codecs.h is specific to Unicode" |
5 | #endif |
6 | |
7 | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | |
9 | /* Mask to quickly check whether a C 'size_t' contains a |
10 | non-ASCII, UTF8-encoded char. */ |
11 | #if (SIZEOF_SIZE_T == 8) |
12 | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | #elif (SIZEOF_SIZE_T == 4) |
14 | # define ASCII_CHAR_MASK 0x80808080U |
15 | #else |
16 | # error C 'size_t' size should be either 4 or 8! |
17 | #endif |
18 | |
19 | /* 10xxxxxx */ |
20 | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | |
22 | Py_LOCAL_INLINE(Py_UCS4) |
23 | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | STRINGLIB_CHAR *dest, |
25 | Py_ssize_t *outpos) |
26 | { |
27 | Py_UCS4 ch; |
28 | const char *s = *inptr; |
29 | STRINGLIB_CHAR *p = dest + *outpos; |
30 | |
31 | while (s < end) { Branch (31:12): [True: 264k, False: 380]
Branch (31:12): [True: 473k, False: 35.9k]
Branch (31:12): [True: 959k, False: 46.8k]
Branch (31:12): [True: 1.07M, False: 2.60k]
|
32 | ch = (unsigned char)*s; |
33 | |
34 | if (ch < 0x80) { Branch (34:13): [True: 1.50k, False: 263k]
Branch (34:13): [True: 440k, False: 32.2k]
Branch (34:13): [True: 214k, False: 744k]
Branch (34:13): [True: 17.4k, False: 1.05M]
|
35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | input will consist of an overwhelming majority of ASCII |
37 | characters, we try to optimize for this case by checking |
38 | as many characters as a C 'size_t' can contain. |
39 | First, check if we can do an aligned read, as most CPUs have |
40 | a penalty for unaligned reads. |
41 | */ |
42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | /* Help register allocation */ |
44 | const char *_s = s; |
45 | STRINGLIB_CHAR *_p = p; |
46 | while (_s + SIZEOF_SIZE_T <= end) { Branch (46:24): [True: 24, False: 302]
Branch (46:24): [True: 325k, False: 34.6k]
Branch (46:24): [True: 523k, False: 12.1k]
Branch (46:24): [True: 87.1k, False: 1.99k]
|
47 | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | and do a fast unrolled copy if it only contains ASCII |
49 | characters. */ |
50 | size_t value = *(const size_t *) _s; |
51 | if (value & ASCII_CHAR_MASK) Branch (51:25): [True: 0, False: 24]
Branch (51:25): [True: 13.5k, False: 312k]
Branch (51:25): [True: 14.8k, False: 508k]
Branch (51:25): [True: 179, False: 86.9k]
|
52 | break; |
53 | #if PY_LITTLE_ENDIAN |
54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | # if SIZEOF_SIZE_T == 8 |
59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | # endif |
64 | #else |
65 | # if SIZEOF_SIZE_T == 8 |
66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | # else |
75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | # endif |
80 | #endif |
81 | _s += SIZEOF_SIZE_T; |
82 | _p += SIZEOF_SIZE_T; |
83 | } |
84 | s = _s; |
85 | p = _p; |
86 | if (s == end) Branch (86:21): [True: 0, False: 302]
Branch (86:21): [True: 4.43k, False: 43.7k]
Branch (86:21): [True: 1.49k, False: 25.4k]
Branch (86:21): [True: 128, False: 2.04k]
|
87 | break; |
88 | ch = (unsigned char)*s; |
89 | } |
90 | if (ch < 0x80) { Branch (90:17): [True: 1.50k, False: 0]
Branch (90:17): [True: 436k, False: 83]
Branch (90:17): [True: 212k, False: 551]
Branch (90:17): [True: 17.2k, False: 10]
|
91 | s++; |
92 | *p++ = ch; |
93 | continue; |
94 | } |
95 | } |
96 | |
97 | if (ch < 0xE0) { Branch (97:13): [True: 59.8k, False: 203k]
Branch (97:13): [True: 29.5k, False: 2.78k]
Branch (97:13): [True: 21.2k, False: 724k]
Branch (97:13): [True: 857, False: 1.05M]
|
98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | Py_UCS4 ch2; |
100 | if (ch < 0xC2) { Branch (100:17): [True: 561, False: 59.2k]
Branch (100:17): [True: 2, False: 29.5k]
Branch (100:17): [True: 368, False: 20.8k]
Branch (100:17): [True: 12, False: 845]
|
101 | /* invalid sequence |
102 | \x80-\xBF -- continuation byte |
103 | \xC0-\xC1 -- fake 0000-007F */ |
104 | goto InvalidStart; |
105 | } |
106 | if (end - s < 2) { Branch (106:17): [True: 1.20k, False: 58.0k]
Branch (106:17): [True: 49, False: 29.4k]
Branch (106:17): [True: 778, False: 20.0k]
Branch (106:17): [True: 24, False: 821]
|
107 | /* unexpected end of data: the caller will decide whether |
108 | it's an error or not */ |
109 | break; |
110 | } |
111 | ch2 = (unsigned char)s[1]; |
112 | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | /* invalid continuation byte */ |
114 | goto InvalidContinuation1; |
115 | ch = (ch << 6) + ch2 - |
116 | ((0xC0 << 6) + 0x80); |
117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | s += 2; |
119 | if (STRINGLIB_MAX_CHAR <= 0x007F || Branch (119:17): [Folded - Ignored]
Branch (119:17): [Folded - Ignored]
Branch (119:17): [Folded - Ignored]
Branch (119:17): [Folded - Ignored]
|
120 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 304, False: 29.1k]
Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
|
121 | /* Out-of-range */ |
122 | goto Return; |
123 | *p++ = ch; |
124 | continue; |
125 | } |
126 | |
127 | if (ch < 0xF0) { Branch (127:13): [True: 200k, False: 2.47k]
Branch (127:13): [True: 2.77k, False: 10]
Branch (127:13): [True: 718k, False: 5.56k]
Branch (127:13): [True: 5.30k, False: 1.05M]
|
128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | Py_UCS4 ch2, ch3; |
130 | if (end - s < 3) { Branch (130:17): [True: 40.2k, False: 160k]
Branch (130:17): [True: 1.02k, False: 1.75k]
Branch (130:17): [True: 78.5k, False: 640k]
Branch (130:17): [True: 99, False: 5.20k]
|
131 | /* unexpected end of data: the caller will decide whether |
132 | it's an error or not */ |
133 | if (end - s < 2) Branch (133:21): [True: 16.4k, False: 23.8k]
Branch (133:21): [True: 617, False: 403]
Branch (133:21): [True: 41.0k, False: 37.5k]
Branch (133:21): [True: 81, False: 18]
|
134 | break; |
135 | ch2 = (unsigned char)s[1]; |
136 | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) Branch (137:21): [True: 22, False: 23.7k]
Branch (137:22): [True: 16.4k, False: 7.32k]
Branch (137:21): [True: 0, False: 403]
Branch (137:22): [True: 3, False: 400]
Branch (137:21): [True: 0, False: 37.4k]
Branch (137:22): [True: 28.1k, False: 9.32k]
Branch (137:21): [True: 0, False: 18]
Branch (137:22): [True: 0, False: 18]
|
138 | /* for clarification see comments below */ |
139 | goto InvalidContinuation1; |
140 | break; |
141 | } |
142 | ch2 = (unsigned char)s[1]; |
143 | ch3 = (unsigned char)s[2]; |
144 | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | /* invalid continuation byte */ |
146 | goto InvalidContinuation1; |
147 | } |
148 | if (ch == 0xE0) { Branch (148:17): [True: 3.96k, False: 152k]
Branch (148:17): [True: 1.39k, False: 351]
Branch (148:17): [True: 7.79k, False: 632k]
Branch (148:17): [True: 4, False: 5.19k]
|
149 | if (ch2 < 0xA0) Branch (149:21): [True: 68, False: 3.89k]
Branch (149:21): [True: 0, False: 1.39k]
Branch (149:21): [True: 0, False: 7.79k]
Branch (149:21): [True: 0, False: 4]
|
150 | /* invalid sequence |
151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | goto InvalidContinuation1; |
153 | } else if (ch == 0xED && ch2 >= 0xA0) { Branch (153:24): [True: 3.38k, False: 148k]
Branch (153:38): [True: 399, False: 2.98k]
Branch (153:24): [True: 1, False: 350]
Branch (153:38): [True: 1, False: 0]
Branch (153:24): [True: 13.4k, False: 618k]
Branch (153:38): [True: 234, False: 13.1k]
Branch (153:24): [True: 17, False: 5.18k]
Branch (153:38): [True: 17, False: 0]
|
154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | will result in surrogates in range D800-DFFF. Surrogates are |
156 | not valid UTF-8 so they are rejected. |
157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | goto InvalidContinuation1; |
160 | } |
161 | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | /* invalid continuation byte */ |
163 | goto InvalidContinuation2; |
164 | } |
165 | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | s += 3; |
169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || Branch (169:17): [Folded - Ignored]
Branch (169:17): [Folded - Ignored]
Branch (169:17): [Folded - Ignored]
Branch (169:17): [Folded - Ignored]
|
170 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
|
171 | /* Out-of-range */ |
172 | goto Return; |
173 | *p++ = ch; |
174 | continue; |
175 | } |
176 | |
177 | if (ch < 0xF5) { Branch (177:13): [True: 2.10k, False: 368]
Branch (177:13): [True: 5, False: 5]
Branch (177:13): [True: 4.72k, False: 833]
Branch (177:13): [True: 1.05M, False: 0]
|
178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | Py_UCS4 ch2, ch3, ch4; |
180 | if (end - s < 4) { Branch (180:17): [True: 465, False: 1.64k]
Branch (180:17): [True: 1, False: 4]
Branch (180:17): [True: 3.98k, False: 748]
Branch (180:17): [True: 90, False: 1.05M]
|
181 | /* unexpected end of data: the caller will decide whether |
182 | it's an error or not */ |
183 | if (end - s < 2) Branch (183:21): [True: 56, False: 409]
Branch (183:21): [True: 0, False: 1]
Branch (183:21): [True: 3.91k, False: 68]
Branch (183:21): [True: 54, False: 36]
|
184 | break; |
185 | ch2 = (unsigned char)s[1]; |
186 | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) Branch (187:21): [True: 18, False: 317]
Branch (187:22): [True: 114, False: 221]
Branch (187:21): [True: 0, False: 0]
Branch (187:22): [True: 0, False: 0]
Branch (187:21): [True: 0, False: 34]
Branch (187:22): [True: 7, False: 27]
Branch (187:21): [True: 0, False: 36]
Branch (187:22): [True: 0, False: 36]
|
188 | /* for clarification see comments below */ |
189 | goto InvalidContinuation1; |
190 | if (end - s < 3) Branch (190:21): [True: 94, False: 223]
Branch (190:21): [True: 0, False: 0]
Branch (190:21): [True: 12, False: 22]
Branch (190:21): [True: 18, False: 18]
|
191 | break; |
192 | ch3 = (unsigned char)s[2]; |
193 | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | goto InvalidContinuation2; |
195 | break; |
196 | } |
197 | ch2 = (unsigned char)s[1]; |
198 | ch3 = (unsigned char)s[2]; |
199 | ch4 = (unsigned char)s[3]; |
200 | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | /* invalid continuation byte */ |
202 | goto InvalidContinuation1; |
203 | } |
204 | if (ch == 0xF0) { Branch (204:17): [True: 1.05k, False: 519]
Branch (204:17): [True: 4, False: 0]
Branch (204:17): [True: 664, False: 62]
Branch (204:17): [True: 199k, False: 852k]
|
205 | if (ch2 < 0x90) Branch (205:21): [True: 36, False: 1.01k]
Branch (205:21): [True: 0, False: 4]
Branch (205:21): [True: 0, False: 664]
Branch (205:21): [True: 0, False: 199k]
|
206 | /* invalid sequence |
207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | goto InvalidContinuation1; |
209 | } else if (ch == 0xF4 && ch2 >= 0x90) { Branch (209:24): [True: 265, False: 254]
Branch (209:38): [True: 100, False: 165]
Branch (209:24): [True: 0, False: 0]
Branch (209:38): [True: 0, False: 0]
Branch (209:24): [True: 55, False: 7]
Branch (209:38): [True: 0, False: 55]
Branch (209:24): [True: 66.4k, False: 786k]
Branch (209:38): [True: 0, False: 66.4k]
|
210 | /* invalid sequence |
211 | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | goto InvalidContinuation1; |
213 | } |
214 | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | /* invalid continuation byte */ |
216 | goto InvalidContinuation2; |
217 | } |
218 | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | /* invalid continuation byte */ |
220 | goto InvalidContinuation3; |
221 | } |
222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | s += 4; |
226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || Branch (226:17): [Folded - Ignored]
Branch (226:17): [Folded - Ignored]
Branch (226:17): [Folded - Ignored]
Branch (226:17): [Folded - Ignored]
|
227 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
|
228 | /* Out-of-range */ |
229 | goto Return; |
230 | *p++ = ch; |
231 | continue; |
232 | } |
233 | goto InvalidStart; |
234 | } |
235 | ch = 0; |
236 | Return: |
237 | *inptr = s; |
238 | *outpos = p - dest; |
239 | return ch; |
240 | InvalidStart: |
241 | ch = 1; |
242 | goto Return; |
243 | InvalidContinuation1: |
244 | ch = 2; |
245 | goto Return; |
246 | InvalidContinuation2: |
247 | ch = 3; |
248 | goto Return; |
249 | InvalidContinuation3: |
250 | ch = 4; |
251 | goto Return; |
252 | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | { | 27 | Py_UCS4 ch; | 28 | const char *s = *inptr; | 29 | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | 31 | while (s < end) { Branch (31:12): [True: 264k, False: 380]
| 32 | ch = (unsigned char)*s; | 33 | | 34 | if (ch < 0x80) { Branch (34:13): [True: 1.50k, False: 263k]
| 35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | input will consist of an overwhelming majority of ASCII | 37 | characters, we try to optimize for this case by checking | 38 | as many characters as a C 'size_t' can contain. | 39 | First, check if we can do an aligned read, as most CPUs have | 40 | a penalty for unaligned reads. | 41 | */ | 42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | /* Help register allocation */ | 44 | const char *_s = s; | 45 | STRINGLIB_CHAR *_p = p; | 46 | while (_s + SIZEOF_SIZE_T <= end) { Branch (46:24): [True: 24, False: 302]
| 47 | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | and do a fast unrolled copy if it only contains ASCII | 49 | characters. */ | 50 | size_t value = *(const size_t *) _s; | 51 | if (value & ASCII_CHAR_MASK) Branch (51:25): [True: 0, False: 24]
| 52 | break; | 53 | #if PY_LITTLE_ENDIAN | 54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | # if SIZEOF_SIZE_T == 8 | 59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | # endif | 64 | #else | 65 | # if SIZEOF_SIZE_T == 8 | 66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | # else | 75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | # endif | 80 | #endif | 81 | _s += SIZEOF_SIZE_T; | 82 | _p += SIZEOF_SIZE_T; | 83 | } | 84 | s = _s; | 85 | p = _p; | 86 | if (s == end) Branch (86:21): [True: 0, False: 302]
| 87 | break; | 88 | ch = (unsigned char)*s; | 89 | } | 90 | if (ch < 0x80) { Branch (90:17): [True: 1.50k, False: 0]
| 91 | s++; | 92 | *p++ = ch; | 93 | continue; | 94 | } | 95 | } | 96 | | 97 | if (ch < 0xE0) { Branch (97:13): [True: 59.8k, False: 203k]
| 98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | Py_UCS4 ch2; | 100 | if (ch < 0xC2) { Branch (100:17): [True: 561, False: 59.2k]
| 101 | /* invalid sequence | 102 | \x80-\xBF -- continuation byte | 103 | \xC0-\xC1 -- fake 0000-007F */ | 104 | goto InvalidStart; | 105 | } | 106 | if (end - s < 2) { Branch (106:17): [True: 1.20k, False: 58.0k]
| 107 | /* unexpected end of data: the caller will decide whether | 108 | it's an error or not */ | 109 | break; | 110 | } | 111 | ch2 = (unsigned char)s[1]; | 112 | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | /* invalid continuation byte */ | 114 | goto InvalidContinuation1; | 115 | ch = (ch << 6) + ch2 - | 116 | ((0xC0 << 6) + 0x80); | 117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | s += 2; | 119 | if (STRINGLIB_MAX_CHAR <= 0x007F || Branch (119:17): [Folded - Ignored]
| 120 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
| 121 | /* Out-of-range */ | 122 | goto Return; | 123 | *p++ = ch; | 124 | continue; | 125 | } | 126 | | 127 | if (ch < 0xF0) { Branch (127:13): [True: 200k, False: 2.47k]
| 128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | Py_UCS4 ch2, ch3; | 130 | if (end - s < 3) { Branch (130:17): [True: 40.2k, False: 160k]
| 131 | /* unexpected end of data: the caller will decide whether | 132 | it's an error or not */ | 133 | if (end - s < 2) Branch (133:21): [True: 16.4k, False: 23.8k]
| 134 | break; | 135 | ch2 = (unsigned char)s[1]; | 136 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) Branch (137:21): [True: 22, False: 23.7k]
Branch (137:22): [True: 16.4k, False: 7.32k]
| 138 | /* for clarification see comments below */ | 139 | goto InvalidContinuation1; | 140 | break; | 141 | } | 142 | ch2 = (unsigned char)s[1]; | 143 | ch3 = (unsigned char)s[2]; | 144 | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | /* invalid continuation byte */ | 146 | goto InvalidContinuation1; | 147 | } | 148 | if (ch == 0xE0) { Branch (148:17): [True: 3.96k, False: 152k]
| 149 | if (ch2 < 0xA0) Branch (149:21): [True: 68, False: 3.89k]
| 150 | /* invalid sequence | 151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | goto InvalidContinuation1; | 153 | } else if (ch == 0xED && ch2 >= 0xA03.38k ) { Branch (153:24): [True: 3.38k, False: 148k]
Branch (153:38): [True: 399, False: 2.98k]
| 154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | will result in surrogates in range D800-DFFF. Surrogates are | 156 | not valid UTF-8 so they are rejected. | 157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | goto InvalidContinuation1; | 160 | } | 161 | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | /* invalid continuation byte */ | 163 | goto InvalidContinuation2; | 164 | } | 165 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | s += 3; | 169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || Branch (169:17): [Folded - Ignored]
| 170 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
| 171 | /* Out-of-range */ | 172 | goto Return; | 173 | *p++ = ch; | 174 | continue; | 175 | } | 176 | | 177 | if (ch < 0xF5) { Branch (177:13): [True: 2.10k, False: 368]
| 178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | Py_UCS4 ch2, ch3, ch4; | 180 | if (end - s < 4) { Branch (180:17): [True: 465, False: 1.64k]
| 181 | /* unexpected end of data: the caller will decide whether | 182 | it's an error or not */ | 183 | if (end - s < 2) Branch (183:21): [True: 56, False: 409]
| 184 | break; | 185 | ch2 = (unsigned char)s[1]; | 186 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) Branch (187:21): [True: 18, False: 317]
Branch (187:22): [True: 114, False: 221]
| 188 | /* for clarification see comments below */ | 189 | goto InvalidContinuation1; | 190 | if (end - s < 3) Branch (190:21): [True: 94, False: 223]
| 191 | break; | 192 | ch3 = (unsigned char)s[2]; | 193 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | goto InvalidContinuation2; | 195 | break; | 196 | } | 197 | ch2 = (unsigned char)s[1]; | 198 | ch3 = (unsigned char)s[2]; | 199 | ch4 = (unsigned char)s[3]; | 200 | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | /* invalid continuation byte */ | 202 | goto InvalidContinuation1; | 203 | } | 204 | if (ch == 0xF0) { Branch (204:17): [True: 1.05k, False: 519]
| 205 | if (ch2 < 0x90) Branch (205:21): [True: 36, False: 1.01k]
| 206 | /* invalid sequence | 207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | goto InvalidContinuation1; | 209 | } else if (ch == 0xF4 && ch2 >= 0x90) { Branch (209:24): [True: 265, False: 254]
Branch (209:38): [True: 100, False: 165]
| 210 | /* invalid sequence | 211 | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | goto InvalidContinuation1; | 213 | } | 214 | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | /* invalid continuation byte */ | 216 | goto InvalidContinuation2; | 217 | } | 218 | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | /* invalid continuation byte */ | 220 | goto InvalidContinuation3; | 221 | } | 222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | s += 4; | 226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || Branch (226:17): [Folded - Ignored]
| 227 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
| 228 | /* Out-of-range */ | 229 | goto Return; | 230 | *p++ = ch; | 231 | continue; | 232 | } | 233 | goto InvalidStart; | 234 | } | 235 | ch = 0; | 236 | Return: | 237 | *inptr = s; | 238 | *outpos = p - dest; | 239 | return ch; | 240 | InvalidStart: | 241 | ch = 1; | 242 | goto Return; | 243 | InvalidContinuation1: | 244 | ch = 2; | 245 | goto Return; | 246 | InvalidContinuation2: | 247 | ch = 3; | 248 | goto Return; | 249 | InvalidContinuation3: | 250 | ch = 4; | 251 | goto Return; | 252 | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | { | 27 | Py_UCS4 ch; | 28 | const char *s = *inptr; | 29 | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | 31 | while (s < end) { Branch (31:12): [True: 473k, False: 35.9k]
| 32 | ch = (unsigned char)*s; | 33 | | 34 | if (ch < 0x80) { Branch (34:13): [True: 440k, False: 32.2k]
| 35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | input will consist of an overwhelming majority of ASCII | 37 | characters, we try to optimize for this case by checking | 38 | as many characters as a C 'size_t' can contain. | 39 | First, check if we can do an aligned read, as most CPUs have | 40 | a penalty for unaligned reads. | 41 | */ | 42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | /* Help register allocation */ | 44 | const char *_s = s; | 45 | STRINGLIB_CHAR *_p = p; | 46 | while (_s + SIZEOF_SIZE_T <= end) { Branch (46:24): [True: 325k, False: 34.6k]
| 47 | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | and do a fast unrolled copy if it only contains ASCII | 49 | characters. */ | 50 | size_t value = *(const size_t *) _s; | 51 | if (value & ASCII_CHAR_MASK) Branch (51:25): [True: 13.5k, False: 312k]
| 52 | break; | 53 | #if PY_LITTLE_ENDIAN | 54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | # if SIZEOF_SIZE_T == 8 | 59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | # endif | 64 | #else | 65 | # if SIZEOF_SIZE_T == 8 | 66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | # else | 75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | # endif | 80 | #endif | 81 | _s += SIZEOF_SIZE_T; | 82 | _p += SIZEOF_SIZE_T; | 83 | } | 84 | s = _s; | 85 | p = _p; | 86 | if (s == end) Branch (86:21): [True: 4.43k, False: 43.7k]
| 87 | break; | 88 | ch = (unsigned char)*s; | 89 | } | 90 | if (ch < 0x80) { Branch (90:17): [True: 436k, False: 83]
| 91 | s++; | 92 | *p++ = ch; | 93 | continue; | 94 | } | 95 | } | 96 | | 97 | if (ch < 0xE0) { Branch (97:13): [True: 29.5k, False: 2.78k]
| 98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | Py_UCS4 ch2; | 100 | if (ch < 0xC2) { Branch (100:17): [True: 2, False: 29.5k]
| 101 | /* invalid sequence | 102 | \x80-\xBF -- continuation byte | 103 | \xC0-\xC1 -- fake 0000-007F */ | 104 | goto InvalidStart; | 105 | } | 106 | if (end - s < 2) { Branch (106:17): [True: 49, False: 29.4k]
| 107 | /* unexpected end of data: the caller will decide whether | 108 | it's an error or not */ | 109 | break; | 110 | } | 111 | ch2 = (unsigned char)s[1]; | 112 | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | /* invalid continuation byte */ | 114 | goto InvalidContinuation1; | 115 | ch = (ch << 6) + ch2 - | 116 | ((0xC0 << 6) + 0x80); | 117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | s += 2; | 119 | if (STRINGLIB_MAX_CHAR <= 0x007F || Branch (119:17): [Folded - Ignored]
| 120 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 304, False: 29.1k]
| 121 | /* Out-of-range */ | 122 | goto Return; | 123 | *p++ = ch; | 124 | continue; | 125 | } | 126 | | 127 | if (ch < 0xF0) { Branch (127:13): [True: 2.77k, False: 10]
| 128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | Py_UCS4 ch2, ch3; | 130 | if (end - s < 3) { Branch (130:17): [True: 1.02k, False: 1.75k]
| 131 | /* unexpected end of data: the caller will decide whether | 132 | it's an error or not */ | 133 | if (end - s < 2) Branch (133:21): [True: 617, False: 403]
| 134 | break; | 135 | ch2 = (unsigned char)s[1]; | 136 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) Branch (137:21): [True: 0, False: 403]
Branch (137:22): [True: 3, False: 400]
| 138 | /* for clarification see comments below */ | 139 | goto InvalidContinuation1; | 140 | break; | 141 | } | 142 | ch2 = (unsigned char)s[1]; | 143 | ch3 = (unsigned char)s[2]; | 144 | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | /* invalid continuation byte */ | 146 | goto InvalidContinuation1; | 147 | } | 148 | if (ch == 0xE0) { Branch (148:17): [True: 1.39k, False: 351]
| 149 | if (ch2 < 0xA0) Branch (149:21): [True: 0, False: 1.39k]
| 150 | /* invalid sequence | 151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | goto InvalidContinuation1; | 153 | } else if (ch == 0xED && ch2 >= 0xA0) { Branch (153:24): [True: 1, False: 350]
Branch (153:38): [True: 1, False: 0]
| 154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | will result in surrogates in range D800-DFFF. Surrogates are | 156 | not valid UTF-8 so they are rejected. | 157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | goto InvalidContinuation1; | 160 | } | 161 | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | /* invalid continuation byte */ | 163 | goto InvalidContinuation2; | 164 | } | 165 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | s += 3; | 169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || Branch (169:17): [Folded - Ignored]
| 170 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
| 171 | /* Out-of-range */ | 172 | goto Return; | 173 | *p++ = ch; | 174 | continue; | 175 | } | 176 | | 177 | if (ch < 0xF5) { Branch (177:13): [True: 5, False: 5]
| 178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | Py_UCS4 ch2, ch3, ch4; | 180 | if (end - s < 4) { Branch (180:17): [True: 1, False: 4]
| 181 | /* unexpected end of data: the caller will decide whether | 182 | it's an error or not */ | 183 | if (end - s < 2) Branch (183:21): [True: 0, False: 1]
| 184 | break; | 185 | ch2 = (unsigned char)s[1]; | 186 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) Branch (187:21): [True: 0, False: 0]
Branch (187:22): [True: 0, False: 0]
| 188 | /* for clarification see comments below */ | 189 | goto InvalidContinuation1; | 190 | if (end - s < 3) Branch (190:21): [True: 0, False: 0]
| 191 | break; | 192 | ch3 = (unsigned char)s[2]; | 193 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | goto InvalidContinuation2; | 195 | break; | 196 | } | 197 | ch2 = (unsigned char)s[1]; | 198 | ch3 = (unsigned char)s[2]; | 199 | ch4 = (unsigned char)s[3]; | 200 | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | /* invalid continuation byte */ | 202 | goto InvalidContinuation1; | 203 | } | 204 | if (ch == 0xF0) { Branch (204:17): [True: 4, False: 0]
| 205 | if (ch2 < 0x90) Branch (205:21): [True: 0, False: 4]
| 206 | /* invalid sequence | 207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | goto InvalidContinuation1; | 209 | } else if (ch == 0xF4 && ch2 >= 0x90) { Branch (209:24): [True: 0, False: 0]
Branch (209:38): [True: 0, False: 0]
| 210 | /* invalid sequence | 211 | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | goto InvalidContinuation1; | 213 | } | 214 | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | /* invalid continuation byte */ | 216 | goto InvalidContinuation2; | 217 | } | 218 | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | /* invalid continuation byte */ | 220 | goto InvalidContinuation3; | 221 | } | 222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | s += 4; | 226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || Branch (226:17): [Folded - Ignored]
| 227 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
| 228 | /* Out-of-range */ | 229 | goto Return; | 230 | *p++ = ch; | 231 | continue; | 232 | } | 233 | goto InvalidStart; | 234 | } | 235 | ch = 0; | 236 | Return: | 237 | *inptr = s; | 238 | *outpos = p - dest; | 239 | return ch; | 240 | InvalidStart: | 241 | ch = 1; | 242 | goto Return; | 243 | InvalidContinuation1: | 244 | ch = 2; | 245 | goto Return; | 246 | InvalidContinuation2: | 247 | ch = 3; | 248 | goto Return; | 249 | InvalidContinuation3: | 250 | ch = 4; | 251 | goto Return; | 252 | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | { | 27 | Py_UCS4 ch; | 28 | const char *s = *inptr; | 29 | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | 31 | while (s < end) { Branch (31:12): [True: 959k, False: 46.8k]
| 32 | ch = (unsigned char)*s; | 33 | | 34 | if (ch < 0x80) { Branch (34:13): [True: 214k, False: 744k]
| 35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | input will consist of an overwhelming majority of ASCII | 37 | characters, we try to optimize for this case by checking | 38 | as many characters as a C 'size_t' can contain. | 39 | First, check if we can do an aligned read, as most CPUs have | 40 | a penalty for unaligned reads. | 41 | */ | 42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | /* Help register allocation */ | 44 | const char *_s = s; | 45 | STRINGLIB_CHAR *_p = p; | 46 | while (_s + SIZEOF_SIZE_T <= end) { Branch (46:24): [True: 523k, False: 12.1k]
| 47 | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | and do a fast unrolled copy if it only contains ASCII | 49 | characters. */ | 50 | size_t value = *(const size_t *) _s; | 51 | if (value & ASCII_CHAR_MASK) Branch (51:25): [True: 14.8k, False: 508k]
| 52 | break; | 53 | #if PY_LITTLE_ENDIAN | 54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | # if SIZEOF_SIZE_T == 8 | 59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | # endif | 64 | #else | 65 | # if SIZEOF_SIZE_T == 8 | 66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | # else | 75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | # endif | 80 | #endif | 81 | _s += SIZEOF_SIZE_T; | 82 | _p += SIZEOF_SIZE_T; | 83 | } | 84 | s = _s; | 85 | p = _p; | 86 | if (s == end) Branch (86:21): [True: 1.49k, False: 25.4k]
| 87 | break; | 88 | ch = (unsigned char)*s; | 89 | } | 90 | if (ch < 0x80) { Branch (90:17): [True: 212k, False: 551]
| 91 | s++; | 92 | *p++ = ch; | 93 | continue; | 94 | } | 95 | } | 96 | | 97 | if (ch < 0xE0) { Branch (97:13): [True: 21.2k, False: 724k]
| 98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | Py_UCS4 ch2; | 100 | if (ch < 0xC2) { Branch (100:17): [True: 368, False: 20.8k]
| 101 | /* invalid sequence | 102 | \x80-\xBF -- continuation byte | 103 | \xC0-\xC1 -- fake 0000-007F */ | 104 | goto InvalidStart; | 105 | } | 106 | if (end - s < 2) { Branch (106:17): [True: 778, False: 20.0k]
| 107 | /* unexpected end of data: the caller will decide whether | 108 | it's an error or not */ | 109 | break; | 110 | } | 111 | ch2 = (unsigned char)s[1]; | 112 | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | /* invalid continuation byte */ | 114 | goto InvalidContinuation1; | 115 | ch = (ch << 6) + ch2 - | 116 | ((0xC0 << 6) + 0x80); | 117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | s += 2; | 119 | if (STRINGLIB_MAX_CHAR <= 0x007F || Branch (119:17): [Folded - Ignored]
| 120 | (0 STRINGLIB_MAX_CHAR0 < 0x07FF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
| 121 | /* Out-of-range */ | 122 | goto Return; | 123 | *p++ = ch; | 124 | continue; | 125 | } | 126 | | 127 | if (ch < 0xF0) { Branch (127:13): [True: 718k, False: 5.56k]
| 128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | Py_UCS4 ch2, ch3; | 130 | if (end - s < 3) { Branch (130:17): [True: 78.5k, False: 640k]
| 131 | /* unexpected end of data: the caller will decide whether | 132 | it's an error or not */ | 133 | if (end - s < 2) Branch (133:21): [True: 41.0k, False: 37.5k]
| 134 | break; | 135 | ch2 = (unsigned char)s[1]; | 136 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | (37.4k ch2 < 0xA037.4k ? ch == 0xE028.1k : ch == 0xED9.32k )) Branch (137:21): [True: 0, False: 37.4k]
Branch (137:22): [True: 28.1k, False: 9.32k]
| 138 | /* for clarification see comments below */ | 139 | goto InvalidContinuation1; | 140 | break; | 141 | } | 142 | ch2 = (unsigned char)s[1]; | 143 | ch3 = (unsigned char)s[2]; | 144 | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | /* invalid continuation byte */ | 146 | goto InvalidContinuation1; | 147 | } | 148 | if (ch == 0xE0) { Branch (148:17): [True: 7.79k, False: 632k]
| 149 | if (ch2 < 0xA0) Branch (149:21): [True: 0, False: 7.79k]
| 150 | /* invalid sequence | 151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | goto InvalidContinuation1; | 153 | } else if (ch == 0xED && ch2 >= 0xA013.4k ) { Branch (153:24): [True: 13.4k, False: 618k]
Branch (153:38): [True: 234, False: 13.1k]
| 154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | will result in surrogates in range D800-DFFF. Surrogates are | 156 | not valid UTF-8 so they are rejected. | 157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | goto InvalidContinuation1; | 160 | } | 161 | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | /* invalid continuation byte */ | 163 | goto InvalidContinuation2; | 164 | } | 165 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | s += 3; | 169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || Branch (169:17): [Folded - Ignored]
| 170 | (0 STRINGLIB_MAX_CHAR0 < 0xFFFF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
| 171 | /* Out-of-range */ | 172 | goto Return; | 173 | *p++ = ch; | 174 | continue; | 175 | } | 176 | | 177 | if (ch < 0xF5) { Branch (177:13): [True: 4.72k, False: 833]
| 178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | Py_UCS4 ch2, ch3, ch4; | 180 | if (end - s < 4) { Branch (180:17): [True: 3.98k, False: 748]
| 181 | /* unexpected end of data: the caller will decide whether | 182 | it's an error or not */ | 183 | if (end - s < 2) Branch (183:21): [True: 3.91k, False: 68]
| 184 | break; | 185 | ch2 = (unsigned char)s[1]; | 186 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | (34 ch2 < 0x9034 ? ch == 0xF07 : ch == 0xF427 )) Branch (187:21): [True: 0, False: 34]
Branch (187:22): [True: 7, False: 27]
| 188 | /* for clarification see comments below */ | 189 | goto InvalidContinuation1; | 190 | if (end - s < 3) Branch (190:21): [True: 12, False: 22]
| 191 | break; | 192 | ch3 = (unsigned char)s[2]; | 193 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | goto InvalidContinuation2; | 195 | break; | 196 | } | 197 | ch2 = (unsigned char)s[1]; | 198 | ch3 = (unsigned char)s[2]; | 199 | ch4 = (unsigned char)s[3]; | 200 | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | /* invalid continuation byte */ | 202 | goto InvalidContinuation1; | 203 | } | 204 | if (ch == 0xF0) { Branch (204:17): [True: 664, False: 62]
| 205 | if (ch2 < 0x90) Branch (205:21): [True: 0, False: 664]
| 206 | /* invalid sequence | 207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | goto InvalidContinuation1; | 209 | } else if (62 ch == 0xF462 && ch2 >= 0x9055 ) { Branch (209:24): [True: 55, False: 7]
Branch (209:38): [True: 0, False: 55]
| 210 | /* invalid sequence | 211 | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | goto InvalidContinuation1; | 213 | } | 214 | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | /* invalid continuation byte */ | 216 | goto InvalidContinuation2; | 217 | } | 218 | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | /* invalid continuation byte */ | 220 | goto InvalidContinuation3; | 221 | } | 222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | s += 4; | 226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || Branch (226:17): [Folded - Ignored]
| 227 | (0 STRINGLIB_MAX_CHAR0 < 0x10FFFF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
| 228 | /* Out-of-range */ | 229 | goto Return; | 230 | *p++ = ch; | 231 | continue; | 232 | } | 233 | goto InvalidStart; | 234 | } | 235 | ch = 0; | 236 | Return: | 237 | *inptr = s; | 238 | *outpos = p - dest; | 239 | return ch; | 240 | InvalidStart: | 241 | ch = 1; | 242 | goto Return; | 243 | InvalidContinuation1: | 244 | ch = 2; | 245 | goto Return; | 246 | InvalidContinuation2: | 247 | ch = 3; | 248 | goto Return; | 249 | InvalidContinuation3: | 250 | ch = 4; | 251 | goto Return; | 252 | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | { | 27 | Py_UCS4 ch; | 28 | const char *s = *inptr; | 29 | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | 31 | while (s < end) { Branch (31:12): [True: 1.07M, False: 2.60k]
| 32 | ch = (unsigned char)*s; | 33 | | 34 | if (ch < 0x80) { Branch (34:13): [True: 17.4k, False: 1.05M]
| 35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | input will consist of an overwhelming majority of ASCII | 37 | characters, we try to optimize for this case by checking | 38 | as many characters as a C 'size_t' can contain. | 39 | First, check if we can do an aligned read, as most CPUs have | 40 | a penalty for unaligned reads. | 41 | */ | 42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | /* Help register allocation */ | 44 | const char *_s = s; | 45 | STRINGLIB_CHAR *_p = p; | 46 | while (_s + SIZEOF_SIZE_T <= end) { Branch (46:24): [True: 87.1k, False: 1.99k]
| 47 | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | and do a fast unrolled copy if it only contains ASCII | 49 | characters. */ | 50 | size_t value = *(const size_t *) _s; | 51 | if (value & ASCII_CHAR_MASK) Branch (51:25): [True: 179, False: 86.9k]
| 52 | break; | 53 | #if PY_LITTLE_ENDIAN | 54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | # if SIZEOF_SIZE_T == 8 | 59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | # endif | 64 | #else | 65 | # if SIZEOF_SIZE_T == 8 | 66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | # else | 75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | # endif | 80 | #endif | 81 | _s += SIZEOF_SIZE_T; | 82 | _p += SIZEOF_SIZE_T; | 83 | } | 84 | s = _s; | 85 | p = _p; | 86 | if (s == end) Branch (86:21): [True: 128, False: 2.04k]
| 87 | break; | 88 | ch = (unsigned char)*s; | 89 | } | 90 | if (ch < 0x80) { Branch (90:17): [True: 17.2k, False: 10]
| 91 | s++; | 92 | *p++ = ch; | 93 | continue; | 94 | } | 95 | } | 96 | | 97 | if (ch < 0xE0) { Branch (97:13): [True: 857, False: 1.05M]
| 98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | Py_UCS4 ch2; | 100 | if (ch < 0xC2) { Branch (100:17): [True: 12, False: 845]
| 101 | /* invalid sequence | 102 | \x80-\xBF -- continuation byte | 103 | \xC0-\xC1 -- fake 0000-007F */ | 104 | goto InvalidStart; | 105 | } | 106 | if (end - s < 2) { Branch (106:17): [True: 24, False: 821]
| 107 | /* unexpected end of data: the caller will decide whether | 108 | it's an error or not */ | 109 | break; | 110 | } | 111 | ch2 = (unsigned char)s[1]; | 112 | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | /* invalid continuation byte */ | 114 | goto InvalidContinuation1; | 115 | ch = (ch << 6) + ch2 - | 116 | ((0xC0 << 6) + 0x80); | 117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | s += 2; | 119 | if (STRINGLIB_MAX_CHAR <= 0x007F || Branch (119:17): [Folded - Ignored]
| 120 | (0 STRINGLIB_MAX_CHAR0 < 0x07FF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (120:18): [Folded - Ignored]
Branch (120:49): [True: 0, False: 0]
| 121 | /* Out-of-range */ | 122 | goto Return; | 123 | *p++ = ch; | 124 | continue; | 125 | } | 126 | | 127 | if (ch < 0xF0) { Branch (127:13): [True: 5.30k, False: 1.05M]
| 128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | Py_UCS4 ch2, ch3; | 130 | if (end - s < 3) { Branch (130:17): [True: 99, False: 5.20k]
| 131 | /* unexpected end of data: the caller will decide whether | 132 | it's an error or not */ | 133 | if (end - s < 2) Branch (133:21): [True: 81, False: 18]
| 134 | break; | 135 | ch2 = (unsigned char)s[1]; | 136 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | (ch2 < 0xA0 ? ch == 0xE00 : ch == 0xED)) Branch (137:21): [True: 0, False: 18]
Branch (137:22): [True: 0, False: 18]
| 138 | /* for clarification see comments below */ | 139 | goto InvalidContinuation1; | 140 | break; | 141 | } | 142 | ch2 = (unsigned char)s[1]; | 143 | ch3 = (unsigned char)s[2]; | 144 | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | /* invalid continuation byte */ | 146 | goto InvalidContinuation1; | 147 | } | 148 | if (ch == 0xE0) { Branch (148:17): [True: 4, False: 5.19k]
| 149 | if (ch2 < 0xA0) Branch (149:21): [True: 0, False: 4]
| 150 | /* invalid sequence | 151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | goto InvalidContinuation1; | 153 | } else if (ch == 0xED && ch2 >= 0xA017 ) { Branch (153:24): [True: 17, False: 5.18k]
Branch (153:38): [True: 17, False: 0]
| 154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | will result in surrogates in range D800-DFFF. Surrogates are | 156 | not valid UTF-8 so they are rejected. | 157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | goto InvalidContinuation1; | 160 | } | 161 | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | /* invalid continuation byte */ | 163 | goto InvalidContinuation2; | 164 | } | 165 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | s += 3; | 169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || Branch (169:17): [Folded - Ignored]
| 170 | (0 STRINGLIB_MAX_CHAR0 < 0xFFFF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (170:18): [Folded - Ignored]
Branch (170:49): [True: 0, False: 0]
| 171 | /* Out-of-range */ | 172 | goto Return; | 173 | *p++ = ch; | 174 | continue; | 175 | } | 176 | | 177 | if (ch < 0xF5) { Branch (177:13): [True: 1.05M, False: 0]
| 178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | Py_UCS4 ch2, ch3, ch4; | 180 | if (end - s < 4) { Branch (180:17): [True: 90, False: 1.05M]
| 181 | /* unexpected end of data: the caller will decide whether | 182 | it's an error or not */ | 183 | if (end - s < 2) Branch (183:21): [True: 54, False: 36]
| 184 | break; | 185 | ch2 = (unsigned char)s[1]; | 186 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | (ch2 < 0x90 ? ch == 0xF00 : ch == 0xF4)) Branch (187:21): [True: 0, False: 36]
Branch (187:22): [True: 0, False: 36]
| 188 | /* for clarification see comments below */ | 189 | goto InvalidContinuation1; | 190 | if (end - s < 3) Branch (190:21): [True: 18, False: 18]
| 191 | break; | 192 | ch3 = (unsigned char)s[2]; | 193 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | goto InvalidContinuation2; | 195 | break; | 196 | } | 197 | ch2 = (unsigned char)s[1]; | 198 | ch3 = (unsigned char)s[2]; | 199 | ch4 = (unsigned char)s[3]; | 200 | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | /* invalid continuation byte */ | 202 | goto InvalidContinuation1; | 203 | } | 204 | if (ch == 0xF0) { Branch (204:17): [True: 199k, False: 852k]
| 205 | if (ch2 < 0x90) Branch (205:21): [True: 0, False: 199k]
| 206 | /* invalid sequence | 207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | goto InvalidContinuation1; | 209 | } else if (ch == 0xF4 && ch2 >= 0x9066.4k ) { Branch (209:24): [True: 66.4k, False: 786k]
Branch (209:38): [True: 0, False: 66.4k]
| 210 | /* invalid sequence | 211 | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | goto InvalidContinuation1; | 213 | } | 214 | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | /* invalid continuation byte */ | 216 | goto InvalidContinuation2; | 217 | } | 218 | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | /* invalid continuation byte */ | 220 | goto InvalidContinuation3; | 221 | } | 222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | s += 4; | 226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || Branch (226:17): [Folded - Ignored]
| 227 | (0 STRINGLIB_MAX_CHAR0 < 0x10FFFF0 && ch > 0 STRINGLIB_MAX_CHAR0 )) Branch (227:18): [Folded - Ignored]
Branch (227:51): [True: 0, False: 0]
| 228 | /* Out-of-range */ | 229 | goto Return; | 230 | *p++ = ch; | 231 | continue; | 232 | } | 233 | goto InvalidStart; | 234 | } | 235 | ch = 0; | 236 | Return: | 237 | *inptr = s; | 238 | *outpos = p - dest; | 239 | return ch; | 240 | InvalidStart: | 241 | ch = 1; | 242 | goto Return; | 243 | InvalidContinuation1: | 244 | ch = 2; | 245 | goto Return; | 246 | InvalidContinuation2: | 247 | ch = 3; | 248 | goto Return; | 249 | InvalidContinuation3: | 250 | ch = 4; | 251 | goto Return; | 252 | } |
|
253 | |
254 | #undef ASCII_CHAR_MASK |
255 | |
256 | |
257 | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | PyUnicode_READ() macro. Some parts of the code are compiled out depending on the kind: |
259 | UCS-1 strings don't need to handle surrogates for example. Initializes writer, encodes data[0..size-1] into it and returns the next free byte of the output buffer, or NULL on failure. unicode, error_handler and errors are only used when a surrogate is met. */ |
260 | Py_LOCAL_INLINE(char *) |
261 | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | PyObject *unicode, |
263 | const STRINGLIB_CHAR *data, |
264 | Py_ssize_t size, |
265 | _Py_error_handler error_handler, |
266 | const char *errors) |
267 | { |
268 | Py_ssize_t i; /* index into data of next input character */ |
269 | char *p; /* next free byte in output buffer */ |
270 | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | PyObject *error_handler_obj = NULL; |
272 | PyObject *exc = NULL; |
273 | PyObject *rep = NULL; |
274 | #endif |
275 | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | const Py_ssize_t max_char_size = 2; /* worst-case number of UTF-8 bytes per input character */ |
277 | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | const Py_ssize_t max_char_size = 3; |
279 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | const Py_ssize_t max_char_size = 4; |
281 | #endif |
282 | |
283 | assert(size >= 0); |
284 | if (size > PY_SSIZE_T_MAX / max_char_size) { Branch (284:9): [True: 0, False: 76.7k]
Branch (284:9): [True: 0, False: 155k]
Branch (284:9): [True: 0, False: 1.34k]
|
285 | /* integer overflow: size * max_char_size would not fit in Py_ssize_t */ |
286 | PyErr_NoMemory(); |
287 | return NULL; |
288 | } |
289 | |
290 | _PyBytesWriter_Init(writer); |
291 | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | if (p == NULL) Branch (292:9): [True: 0, False: 76.7k]
Branch (292:9): [True: 0, False: 155k]
Branch (292:9): [True: 0, False: 1.34k]
|
293 | return NULL; |
294 | |
295 | for (i = 0; 76.7k i < size;) { Branch (295:17): [True: 26.4M, False: 76.7k]
Branch (295:17): [True: 3.01M, False: 154k]
Branch (295:17): [True: 1.13M, False: 1.34k]
|
296 | Py_UCS4 ch = data[i++]; |
297 | |
298 | if (ch < 0x80) { Branch (298:13): [True: 26.3M, False: 149k]
Branch (298:13): [True: 2.23M, False: 778k]
Branch (298:13): [True: 11.0k, False: 1.12M]
|
299 | /* Encode ASCII: a single byte, copied as is */ |
300 | *p++ = (char) ch; |
301 | |
302 | } |
303 | else |
304 | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | if (778k ch < 0x0800) Branch (305:13): [True: 29.3k, False: 748k]
Branch (305:13): [True: 3.34k, False: 1.12M]
|
306 | #endif |
307 | { |
308 | /* Encode as a two-byte sequence: every non-ASCII character of 1-byte (Latin-1) data, and ch < 0x0800 for the wider kinds */ |
309 | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | } |
312 | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | else if (748k Py_UNICODE_IS_SURROGATE(ch)) { Branch (313:18): [True: 3.54k, False: 745k]
Branch (313:18): [True: 6, False: 1.12M]
|
314 | Py_ssize_t startpos, endpos, newpos; |
315 | Py_ssize_t k; |
316 | if (error_handler == _Py_ERROR_UNKNOWN) { Branch (316:17): [True: 1.65k, False: 1.89k]
Branch (316:17): [True: 6, False: 0]
|
317 | error_handler = _Py_GetErrorHandler(errors); |
318 | } |
319 | |
320 | startpos = i-1; /* index of the surrogate just read */ |
321 | endpos = startpos+1; |
322 | |
323 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])2.28k ) Branch (323:20): [True: 2.28k, False: 1.69k]
Branch (323:39): [True: 438, False: 1.84k]
Branch (323:20): [True: 4, False: 2]
Branch (323:39): [True: 0, False: 4]
|
324 | endpos++; /* extend the run over consecutive surrogates */ |
325 | |
326 | /* Only overallocate the buffer if it's not the last write */ |
327 | writer->overallocate = (endpos < size); |
328 | |
329 | switch (error_handler) |
330 | { |
331 | case _Py_ERROR_REPLACE: Branch (331:13): [True: 4, False: 3.54k]
Branch (331:13): [True: 0, False: 6]
|
332 | memset(p, '?', endpos - startpos); |
333 | p += (endpos - startpos); |
334 | /* fall through */ |
335 | case _Py_ERROR_IGNORE: Branch (335:13): [True: 4, False: 3.54k]
Branch (335:13): [True: 0, False: 6]
|
336 | i += (endpos - startpos - 1); /* skip the rest of the run: i becomes endpos */ |
337 | break; |
338 | |
339 | case _Py_ERROR_SURROGATEPASS: Branch (339:13): [True: 50, False: 3.49k]
Branch (339:13): [True: 6, False: 0]
|
340 | for (k=startpos; k<endpos; k++60 ) { Branch (340:34): [True: 54, False: 50]
Branch (340:34): [True: 6, False: 6]
|
341 | ch = data[k]; |
342 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | } |
346 | i += (endpos - startpos - 1); |
347 | break; |
348 | |
349 | case _Py_ERROR_BACKSLASHREPLACE: Branch (349:13): [True: 4, False: 3.54k]
Branch (349:13): [True: 0, False: 6]
|
350 | /* subtract the bytes preallocated for the surrogate run [startpos, endpos) */ |
351 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | p = backslashreplace(writer, p, |
353 | unicode, startpos, endpos); |
354 | if (p == NULL) Branch (354:21): [True: 0, False: 4]
Branch (354:21): [True: 0, False: 0]
|
355 | goto error; |
356 | i += (endpos - startpos - 1); |
357 | break; |
358 | |
359 | case _Py_ERROR_XMLCHARREFREPLACE: Branch (359:13): [True: 2, False: 3.54k]
Branch (359:13): [True: 0, False: 6]
|
360 | /* subtract the bytes preallocated for the surrogate run [startpos, endpos) */ |
361 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | p = xmlcharrefreplace(writer, p, |
363 | unicode, startpos, endpos); |
364 | if (p == NULL) Branch (364:21): [True: 0, False: 2]
Branch (364:21): [True: 0, False: 0]
|
365 | goto error; |
366 | i += (endpos - startpos - 1); |
367 | break; |
368 | |
369 | case _Py_ERROR_SURROGATEESCAPE: Branch (369:13): [True: 3.10k, False: 444]
Branch (369:13): [True: 0, False: 6]
|
370 | for (k=startpos; k<endpos; k++3.10k ) { Branch (370:34): [True: 3.15k, False: 3.04k]
Branch (370:34): [True: 0, False: 0]
|
371 | ch = data[k]; |
372 | if (!(0xDC80 <= ch && ch <= 0xDCFF3.14k )) Branch (372:27): [True: 3.14k, False: 10]
Branch (372:43): [True: 3.10k, False: 46]
Branch (372:27): [True: 0, False: 0]
Branch (372:43): [True: 0, False: 0]
|
373 | break; |
374 | *p++ = (char)(ch & 0xff); |
375 | } |
376 | if (k >= endpos) { Branch (376:21): [True: 3.04k, False: 56]
Branch (376:21): [True: 0, False: 0]
|
377 | i += (endpos - startpos - 1); |
378 | break; |
379 | } |
380 | startpos = k; /* first surrogate outside DC80-DCFF: it cannot be escaped to a byte */ |
381 | assert(startpos < endpos); |
382 | /* fall through */ |
383 | default: Branch (383:13): [True: 380, False: 3.16k]
Branch (383:13): [True: 0, False: 6]
|
384 | rep = unicode_encode_call_errorhandler( |
385 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
386 | unicode, &exc, startpos, endpos, &newpos); |
387 | if (!rep) Branch (387:21): [True: 379, False: 57]
Branch (387:21): [True: 0, False: 0]
|
388 | goto error; |
389 | |
390 | if (newpos < startpos) { Branch (390:21): [True: 51, False: 6]
Branch (390:21): [True: 0, False: 0]
|
391 | writer->overallocate = 1; |
392 | p = _PyBytesWriter_Prepare(writer, p, |
393 | max_char_size * (startpos - newpos)); |
394 | if (p == NULL) Branch (394:25): [True: 0, False: 51]
Branch (394:25): [True: 0, False: 0]
|
395 | goto error; |
396 | } |
397 | else { |
398 | /* subtract preallocated bytes */ |
399 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | /* Only overallocate the buffer if it's not the last write */ |
401 | writer->overallocate = (newpos < size); |
402 | } |
403 | |
404 | if (PyBytes_Check(rep)) { |
405 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | PyBytes_AS_STRING(rep), |
407 | PyBytes_GET_SIZE(rep)); |
408 | } |
409 | else { |
410 | /* rep is unicode: it is only accepted when it is pure ASCII, so its data can be copied as bytes */ |
411 | if (PyUnicode_READY(rep) < 0) Branch (411:25): [True: 0, False: 56]
Branch (411:25): [True: 0, False: 0]
|
412 | goto error; |
413 | |
414 | if (!PyUnicode_IS_ASCII(rep)) { Branch (414:25): [True: 3, False: 53]
Branch (414:25): [True: 0, False: 0]
|
415 | raise_encode_exception(&exc, "utf-8", unicode, |
416 | startpos, endpos, |
417 | "surrogates not allowed"); |
418 | goto error; |
419 | } |
420 | |
421 | p = _PyBytesWriter_WriteBytes(writer, p, |
422 | PyUnicode_DATA(rep), |
423 | PyUnicode_GET_LENGTH(rep)); |
424 | } |
425 | |
426 | if (p == NULL) Branch (426:21): [True: 0, False: 54]
Branch (426:21): [True: 0, False: 0]
|
427 | goto error; |
428 | Py_CLEAR(rep); |
429 | |
430 | i = newpos; /* resume at the position reported by the error handler call */ |
431 | } |
432 | |
433 | /* If overallocation was disabled, ensure that it was the last |
434 | write. Otherwise, we missed an optimization */ |
435 | assert(writer->overallocate || i == size); |
436 | } |
437 | else |
438 | #if STRINGLIB_SIZEOF_CHAR > 2 |
439 | if (ch < 0x10000) Branch (439:13): [True: 75.0k, False: 1.05M]
|
440 | #endif |
441 | { /* three-byte sequence: non-surrogate characters from 0x0800 up to 0xFFFF */ |
442 | *p++ = (char)(0xe0 | (ch >> 12)); |
443 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
444 | *p++ = (char)(0x80 | (ch & 0x3f)); |
445 | } |
446 | #if STRINGLIB_SIZEOF_CHAR > 2 |
447 | else /* ch >= 0x10000 */ |
448 | { |
449 | assert(ch <= MAX_UNICODE); |
450 | /* Encode UCS4 Unicode ordinals above 0xFFFF as a four-byte sequence */ |
451 | *p++ = (char)(0xf0 | (ch >> 18)); |
452 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
453 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
454 | *p++ = (char)(0x80 | (ch & 0x3f)); |
455 | } |
456 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
457 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
458 | } |
459 | |
460 | #if STRINGLIB_SIZEOF_CHAR > 1 |
461 | Py_XDECREF(error_handler_obj); |
462 | Py_XDECREF(exc); |
463 | #endif |
464 | return p; |
465 | |
466 | #if STRINGLIB_SIZEOF_CHAR > 1 |
467 | error: |
468 | Py_XDECREF(rep); |
469 | Py_XDECREF(error_handler_obj); |
470 | Py_XDECREF(exc); |
471 | return NULL; |
472 | #endif |
473 | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | { | 268 | Py_ssize_t i; /* index into data of next input character */ | 269 | char *p; /* next free byte in output buffer */ | 270 | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | PyObject *error_handler_obj = NULL; | 272 | PyObject *exc = NULL; | 273 | PyObject *rep = NULL; | 274 | #endif | 275 | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | const Py_ssize_t max_char_size = 2; | 277 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | const Py_ssize_t max_char_size = 3; | 279 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | const Py_ssize_t max_char_size = 4; | 281 | #endif | 282 | | 283 | assert(size >= 0); | 284 | if (size > PY_SSIZE_T_MAX / max_char_size) { Branch (284:9): [True: 0, False: 76.7k]
| 285 | /* integer overflow */ | 286 | PyErr_NoMemory(); | 287 | return NULL; | 288 | } | 289 | | 290 | _PyBytesWriter_Init(writer); | 291 | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | if (p == NULL) Branch (292:9): [True: 0, False: 76.7k]
| 293 | return NULL; | 294 | | 295 | for (i = 0; 76.7k i < size;) { Branch (295:17): [True: 26.4M, False: 76.7k]
| 296 | Py_UCS4 ch = data[i++]; | 297 | | 298 | if (ch < 0x80) { Branch (298:13): [True: 26.3M, False: 149k]
| 299 | /* Encode ASCII */ | 300 | *p++ = (char) ch; | 301 | | 302 | } | 303 | else | 304 | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | if (ch < 0x0800) | 306 | #endif | 307 | { | 308 | /* Encode Latin-1 */ | 309 | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | } | 312 | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | Py_ssize_t startpos, endpos, newpos; | 315 | Py_ssize_t k; | 316 | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | error_handler = _Py_GetErrorHandler(errors); | 318 | } | 319 | | 320 | startpos = i-1; | 321 | endpos = startpos+1; | 322 | | 323 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | endpos++; | 325 | | 326 | /* Only overallocate the buffer if it's not the last write */ | 327 | writer->overallocate = (endpos < size); | 328 | | 329 | switch (error_handler) | 330 | { | 331 | case _Py_ERROR_REPLACE: | 332 | memset(p, '?', endpos - startpos); | 333 | p += (endpos - startpos); | 334 | /* fall through */ | 335 | case _Py_ERROR_IGNORE: | 336 | i += (endpos - startpos - 1); | 337 | break; | 338 | | 339 | case _Py_ERROR_SURROGATEPASS: | 340 | for (k=startpos; k<endpos; k++) { | 341 | ch = data[k]; | 342 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | } | 346 | i += (endpos - startpos - 1); | 347 | break; | 348 | | 349 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | /* subtract preallocated bytes */ | 351 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | p = backslashreplace(writer, p, | 353 | unicode, startpos, endpos); | 354 | if (p == NULL) | 355 | goto error; | 356 | i += (endpos - startpos - 1); | 357 | break; | 358 | | 359 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | /* subtract preallocated bytes */ | 361 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | p = xmlcharrefreplace(writer, p, | 363 | unicode, startpos, endpos); | 364 | if (p 
== NULL) | 365 | goto error; | 366 | i += (endpos - startpos - 1); | 367 | break; | 368 | | 369 | case _Py_ERROR_SURROGATEESCAPE: | 370 | for (k=startpos; k<endpos; k++) { | 371 | ch = data[k]; | 372 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | break; | 374 | *p++ = (char)(ch & 0xff); | 375 | } | 376 | if (k >= endpos) { | 377 | i += (endpos - startpos - 1); | 378 | break; | 379 | } | 380 | startpos = k; | 381 | assert(startpos < endpos); | 382 | /* fall through */ | 383 | default: | 384 | rep = unicode_encode_call_errorhandler( | 385 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | unicode, &exc, startpos, endpos, &newpos); | 387 | if (!rep) | 388 | goto error; | 389 | | 390 | if (newpos < startpos) { | 391 | writer->overallocate = 1; | 392 | p = _PyBytesWriter_Prepare(writer, p, | 393 | max_char_size * (startpos - newpos)); | 394 | if (p == NULL) | 395 | goto error; | 396 | } | 397 | else { | 398 | /* subtract preallocated bytes */ | 399 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | /* Only overallocate the buffer if it's not the last write */ | 401 | writer->overallocate = (newpos < size); | 402 | } | 403 | | 404 | if (PyBytes_Check(rep)) { | 405 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | PyBytes_AS_STRING(rep), | 407 | PyBytes_GET_SIZE(rep)); | 408 | } | 409 | else { | 410 | /* rep is unicode */ | 411 | if (PyUnicode_READY(rep) < 0) | 412 | goto error; | 413 | | 414 | if (!PyUnicode_IS_ASCII(rep)) { | 415 | raise_encode_exception(&exc, "utf-8", unicode, | 416 | startpos, endpos, | 417 | "surrogates not allowed"); | 418 | goto error; | 419 | } | 420 | | 421 | p = _PyBytesWriter_WriteBytes(writer, p, | 422 | PyUnicode_DATA(rep), | 423 | PyUnicode_GET_LENGTH(rep)); | 424 | } | 425 | | 426 | if (p == NULL) | 427 | goto error; | 428 | Py_CLEAR(rep); | 429 | | 430 | i = newpos; | 431 | } | 432 | | 433 | /* If overallocation was disabled, ensure that it was the last | 434 | write. 
Otherwise, we missed an optimization */ | 435 | assert(writer->overallocate || i == size); | 436 | } | 437 | else | 438 | #if STRINGLIB_SIZEOF_CHAR > 2 | 439 | if (ch < 0x10000) | 440 | #endif | 441 | { | 442 | *p++ = (char)(0xe0 | (ch >> 12)); | 443 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 444 | *p++ = (char)(0x80 | (ch & 0x3f)); | 445 | } | 446 | #if STRINGLIB_SIZEOF_CHAR > 2 | 447 | else /* ch >= 0x10000 */ | 448 | { | 449 | assert(ch <= MAX_UNICODE); | 450 | /* Encode UCS4 Unicode ordinals */ | 451 | *p++ = (char)(0xf0 | (ch >> 18)); | 452 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 453 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 454 | *p++ = (char)(0x80 | (ch & 0x3f)); | 455 | } | 456 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 457 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 458 | } | 459 | | 460 | #if STRINGLIB_SIZEOF_CHAR > 1 | 461 | Py_XDECREF(error_handler_obj); | 462 | Py_XDECREF(exc); | 463 | #endif | 464 | return p; | 465 | | 466 | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | error: | 468 | Py_XDECREF(rep); | 469 | Py_XDECREF(error_handler_obj); | 470 | Py_XDECREF(exc); | 471 | return NULL; | 472 | #endif | 473 | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | { | 268 | Py_ssize_t i; /* index into data of next input character */ | 269 | char *p; /* next free byte in output buffer */ | 270 | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | PyObject *error_handler_obj = NULL; | 272 | PyObject *exc = NULL; | 273 | PyObject *rep = NULL; | 274 | #endif | 275 | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | const Py_ssize_t max_char_size = 2; | 277 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | const Py_ssize_t max_char_size = 3; | 279 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | const Py_ssize_t max_char_size = 4; | 281 | #endif | 282 | | 283 | assert(size >= 0); | 284 | if (size > PY_SSIZE_T_MAX / max_char_size) { Branch (284:9): [True: 0, False: 155k]
| 285 | /* integer overflow */ | 286 | PyErr_NoMemory(); | 287 | return NULL; | 288 | } | 289 | | 290 | _PyBytesWriter_Init(writer); | 291 | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | if (p == NULL) Branch (292:9): [True: 0, False: 155k]
| 293 | return NULL; | 294 | | 295 | for (i = 0; 155k i < size;) { Branch (295:17): [True: 3.01M, False: 154k]
| 296 | Py_UCS4 ch = data[i++]; | 297 | | 298 | if (ch < 0x80) { Branch (298:13): [True: 2.23M, False: 778k]
| 299 | /* Encode ASCII */ | 300 | *p++ = (char) ch; | 301 | | 302 | } | 303 | else | 304 | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | if (ch < 0x0800) Branch (305:13): [True: 29.3k, False: 748k]
| 306 | #endif | 307 | { | 308 | /* Encode Latin-1 */ | 309 | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | } | 312 | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | else if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (313:18): [True: 3.54k, False: 745k]
| 314 | Py_ssize_t startpos, endpos, newpos; | 315 | Py_ssize_t k; | 316 | if (error_handler == _Py_ERROR_UNKNOWN) { Branch (316:17): [True: 1.65k, False: 1.89k]
| 317 | error_handler = _Py_GetErrorHandler(errors); | 318 | } | 319 | | 320 | startpos = i-1; | 321 | endpos = startpos+1; | 322 | | 323 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])2.28k ) Branch (323:20): [True: 2.28k, False: 1.69k]
Branch (323:39): [True: 438, False: 1.84k]
| 324 | endpos++; | 325 | | 326 | /* Only overallocate the buffer if it's not the last write */ | 327 | writer->overallocate = (endpos < size); | 328 | | 329 | switch (error_handler) | 330 | { | 331 | case _Py_ERROR_REPLACE: Branch (331:13): [True: 4, False: 3.54k]
| 332 | memset(p, '?', endpos - startpos); | 333 | p += (endpos - startpos); | 334 | /* fall through */ | 335 | case _Py_ERROR_IGNORE: Branch (335:13): [True: 4, False: 3.54k]
| 336 | i += (endpos - startpos - 1); | 337 | break; | 338 | | 339 | case _Py_ERROR_SURROGATEPASS: Branch (339:13): [True: 50, False: 3.49k]
| 340 | for (k=startpos; k<endpos; k++54 ) { Branch (340:34): [True: 54, False: 50]
| 341 | ch = data[k]; | 342 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | } | 346 | i += (endpos - startpos - 1); | 347 | break; | 348 | | 349 | case _Py_ERROR_BACKSLASHREPLACE: Branch (349:13): [True: 4, False: 3.54k]
| 350 | /* subtract preallocated bytes */ | 351 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | p = backslashreplace(writer, p, | 353 | unicode, startpos, endpos); | 354 | if (p == NULL) Branch (354:21): [True: 0, False: 4]
| 355 | goto error; | 356 | i += (endpos - startpos - 1); | 357 | break; | 358 | | 359 | case _Py_ERROR_XMLCHARREFREPLACE: Branch (359:13): [True: 2, False: 3.54k]
| 360 | /* subtract preallocated bytes */ | 361 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | p = xmlcharrefreplace(writer, p, | 363 | unicode, startpos, endpos); | 364 | if (p == NULL) Branch (364:21): [True: 0, False: 2]
| 365 | goto error; | 366 | i += (endpos - startpos - 1); | 367 | break; | 368 | | 369 | case _Py_ERROR_SURROGATEESCAPE: Branch (369:13): [True: 3.10k, False: 444]
| 370 | for (k=startpos; k<endpos; k++3.10k ) { Branch (370:34): [True: 3.15k, False: 3.04k]
| 371 | ch = data[k]; | 372 | if (!(0xDC80 <= ch && ch <= 0xDCFF3.14k )) Branch (372:27): [True: 3.14k, False: 10]
Branch (372:43): [True: 3.10k, False: 46]
| 373 | break; | 374 | *p++ = (char)(ch & 0xff); | 375 | } | 376 | if (k >= endpos) { Branch (376:21): [True: 3.04k, False: 56]
| 377 | i += (endpos - startpos - 1); | 378 | break; | 379 | } | 380 | startpos = k; | 381 | assert(startpos < endpos); | 382 | /* fall through */ | 383 | default: Branch (383:13): [True: 380, False: 3.16k]
| 384 | rep = unicode_encode_call_errorhandler( | 385 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | unicode, &exc, startpos, endpos, &newpos); | 387 | if (!rep) Branch (387:21): [True: 379, False: 57]
| 388 | goto error; | 389 | | 390 | if (newpos < startpos) { Branch (390:21): [True: 51, False: 6]
| 391 | writer->overallocate = 1; | 392 | p = _PyBytesWriter_Prepare(writer, p, | 393 | max_char_size * (startpos - newpos)); | 394 | if (p == NULL) Branch (394:25): [True: 0, False: 51]
| 395 | goto error; | 396 | } | 397 | else { | 398 | /* subtract preallocated bytes */ | 399 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | /* Only overallocate the buffer if it's not the last write */ | 401 | writer->overallocate = (newpos < size); | 402 | } | 403 | | 404 | if (PyBytes_Check(rep)) { | 405 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | PyBytes_AS_STRING(rep), | 407 | PyBytes_GET_SIZE(rep)); | 408 | } | 409 | else { | 410 | /* rep is unicode */ | 411 | if (PyUnicode_READY(rep) < 0) Branch (411:25): [True: 0, False: 56]
| 412 | goto error; | 413 | | 414 | if (!PyUnicode_IS_ASCII(rep)) { Branch (414:25): [True: 3, False: 53]
| 415 | raise_encode_exception(&exc, "utf-8", unicode, | 416 | startpos, endpos, | 417 | "surrogates not allowed"); | 418 | goto error; | 419 | } | 420 | | 421 | p = _PyBytesWriter_WriteBytes(writer, p, | 422 | PyUnicode_DATA(rep), | 423 | PyUnicode_GET_LENGTH(rep)); | 424 | } | 425 | | 426 | if (p == NULL) Branch (426:21): [True: 0, False: 54]
| 427 | goto error; | 428 | Py_CLEAR(rep); | 429 | | 430 | i = newpos; | 431 | } | 432 | | 433 | /* If overallocation was disabled, ensure that it was the last | 434 | write. Otherwise, we missed an optimization */ | 435 | assert(writer->overallocate || i == size); | 436 | } | 437 | else | 438 | #if STRINGLIB_SIZEOF_CHAR > 2 | 439 | if (ch < 0x10000) | 440 | #endif | 441 | { | 442 | *p++ = (char)(0xe0 | (ch >> 12)); | 443 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 444 | *p++ = (char)(0x80 | (ch & 0x3f)); | 445 | } | 446 | #if STRINGLIB_SIZEOF_CHAR > 2 | 447 | else /* ch >= 0x10000 */ | 448 | { | 449 | assert(ch <= MAX_UNICODE); | 450 | /* Encode UCS4 Unicode ordinals */ | 451 | *p++ = (char)(0xf0 | (ch >> 18)); | 452 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 453 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 454 | *p++ = (char)(0x80 | (ch & 0x3f)); | 455 | } | 456 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 457 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 458 | } | 459 | | 460 | #if STRINGLIB_SIZEOF_CHAR > 1 | 461 | Py_XDECREF(error_handler_obj); | 462 | Py_XDECREF(exc); | 463 | #endif | 464 | return p; | 465 | | 466 | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | error: | 468 | Py_XDECREF(rep); | 469 | Py_XDECREF(error_handler_obj); | 470 | Py_XDECREF(exc); | 471 | return NULL; | 472 | #endif | 473 | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | { | 268 | Py_ssize_t i; /* index into data of next input character */ | 269 | char *p; /* next free byte in output buffer */ | 270 | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | PyObject *error_handler_obj = NULL; | 272 | PyObject *exc = NULL; | 273 | PyObject *rep = NULL; | 274 | #endif | 275 | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | const Py_ssize_t max_char_size = 2; | 277 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | const Py_ssize_t max_char_size = 3; | 279 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | const Py_ssize_t max_char_size = 4; | 281 | #endif | 282 | | 283 | assert(size >= 0); | 284 | if (size > PY_SSIZE_T_MAX / max_char_size) { Branch (284:9): [True: 0, False: 1.34k]
| 285 | /* integer overflow */ | 286 | PyErr_NoMemory(); | 287 | return NULL; | 288 | } | 289 | | 290 | _PyBytesWriter_Init(writer); | 291 | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | if (p == NULL) Branch (292:9): [True: 0, False: 1.34k]
| 293 | return NULL; | 294 | | 295 | for (i = 0; 1.34k i < size;) { Branch (295:17): [True: 1.13M, False: 1.34k]
| 296 | Py_UCS4 ch = data[i++]; | 297 | | 298 | if (ch < 0x80) { Branch (298:13): [True: 11.0k, False: 1.12M]
| 299 | /* Encode ASCII */ | 300 | *p++ = (char) ch; | 301 | | 302 | } | 303 | else | 304 | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | if (ch < 0x0800) Branch (305:13): [True: 3.34k, False: 1.12M]
| 306 | #endif | 307 | { | 308 | /* Encode Latin-1 */ | 309 | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | } | 312 | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | else if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (313:18): [True: 6, False: 1.12M]
| 314 | Py_ssize_t startpos, endpos, newpos; | 315 | Py_ssize_t k; | 316 | if (error_handler == _Py_ERROR_UNKNOWN) { Branch (316:17): [True: 6, False: 0]
| 317 | error_handler = _Py_GetErrorHandler(errors); | 318 | } | 319 | | 320 | startpos = i-1; | 321 | endpos = startpos+1; | 322 | | 323 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])4 ) Branch (323:20): [True: 4, False: 2]
Branch (323:39): [True: 0, False: 4]
| 324 | endpos++; | 325 | | 326 | /* Only overallocate the buffer if it's not the last write */ | 327 | writer->overallocate = (endpos < size); | 328 | | 329 | switch (error_handler) | 330 | { | 331 | case _Py_ERROR_REPLACE: Branch (331:13): [True: 0, False: 6]
| 332 | memset(p, '?', endpos - startpos); | 333 | p += (endpos - startpos); | 334 | /* fall through */ | 335 | case _Py_ERROR_IGNORE: Branch (335:13): [True: 0, False: 6]
| 336 | i += (endpos - startpos - 1); | 337 | break; | 338 | | 339 | case _Py_ERROR_SURROGATEPASS: Branch (339:13): [True: 6, False: 0]
| 340 | for (k=startpos; k<endpos; k++6 ) { Branch (340:34): [True: 6, False: 6]
| 341 | ch = data[k]; | 342 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | } | 346 | i += (endpos - startpos - 1); | 347 | break; | 348 | | 349 | case _Py_ERROR_BACKSLASHREPLACE: Branch (349:13): [True: 0, False: 6]
| 350 | /* subtract preallocated bytes */ | 351 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | p = backslashreplace(writer, p, | 353 | unicode, startpos, endpos); | 354 | if (p == NULL) Branch (354:21): [True: 0, False: 0]
| 355 | goto error; | 356 | i += (endpos - startpos - 1); | 357 | break; | 358 | | 359 | case _Py_ERROR_XMLCHARREFREPLACE: Branch (359:13): [True: 0, False: 6]
| 360 | /* subtract preallocated bytes */ | 361 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | p = xmlcharrefreplace(writer, p, | 363 | unicode, startpos, endpos); | 364 | if (p == NULL) Branch (364:21): [True: 0, False: 0]
| 365 | goto error; | 366 | i += (endpos - startpos - 1); | 367 | break; | 368 | | 369 | case _Py_ERROR_SURROGATEESCAPE: Branch (369:13): [True: 0, False: 6]
| 370 | for (k=startpos; k<endpos; k++) { Branch (370:34): [True: 0, False: 0]
| 371 | ch = data[k]; | 372 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) Branch (372:27): [True: 0, False: 0]
Branch (372:43): [True: 0, False: 0]
| 373 | break; | 374 | *p++ = (char)(ch & 0xff); | 375 | } | 376 | if (k >= endpos) { Branch (376:21): [True: 0, False: 0]
| 377 | i += (endpos - startpos - 1); | 378 | break; | 379 | } | 380 | startpos = k; | 381 | assert(startpos < endpos); | 382 | /* fall through */ | 383 | default: Branch (383:13): [True: 0, False: 6]
| 384 | rep = unicode_encode_call_errorhandler( | 385 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | unicode, &exc, startpos, endpos, &newpos); | 387 | if (!rep) Branch (387:21): [True: 0, False: 0]
| 388 | goto error; | 389 | | 390 | if (newpos < startpos) { Branch (390:21): [True: 0, False: 0]
| 391 | writer->overallocate = 1; | 392 | p = _PyBytesWriter_Prepare(writer, p, | 393 | max_char_size * (startpos - newpos)); | 394 | if (p == NULL) Branch (394:25): [True: 0, False: 0]
| 395 | goto error; | 396 | } | 397 | else { | 398 | /* subtract preallocated bytes */ | 399 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | /* Only overallocate the buffer if it's not the last write */ | 401 | writer->overallocate = (newpos < size); | 402 | } | 403 | | 404 | if (PyBytes_Check(rep)) { | 405 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | PyBytes_AS_STRING(rep), | 407 | PyBytes_GET_SIZE(rep)); | 408 | } | 409 | else { | 410 | /* rep is unicode */ | 411 | if (PyUnicode_READY(rep) < 0) Branch (411:25): [True: 0, False: 0]
| 412 | goto error; | 413 | | 414 | if (!PyUnicode_IS_ASCII(rep)) { Branch (414:25): [True: 0, False: 0]
| 415 | raise_encode_exception(&exc, "utf-8", unicode, | 416 | startpos, endpos, | 417 | "surrogates not allowed"); | 418 | goto error; | 419 | } | 420 | | 421 | p = _PyBytesWriter_WriteBytes(writer, p, | 422 | PyUnicode_DATA(rep), | 423 | PyUnicode_GET_LENGTH(rep)); | 424 | } | 425 | | 426 | if (p == NULL) Branch (426:21): [True: 0, False: 0]
| 427 | goto error; | 428 | Py_CLEAR(rep); | 429 |
| 430 | i = newpos; | 431 | } | 432 | | 433 | /* If overallocation was disabled, ensure that it was the last | 434 | write. Otherwise, we missed an optimization */ | 435 | assert(writer->overallocate || i == size); | 436 | } | 437 | else | 438 | #if STRINGLIB_SIZEOF_CHAR > 2 | 439 | if (ch < 0x10000) Branch (439:13): [True: 75.0k, False: 1.05M]
| 440 | #endif | 441 | { | 442 | *p++ = (char)(0xe0 | (ch >> 12)); | 443 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 444 | *p++ = (char)(0x80 | (ch & 0x3f)); | 445 | } | 446 | #if STRINGLIB_SIZEOF_CHAR > 2 | 447 | else /* ch >= 0x10000 */ | 448 | { | 449 | assert(ch <= MAX_UNICODE); | 450 | /* Encode UCS4 Unicode ordinals */ | 451 | *p++ = (char)(0xf0 | (ch >> 18)); | 452 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 453 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 454 | *p++ = (char)(0x80 | (ch & 0x3f)); | 455 | } | 456 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 457 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 458 | } | 459 | | 460 | #if STRINGLIB_SIZEOF_CHAR > 1 | 461 | Py_XDECREF(error_handler_obj); | 462 | Py_XDECREF(exc); | 463 | #endif | 464 | return p; | 465 | | 466 | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | error: | 468 | Py_XDECREF(rep); | 469 | Py_XDECREF(error_handler_obj); | 470 | Py_XDECREF(exc); | 471 | return NULL; | 472 | #endif | 473 | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
474 | |
475 | /* The pattern for constructing UCS2-repeated masks. */ |
476 | #if SIZEOF_LONG == 8 |
477 | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
478 | #elif SIZEOF_LONG == 4 |
479 | # define UCS2_REPEAT_MASK 0x00010001ul |
480 | #else |
481 | # error C 'long' size should be either 4 or 8! |
482 | #endif |
483 | |
484 | /* The mask for fast checking. */ |
485 | #if STRINGLIB_SIZEOF_CHAR == 1 |
486 | /* The mask for fast checking of whether a C 'long' contains a |
487 | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
488 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
489 | #else |
490 | /* The mask for fast checking of whether a C 'long' may contain |
491 | UTF16-encoded surrogate characters. This is an efficient heuristic, |
492 | assuming that non-surrogate characters with a code point >= 0x8000 are |
493 | rare in most input. |
494 | */ |
495 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
496 | #endif |
497 | /* The mask for fast byte-swapping. */ |
498 | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
499 | /* Swap bytes. */ |
500 | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
501 | (((value) & STRIPPED_MASK) << 8)) |
502 | |
503 | Py_LOCAL_INLINE(Py_UCS4) |
504 | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
505 | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
506 | int native_ordering) |
507 | { |
508 | Py_UCS4 ch; |
509 | const unsigned char *q = *inptr; |
510 | STRINGLIB_CHAR *p = dest + *outpos; |
511 | /* Offsets from q for retrieving byte pairs in the right order. */ |
512 | #if PY_LITTLE_ENDIAN |
513 | int ihi = !!native_ordering, ilo = !native_ordering; |
514 | #else |
515 | int ihi = !native_ordering, ilo = !!native_ordering; |
516 | #endif |
517 | --e; |
518 | |
519 | while (q < e) { Branch (519:12): [True: 100k, False: 24.2k]
Branch (519:12): [True: 67, False: 10]
Branch (519:12): [True: 20.6k, False: 4.07k]
Branch (519:12): [True: 85, False: 23]
|
520 | Py_UCS4 ch2; |
521 | /* First check for possible aligned read of a C 'long'. Unaligned |
522 | reads are more expensive, better to defer to another iteration. */ |
523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
524 | /* Fast path for runs of in-range non-surrogate chars. */ |
525 | const unsigned char *_q = q; |
526 | while (_q + SIZEOF_LONG <= e) { Branch (526:20): [True: 191k, False: 26.4k]
Branch (526:20): [True: 9, False: 10]
Branch (526:20): [True: 16.5k, False: 3.83k]
Branch (526:20): [True: 18, False: 14]
|
527 | unsigned long block = * (const unsigned long *) _q; |
528 | if (native_ordering) { Branch (528:21): [True: 129k, False: 62.3k]
Branch (528:21): [True: 2, False: 7]
Branch (528:21): [True: 10.9k, False: 5.66k]
Branch (528:21): [True: 12, False: 6]
|
529 | /* Can use buffer directly */ |
530 | if (block & FAST_CHAR_MASK) Branch (530:25): [True: 2.53k, False: 126k]
Branch (530:25): [True: 2, False: 0]
Branch (530:25): [True: 0, False: 10.9k]
Branch (530:25): [True: 5, False: 7]
|
531 | break; |
532 | } |
533 | else { |
534 | /* Need to byte-swap */ |
535 | if (block & SWAB(FAST_CHAR_MASK)) Branch (535:25): [True: 1.31k, False: 61.0k]
Branch (535:25): [True: 2, False: 5]
Branch (535:25): [True: 0, False: 5.66k]
Branch (535:25): [True: 3, False: 3]
|
536 | break; |
537 | #if STRINGLIB_SIZEOF_CHAR == 1 |
538 | block >>= 8; |
539 | #else |
540 | block = SWAB(block); |
541 | #endif |
542 | } |
543 | #if PY_LITTLE_ENDIAN |
544 | # if SIZEOF_LONG == 4 |
545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | p[1] = (STRINGLIB_CHAR)(block >> 16); |
547 | # elif SIZEOF_LONG == 8 |
548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
551 | p[3] = (STRINGLIB_CHAR)(block >> 48); |
552 | # endif |
553 | #else |
554 | # if SIZEOF_LONG == 4 |
555 | p[0] = (STRINGLIB_CHAR)(block >> 16); |
556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
557 | # elif SIZEOF_LONG == 8 |
558 | p[0] = (STRINGLIB_CHAR)(block >> 48); |
559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
562 | # endif |
563 | #endif |
564 | _q += SIZEOF_LONG; |
565 | p += SIZEOF_LONG / 2; |
566 | } |
567 | q = _q; |
568 | if (q >= e) Branch (568:17): [True: 88, False: 30.2k]
Branch (568:17): [True: 0, False: 14]
Branch (568:17): [True: 0, False: 3.83k]
Branch (568:17): [True: 0, False: 22]
|
569 | break; |
570 | } |
571 | |
572 | ch = (q[ihi] << 8) | q[ilo]; |
573 | q += 2; |
574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (574:13): [True: 99.8k, False: 179]
Branch (574:13): [True: 65, False: 2]
Branch (574:13): [True: 20.6k, False: 18]
Branch (574:13): [True: 50, False: 35]
|
575 | #if STRINGLIB_SIZEOF_CHAR < 2 |
576 | if (ch > STRINGLIB_MAX_CHAR) Branch (576:17): [True: 6.92k, False: 92.9k]
Branch (576:17): [True: 6, False: 59]
|
577 | /* Out-of-range */ |
578 | goto Return; |
579 | #endif |
580 | *p++ = (STRINGLIB_CHAR)ch; |
581 | continue; |
582 | } |
583 | |
584 | /* UTF-16 code pair: */ |
585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) Branch (585:13): [True: 90, False: 89]
Branch (585:13): [True: 0, False: 2]
Branch (585:13): [True: 0, False: 18]
Branch (585:13): [True: 15, False: 20]
|
586 | goto IllegalEncoding; |
587 | if (q >= e) Branch (587:13): [True: 44, False: 45]
Branch (587:13): [True: 0, False: 2]
Branch (587:13): [True: 0, False: 18]
Branch (587:13): [True: 0, False: 20]
|
588 | goto UnexpectedEnd; |
589 | ch2 = (q[ihi] << 8) | q[ilo]; |
590 | q += 2; |
591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) Branch (591:13): [True: 6, False: 39]
Branch (591:13): [True: 0, False: 2]
Branch (591:13): [True: 6, False: 12]
Branch (591:13): [True: 0, False: 20]
|
592 | goto IllegalSurrogate; |
593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
594 | #if STRINGLIB_SIZEOF_CHAR < 4 |
595 | /* Out-of-range */ |
596 | goto Return; |
597 | #else |
598 | *p++ = (STRINGLIB_CHAR)ch; |
599 | #endif |
600 | } |
601 | ch = 0; |
602 | Return: |
603 | *inptr = q; |
604 | *outpos = p - dest; |
605 | return ch; |
606 | UnexpectedEnd: |
607 | ch = 1; |
608 | goto Return; |
609 | IllegalEncoding: |
610 | ch = 2; |
611 | goto Return; |
612 | IllegalSurrogate: |
613 | ch = 3; |
614 | goto Return; |
615 | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 507 | { | 508 | Py_UCS4 ch; | 509 | const unsigned char *q = *inptr; | 510 | STRINGLIB_CHAR *p = dest + *outpos; | 511 | /* Offsets from q for retrieving byte pairs in the right order. */ | 512 | #if PY_LITTLE_ENDIAN | 513 | int ihi = !!native_ordering, ilo = !native_ordering; | 514 | #else | 515 | int ihi = !native_ordering, ilo = !!native_ordering; | 516 | #endif | 517 | --e; | 518 | | 519 | while (q < e) { Branch (519:12): [True: 100k, False: 24.2k]
| 520 | Py_UCS4 ch2; | 521 | /* First check for possible aligned read of a C 'long'. Unaligned | 522 | reads are more expensive, better to defer to another iteration. */ | 523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 524 | /* Fast path for runs of in-range non-surrogate chars. */ | 525 | const unsigned char *_q = q; | 526 | while (_q + SIZEOF_LONG <= e) { Branch (526:20): [True: 191k, False: 26.4k]
| 527 | unsigned long block = * (const unsigned long *) _q; | 528 | if (native_ordering) { Branch (528:21): [True: 129k, False: 62.3k]
| 529 | /* Can use buffer directly */ | 530 | if (block & FAST_CHAR_MASK) Branch (530:25): [True: 2.53k, False: 126k]
| 531 | break; | 532 | } | 533 | else { | 534 | /* Need to byte-swap */ | 535 | if (block & SWAB(FAST_CHAR_MASK)) Branch (535:25): [True: 1.31k, False: 61.0k]
| 536 | break; | 537 | #if STRINGLIB_SIZEOF_CHAR == 1 | 538 | block >>= 8; | 539 | #else | 540 | block = SWAB(block); | 541 | #endif | 542 | } | 543 | #if PY_LITTLE_ENDIAN | 544 | # if SIZEOF_LONG == 4 | 545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | p[1] = (STRINGLIB_CHAR)(block >> 16); | 547 | # elif SIZEOF_LONG == 8 | 548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 551 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 552 | # endif | 553 | #else | 554 | # if SIZEOF_LONG == 4 | 555 | p[0] = (STRINGLIB_CHAR)(block >> 16); | 556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 557 | # elif SIZEOF_LONG == 8 | 558 | p[0] = (STRINGLIB_CHAR)(block >> 48); | 559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 562 | # endif | 563 | #endif | 564 | _q += SIZEOF_LONG; | 565 | p += SIZEOF_LONG / 2; | 566 | } | 567 | q = _q; | 568 | if (q >= e) Branch (568:17): [True: 88, False: 30.2k]
| 569 | break; | 570 | } | 571 | | 572 | ch = (q[ihi] << 8) | q[ilo]; | 573 | q += 2; | 574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (574:13): [True: 99.8k, False: 179]
| 575 | #if STRINGLIB_SIZEOF_CHAR < 2 | 576 | if (ch > STRINGLIB_MAX_CHAR) Branch (576:17): [True: 6.92k, False: 92.9k]
| 577 | /* Out-of-range */ | 578 | goto Return; | 579 | #endif | 580 | *p++ = (STRINGLIB_CHAR)ch; | 581 | continue; | 582 | } | 583 | | 584 | /* UTF-16 code pair: */ | 585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) Branch (585:13): [True: 90, False: 89]
| 586 | goto IllegalEncoding; | 587 | if (q >= e) Branch (587:13): [True: 44, False: 45]
| 588 | goto UnexpectedEnd; | 589 | ch2 = (q[ihi] << 8) | q[ilo]; | 590 | q += 2; | 591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) Branch (591:13): [True: 6, False: 39]
| 592 | goto IllegalSurrogate; | 593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 594 | #if STRINGLIB_SIZEOF_CHAR < 4 | 595 | /* Out-of-range */ | 596 | goto Return; | 597 | #else | 598 | *p++ = (STRINGLIB_CHAR)ch; | 599 | #endif | 600 | } | 601 | ch = 0; | 602 | Return: | 603 | *inptr = q; | 604 | *outpos = p - dest; | 605 | return ch; | 606 | UnexpectedEnd: | 607 | ch = 1; | 608 | goto Return; | 609 | IllegalEncoding: | 610 | ch = 2; | 611 | goto Return; | 612 | IllegalSurrogate: | 613 | ch = 3; | 614 | goto Return; | 615 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 507 | { | 508 | Py_UCS4 ch; | 509 | const unsigned char *q = *inptr; | 510 | STRINGLIB_CHAR *p = dest + *outpos; | 511 | /* Offsets from q for retrieving byte pairs in the right order. */ | 512 | #if PY_LITTLE_ENDIAN | 513 | int ihi = !!native_ordering, ilo = !native_ordering; | 514 | #else | 515 | int ihi = !native_ordering, ilo = !!native_ordering; | 516 | #endif | 517 | --e; | 518 | | 519 | while (q < e) { Branch (519:12): [True: 67, False: 10]
| 520 | Py_UCS4 ch2; | 521 | /* First check for possible aligned read of a C 'long'. Unaligned | 522 | reads are more expensive, better to defer to another iteration. */ | 523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 524 | /* Fast path for runs of in-range non-surrogate chars. */ | 525 | const unsigned char *_q = q; | 526 | while (_q + SIZEOF_LONG <= e) { Branch (526:20): [True: 9, False: 10]
| 527 | unsigned long block = * (const unsigned long *) _q; | 528 | if (native_ordering) { Branch (528:21): [True: 2, False: 7]
| 529 | /* Can use buffer directly */ | 530 | if (block & FAST_CHAR_MASK) Branch (530:25): [True: 2, False: 0]
| 531 | break; | 532 | } | 533 | else { | 534 | /* Need to byte-swap */ | 535 | if (block & SWAB(FAST_CHAR_MASK)) Branch (535:25): [True: 2, False: 5]
| 536 | break; | 537 | #if STRINGLIB_SIZEOF_CHAR == 1 | 538 | block >>= 8; | 539 | #else | 540 | block = SWAB(block); | 541 | #endif | 542 | } | 543 | #if PY_LITTLE_ENDIAN | 544 | # if SIZEOF_LONG == 4 | 545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | p[1] = (STRINGLIB_CHAR)(block >> 16); | 547 | # elif SIZEOF_LONG == 8 | 548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 551 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 552 | # endif | 553 | #else | 554 | # if SIZEOF_LONG == 4 | 555 | p[0] = (STRINGLIB_CHAR)(block >> 16); | 556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 557 | # elif SIZEOF_LONG == 8 | 558 | p[0] = (STRINGLIB_CHAR)(block >> 48); | 559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 562 | # endif | 563 | #endif | 564 | _q += SIZEOF_LONG; | 565 | p += SIZEOF_LONG / 2; | 566 | } | 567 | q = _q; | 568 | if (q >= e) Branch (568:17): [True: 0, False: 14]
| 569 | break; | 570 | } | 571 | | 572 | ch = (q[ihi] << 8) | q[ilo]; | 573 | q += 2; | 574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (574:13): [True: 65, False: 2]
| 575 | #if STRINGLIB_SIZEOF_CHAR < 2 | 576 | if (ch > STRINGLIB_MAX_CHAR) Branch (576:17): [True: 6, False: 59]
| 577 | /* Out-of-range */ | 578 | goto Return; | 579 | #endif | 580 | *p++ = (STRINGLIB_CHAR)ch; | 581 | continue; | 582 | } | 583 | | 584 | /* UTF-16 code pair: */ | 585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) Branch (585:13): [True: 0, False: 2]
| 586 | goto IllegalEncoding; | 587 | if (q >= e) Branch (587:13): [True: 0, False: 2]
| 588 | goto UnexpectedEnd; | 589 | ch2 = (q[ihi] << 8) | q[ilo]; | 590 | q += 2; | 591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) Branch (591:13): [True: 0, False: 2]
| 592 | goto IllegalSurrogate; | 593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 594 | #if STRINGLIB_SIZEOF_CHAR < 4 | 595 | /* Out-of-range */ | 596 | goto Return; | 597 | #else | 598 | *p++ = (STRINGLIB_CHAR)ch; | 599 | #endif | 600 | } | 601 | ch = 0; | 602 | Return: | 603 | *inptr = q; | 604 | *outpos = p - dest; | 605 | return ch; | 606 | UnexpectedEnd: | 607 | ch = 1; | 608 | goto Return; | 609 | IllegalEncoding: | 610 | ch = 2; | 611 | goto Return; | 612 | IllegalSurrogate: | 613 | ch = 3; | 614 | goto Return; | 615 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 507 | { | 508 | Py_UCS4 ch; | 509 | const unsigned char *q = *inptr; | 510 | STRINGLIB_CHAR *p = dest + *outpos; | 511 | /* Offsets from q for retrieving byte pairs in the right order. */ | 512 | #if PY_LITTLE_ENDIAN | 513 | int ihi = !!native_ordering, ilo = !native_ordering; | 514 | #else | 515 | int ihi = !native_ordering, ilo = !!native_ordering; | 516 | #endif | 517 | --e; | 518 | | 519 | while (q < e) { Branch (519:12): [True: 20.6k, False: 4.07k]
| 520 | Py_UCS4 ch2; | 521 | /* First check for possible aligned read of a C 'long'. Unaligned | 522 | reads are more expensive, better to defer to another iteration. */ | 523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 524 | /* Fast path for runs of in-range non-surrogate chars. */ | 525 | const unsigned char *_q = q; | 526 | while (_q + SIZEOF_LONG <= e) { Branch (526:20): [True: 16.5k, False: 3.83k]
| 527 | unsigned long block = * (const unsigned long *) _q; | 528 | if (native_ordering) { Branch (528:21): [True: 10.9k, False: 5.66k]
| 529 | /* Can use buffer directly */ | 530 | if (block & FAST_CHAR_MASK) Branch (530:25): [True: 0, False: 10.9k]
| 531 | break; | 532 | } | 533 | else { | 534 | /* Need to byte-swap */ | 535 | if (block & SWAB(FAST_CHAR_MASK)) Branch (535:25): [True: 0, False: 5.66k]
| 536 | break; | 537 | #if STRINGLIB_SIZEOF_CHAR == 1 | 538 | block >>= 8; | 539 | #else | 540 | block = SWAB(block); | 541 | #endif | 542 | } | 543 | #if PY_LITTLE_ENDIAN | 544 | # if SIZEOF_LONG == 4 | 545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | p[1] = (STRINGLIB_CHAR)(block >> 16); | 547 | # elif SIZEOF_LONG == 8 | 548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 551 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 552 | # endif | 553 | #else | 554 | # if SIZEOF_LONG == 4 | 555 | p[0] = (STRINGLIB_CHAR)(block >> 16); | 556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 557 | # elif SIZEOF_LONG == 8 | 558 | p[0] = (STRINGLIB_CHAR)(block >> 48); | 559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 562 | # endif | 563 | #endif | 564 | _q += SIZEOF_LONG; | 565 | p += SIZEOF_LONG / 2; | 566 | } | 567 | q = _q; | 568 | if (q >= e) Branch (568:17): [True: 0, False: 3.83k]
| 569 | break; | 570 | } | 571 | | 572 | ch = (q[ihi] << 8) | q[ilo]; | 573 | q += 2; | 574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (574:13): [True: 20.6k, False: 18]
| 575 | #if STRINGLIB_SIZEOF_CHAR < 2 | 576 | if (ch > STRINGLIB_MAX_CHAR) | 577 | /* Out-of-range */ | 578 | goto Return; | 579 | #endif | 580 | *p++ = (STRINGLIB_CHAR)ch; | 581 | continue; | 582 | } | 583 | | 584 | /* UTF-16 code pair: */ | 585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) Branch (585:13): [True: 0, False: 18]
| 586 | goto IllegalEncoding; | 587 | if (q >= e) Branch (587:13): [True: 0, False: 18]
| 588 | goto UnexpectedEnd; | 589 | ch2 = (q[ihi] << 8) | q[ilo]; | 590 | q += 2; | 591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) Branch (591:13): [True: 6, False: 12]
| 592 | goto IllegalSurrogate; | 593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 594 | #if STRINGLIB_SIZEOF_CHAR < 4 | 595 | /* Out-of-range */ | 596 | goto Return; | 597 | #else | 598 | *p++ = (STRINGLIB_CHAR)ch; | 599 | #endif | 600 | } | 601 | ch = 0; | 602 | Return: | 603 | *inptr = q; | 604 | *outpos = p - dest; | 605 | return ch; | 606 | UnexpectedEnd: | 607 | ch = 1; | 608 | goto Return; | 609 | IllegalEncoding: | 610 | ch = 2; | 611 | goto Return; | 612 | IllegalSurrogate: | 613 | ch = 3; | 614 | goto Return; | 615 | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 507 | { | 508 | Py_UCS4 ch; | 509 | const unsigned char *q = *inptr; | 510 | STRINGLIB_CHAR *p = dest + *outpos; | 511 | /* Offsets from q for retrieving byte pairs in the right order. */ | 512 | #if PY_LITTLE_ENDIAN | 513 | int ihi = !!native_ordering, ilo = !native_ordering; | 514 | #else | 515 | int ihi = !native_ordering, ilo = !!native_ordering; | 516 | #endif | 517 | --e; | 518 | | 519 | while (q < e) { Branch (519:12): [True: 85, False: 23]
| 520 | Py_UCS4 ch2; | 521 | /* First check for possible aligned read of a C 'long'. Unaligned | 522 | reads are more expensive, better to defer to another iteration. */ | 523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 524 | /* Fast path for runs of in-range non-surrogate chars. */ | 525 | const unsigned char *_q = q; | 526 | while (_q + SIZEOF_LONG <= e) { Branch (526:20): [True: 18, False: 14]
| 527 | unsigned long block = * (const unsigned long *) _q; | 528 | if (native_ordering) { Branch (528:21): [True: 12, False: 6]
| 529 | /* Can use buffer directly */ | 530 | if (block & FAST_CHAR_MASK) Branch (530:25): [True: 5, False: 7]
| 531 | break; | 532 | } | 533 | else { | 534 | /* Need to byte-swap */ | 535 | if (block & SWAB(FAST_CHAR_MASK)) Branch (535:25): [True: 3, False: 3]
| 536 | break; | 537 | #if STRINGLIB_SIZEOF_CHAR == 1 | 538 | block >>= 8; | 539 | #else | 540 | block = SWAB(block); | 541 | #endif | 542 | } | 543 | #if PY_LITTLE_ENDIAN | 544 | # if SIZEOF_LONG == 4 | 545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | p[1] = (STRINGLIB_CHAR)(block >> 16); | 547 | # elif SIZEOF_LONG == 8 | 548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 551 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 552 | # endif | 553 | #else | 554 | # if SIZEOF_LONG == 4 | 555 | p[0] = (STRINGLIB_CHAR)(block >> 16); | 556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 557 | # elif SIZEOF_LONG == 8 | 558 | p[0] = (STRINGLIB_CHAR)(block >> 48); | 559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 562 | # endif | 563 | #endif | 564 | _q += SIZEOF_LONG; | 565 | p += SIZEOF_LONG / 2; | 566 | } | 567 | q = _q; | 568 | if (q >= e) Branch (568:17): [True: 0, False: 22]
| 569 | break; | 570 | } | 571 | | 572 | ch = (q[ihi] << 8) | q[ilo]; | 573 | q += 2; | 574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { Branch (574:13): [True: 50, False: 35]
| 575 | #if STRINGLIB_SIZEOF_CHAR < 2 | 576 | if (ch > STRINGLIB_MAX_CHAR) | 577 | /* Out-of-range */ | 578 | goto Return; | 579 | #endif | 580 | *p++ = (STRINGLIB_CHAR)ch; | 581 | continue; | 582 | } | 583 | | 584 | /* UTF-16 code pair: */ | 585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) Branch (585:13): [True: 15, False: 20]
| 586 | goto IllegalEncoding; | 587 | if (q >= e) Branch (587:13): [True: 0, False: 20]
| 588 | goto UnexpectedEnd; | 589 | ch2 = (q[ihi] << 8) | q[ilo]; | 590 | q += 2; | 591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) Branch (591:13): [True: 0, False: 20]
| 592 | goto IllegalSurrogate; | 593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 594 | #if STRINGLIB_SIZEOF_CHAR < 4 | 595 | /* Out-of-range */ | 596 | goto Return; | 597 | #else | 598 | *p++ = (STRINGLIB_CHAR)ch; | 599 | #endif | 600 | } | 601 | ch = 0; | 602 | Return: | 603 | *inptr = q; | 604 | *outpos = p - dest; | 605 | return ch; | 606 | UnexpectedEnd: | 607 | ch = 1; | 608 | goto Return; | 609 | IllegalEncoding: | 610 | ch = 2; | 611 | goto Return; | 612 | IllegalSurrogate: | 613 | ch = 3; | 614 | goto Return; | 615 | } |
|
616 | #undef UCS2_REPEAT_MASK |
617 | #undef FAST_CHAR_MASK |
618 | #undef STRIPPED_MASK |
619 | #undef SWAB |
620 | |
621 | |
622 | #if STRINGLIB_MAX_CHAR >= 0x80 |
623 | Py_LOCAL_INLINE(Py_ssize_t) |
624 | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
625 | Py_ssize_t len, |
626 | unsigned short **outptr, |
627 | int native_ordering) |
628 | { |
629 | unsigned short *out = *outptr; |
630 | const STRINGLIB_CHAR *end = in + len; |
631 | #if STRINGLIB_SIZEOF_CHAR == 1 |
632 | if (native_ordering) { Branch (632:9): [True: 2.00k, False: 902]
|
633 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
634 | while (in < unrolled_end) { Branch (634:16): [True: 122k, False: 2.00k]
|
635 | out[0] = in[0]; |
636 | out[1] = in[1]; |
637 | out[2] = in[2]; |
638 | out[3] = in[3]; |
639 | in += 4; out += 4; |
640 | } |
641 | while (in < end) { Branch (641:16): [True: 2.13k, False: 2.00k]
|
642 | *out++ = *in++; |
643 | } |
644 | } else { |
645 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
646 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
647 | while (in < unrolled_end) { Branch (647:16): [True: 55.5k, False: 902]
|
648 | out[0] = SWAB2(in[0]); |
649 | out[1] = SWAB2(in[1]); |
650 | out[2] = SWAB2(in[2]); |
651 | out[3] = SWAB2(in[3]); |
652 | in += 4; out += 4; |
653 | } |
654 | while (in < end) { Branch (654:16): [True: 954, False: 902]
|
655 | Py_UCS4 ch = *in++; |
656 | *out++ = SWAB2((Py_UCS2)ch); |
657 | } |
658 | #undef SWAB2 |
659 | } |
660 | *outptr = out; |
661 | return len; |
662 | #else |
663 | if (native_ordering) { Branch (663:9): [True: 2.02k, False: 979]
Branch (663:9): [True: 40, False: 22]
|
664 | #if STRINGLIB_MAX_CHAR < 0x10000 |
665 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
666 | while (in < unrolled_end) { Branch (666:16): [True: 40.7k, False: 2.02k]
|
667 | /* check if any character is a surrogate character */ |
668 | if (((in[0] ^ 0xd800) & Branch (668:17): [True: 4, False: 40.7k]
|
669 | (in[1] ^ 0xd800) & |
670 | (in[2] ^ 0xd800) & |
671 | (in[3] ^ 0xd800) & 0xf800) == 0) |
672 | break; |
673 | out[0] = in[0]; |
674 | out[1] = in[1]; |
675 | out[2] = in[2]; |
676 | out[3] = in[3]; |
677 | in += 4; out += 4; |
678 | } |
679 | #endif |
680 | while (in < end) { Branch (680:16): [True: 2.07k, False: 1.94k]
Branch (680:16): [True: 923, False: 36]
|
681 | Py_UCS4 ch; |
682 | ch = *in++; |
683 | if (ch < 0xd800) Branch (683:17): [True: 1.99k, False: 83]
Branch (683:17): [True: 850, False: 73]
|
684 | *out++ = ch; |
685 | else if (ch < 0xe000) Branch (685:22): [True: 83, False: 0]
Branch (685:22): [True: 4, False: 69]
|
686 | /* reject surrogate characters (U+D800-U+DFFF) */ |
687 | goto fail; |
688 | #if STRINGLIB_MAX_CHAR >= 0x10000 |
689 | else if (ch >= 0x10000) { Branch (689:22): [True: 61, False: 8]
|
690 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
691 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
692 | out += 2; |
693 | } |
694 | #endif |
695 | else |
696 | *out++ = ch; |
697 | } |
698 | } else { |
699 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
700 | #if STRINGLIB_MAX_CHAR < 0x10000 |
701 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
702 | while (in < unrolled_end) { Branch (702:16): [True: 18.8k, False: 977]
|
703 | /* check if any character is a surrogate character */ |
704 | if (((in[0] ^ 0xd800) & Branch (704:17): [True: 2, False: 18.8k]
|
705 | (in[1] ^ 0xd800) & |
706 | (in[2] ^ 0xd800) & |
707 | (in[3] ^ 0xd800) & 0xf800) == 0) |
708 | break; |
709 | out[0] = SWAB2(in[0]); |
710 | out[1] = SWAB2(in[1]); |
711 | out[2] = SWAB2(in[2]); |
712 | out[3] = SWAB2(in[3]); |
713 | in += 4; out += 4; |
714 | } |
715 | #endif |
716 | while (in < end) { Branch (716:16): [True: 996, False: 961]
Branch (716:16): [True: 475, False: 20]
|
717 | Py_UCS4 ch = *in++; |
718 | if (ch < 0xd800) Branch (718:17): [True: 978, False: 18]
Branch (718:17): [True: 436, False: 39]
|
719 | *out++ = SWAB2((Py_UCS2)ch); |
720 | else if (ch < 0xe000) Branch (720:22): [True: 18, False: 0]
Branch (720:22): [True: 2, False: 37]
|
721 | /* reject surrogate characters (U+D800-U+DFFF) */ |
722 | goto fail; |
723 | #if STRINGLIB_MAX_CHAR >= 0x10000 |
724 | else if (ch >= 0x10000) { Branch (724:22): [True: 33, False: 4]
|
725 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
726 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
727 | out[0] = SWAB2(ch1); |
728 | out[1] = SWAB2(ch2); |
729 | out += 2; |
730 | } |
731 | #endif |
732 | else |
733 | *out++ = SWAB2((Py_UCS2)ch); |
734 | } |
735 | #undef SWAB2 |
736 | } |
737 | *outptr = out; |
738 | return len; |
739 | fail: |
740 | *outptr = out; |
741 | return len - (end - in + 1); |
742 | #endif |
743 | } unicodeobject.c:ucs1lib_utf16_encode Line | Count | Source | 628 | { | 629 | unsigned short *out = *outptr; | 630 | const STRINGLIB_CHAR *end = in + len; | 631 | #if STRINGLIB_SIZEOF_CHAR == 1 | 632 | if (native_ordering) { Branch (632:9): [True: 2.00k, False: 902]
| 633 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 634 | while (in < unrolled_end) { Branch (634:16): [True: 122k, False: 2.00k]
| 635 | out[0] = in[0]; | 636 | out[1] = in[1]; | 637 | out[2] = in[2]; | 638 | out[3] = in[3]; | 639 | in += 4; out += 4; | 640 | } | 641 | while (in < end) { Branch (641:16): [True: 2.13k, False: 2.00k]
| 642 | *out++ = *in++; | 643 | } | 644 | } else { | 645 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ | 646 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 647 | while (in < unrolled_end) { Branch (647:16): [True: 55.5k, False: 902]
| 648 | out[0] = SWAB2(in[0]); | 649 | out[1] = SWAB2(in[1]); | 650 | out[2] = SWAB2(in[2]); | 651 | out[3] = SWAB2(in[3]); | 652 | in += 4; out += 4; | 653 | } | 654 | while (in < end) { Branch (654:16): [True: 954, False: 902]
| 655 | Py_UCS4 ch = *in++; | 656 | *out++ = SWAB2((Py_UCS2)ch); | 657 | } | 658 | #undef SWAB2 | 659 | } | 660 | *outptr = out; | 661 | return len; | 662 | #else | 663 | if (native_ordering) { | 664 | #if STRINGLIB_MAX_CHAR < 0x10000 | 665 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 666 | while (in < unrolled_end) { | 667 | /* check if any character is a surrogate character */ | 668 | if (((in[0] ^ 0xd800) & | 669 | (in[1] ^ 0xd800) & | 670 | (in[2] ^ 0xd800) & | 671 | (in[3] ^ 0xd800) & 0xf800) == 0) | 672 | break; | 673 | out[0] = in[0]; | 674 | out[1] = in[1]; | 675 | out[2] = in[2]; | 676 | out[3] = in[3]; | 677 | in += 4; out += 4; | 678 | } | 679 | #endif | 680 | while (in < end) { | 681 | Py_UCS4 ch; | 682 | ch = *in++; | 683 | if (ch < 0xd800) | 684 | *out++ = ch; | 685 | else if (ch < 0xe000) | 686 | /* reject surrogate characters (U+D800-U+DFFF) */ | 687 | goto fail; | 688 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 689 | else if (ch >= 0x10000) { | 690 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); | 691 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); | 692 | out += 2; | 693 | } | 694 | #endif | 695 | else | 696 | *out++ = ch; | 697 | } | 698 | } else { | 699 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) | 700 | #if STRINGLIB_MAX_CHAR < 0x10000 | 701 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 702 | while (in < unrolled_end) { | 703 | /* check if any character is a surrogate character */ | 704 | if (((in[0] ^ 0xd800) & | 705 | (in[1] ^ 0xd800) & | 706 | (in[2] ^ 0xd800) & | 707 | (in[3] ^ 0xd800) & 0xf800) == 0) | 708 | break; | 709 | out[0] = SWAB2(in[0]); | 710 | out[1] = SWAB2(in[1]); | 711 | out[2] = SWAB2(in[2]); | 712 | out[3] = SWAB2(in[3]); | 713 | in += 4; out += 4; | 714 | } | 715 | #endif | 716 | while (in < end) { | 717 | Py_UCS4 ch = *in++; | 718 | if (ch < 0xd800) | 719 | *out++ = SWAB2((Py_UCS2)ch); | 720 | else if (ch < 0xe000) | 721 | /* reject surrogate characters (U+D800-U+DFFF) */ | 722 | 
goto fail; | 723 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 724 | else if (ch >= 0x10000) { | 725 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); | 726 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | 727 | out[0] = SWAB2(ch1); | 728 | out[1] = SWAB2(ch2); | 729 | out += 2; | 730 | } | 731 | #endif | 732 | else | 733 | *out++ = SWAB2((Py_UCS2)ch); | 734 | } | 735 | #undef SWAB2 | 736 | } | 737 | *outptr = out; | 738 | return len; | 739 | fail: | 740 | *outptr = out; | 741 | return len - (end - in + 1); | 742 | #endif | 743 | } |
unicodeobject.c:ucs2lib_utf16_encode Line | Count | Source | 628 | { | 629 | unsigned short *out = *outptr; | 630 | const STRINGLIB_CHAR *end = in + len; | 631 | #if STRINGLIB_SIZEOF_CHAR == 1 | 632 | if (native_ordering) { | 633 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 634 | while (in < unrolled_end) { | 635 | out[0] = in[0]; | 636 | out[1] = in[1]; | 637 | out[2] = in[2]; | 638 | out[3] = in[3]; | 639 | in += 4; out += 4; | 640 | } | 641 | while (in < end) { | 642 | *out++ = *in++; | 643 | } | 644 | } else { | 645 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ | 646 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 647 | while (in < unrolled_end) { | 648 | out[0] = SWAB2(in[0]); | 649 | out[1] = SWAB2(in[1]); | 650 | out[2] = SWAB2(in[2]); | 651 | out[3] = SWAB2(in[3]); | 652 | in += 4; out += 4; | 653 | } | 654 | while (in < end) { | 655 | Py_UCS4 ch = *in++; | 656 | *out++ = SWAB2((Py_UCS2)ch); | 657 | } | 658 | #undef SWAB2 | 659 | } | 660 | *outptr = out; | 661 | return len; | 662 | #else | 663 | if (native_ordering) { Branch (663:9): [True: 2.02k, False: 979]
| 664 | #if STRINGLIB_MAX_CHAR < 0x10000 | 665 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 666 | while (in < unrolled_end) { Branch (666:16): [True: 40.7k, False: 2.02k]
| 667 | /* check if any character is a surrogate character */ | 668 | if (((in[0] ^ 0xd800) & Branch (668:17): [True: 4, False: 40.7k]
| 669 | (in[1] ^ 0xd800) & | 670 | (in[2] ^ 0xd800) & | 671 | (in[3] ^ 0xd800) & 0xf800) == 0) | 672 | break; | 673 | out[0] = in[0]; | 674 | out[1] = in[1]; | 675 | out[2] = in[2]; | 676 | out[3] = in[3]; | 677 | in += 4; out += 4; | 678 | } | 679 | #endif | 680 | while (in < end) { Branch (680:16): [True: 2.07k, False: 1.94k]
| 681 | Py_UCS4 ch; | 682 | ch = *in++; | 683 | if (ch < 0xd800) Branch (683:17): [True: 1.99k, False: 83]
| 684 | *out++ = ch; | 685 | else if (ch < 0xe000) Branch (685:22): [True: 83, False: 0]
| 686 | /* reject surrogate characters (U+D800-U+DFFF) */ | 687 | goto fail; | 688 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 689 | else if (ch >= 0x10000) { | 690 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); | 691 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); | 692 | out += 2; | 693 | } | 694 | #endif | 695 | else | 696 | *out++ = ch; | 697 | } | 698 | } else { | 699 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) | 700 | #if STRINGLIB_MAX_CHAR < 0x10000 | 701 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 702 | while (in < unrolled_end) { Branch (702:16): [True: 18.8k, False: 977]
| 703 | /* check if any character is a surrogate character */ | 704 | if (((in[0] ^ 0xd800) & Branch (704:17): [True: 2, False: 18.8k]
| 705 | (in[1] ^ 0xd800) & | 706 | (in[2] ^ 0xd800) & | 707 | (in[3] ^ 0xd800) & 0xf800) == 0) | 708 | break; | 709 | out[0] = SWAB2(in[0]); | 710 | out[1] = SWAB2(in[1]); | 711 | out[2] = SWAB2(in[2]); | 712 | out[3] = SWAB2(in[3]); | 713 | in += 4; out += 4; | 714 | } | 715 | #endif | 716 | while (in < end) { Branch (716:16): [True: 996, False: 961]
| 717 | Py_UCS4 ch = *in++; | 718 | if (ch < 0xd800) Branch (718:17): [True: 978, False: 18]
| 719 | *out++ = SWAB2((Py_UCS2)ch); | 720 | else if (ch < 0xe000) Branch (720:22): [True: 18, False: 0]
| 721 | /* reject surrogate characters (U+D800-U+DFFF) */ | 722 | goto fail; | 723 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 724 | else if (ch >= 0x10000) { | 725 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); | 726 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | 727 | out[0] = SWAB2(ch1); | 728 | out[1] = SWAB2(ch2); | 729 | out += 2; | 730 | } | 731 | #endif | 732 | else | 733 | *out++ = SWAB2((Py_UCS2)ch); | 734 | } | 735 | #undef SWAB2 | 736 | } | 737 | *outptr = out; | 738 | return len; | 739 | fail: | 740 | *outptr = out; | 741 | return len - (end - in + 1); | 742 | #endif | 743 | } |
unicodeobject.c:ucs4lib_utf16_encode Line | Count | Source | 628 | { | 629 | unsigned short *out = *outptr; | 630 | const STRINGLIB_CHAR *end = in + len; | 631 | #if STRINGLIB_SIZEOF_CHAR == 1 | 632 | if (native_ordering) { | 633 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 634 | while (in < unrolled_end) { | 635 | out[0] = in[0]; | 636 | out[1] = in[1]; | 637 | out[2] = in[2]; | 638 | out[3] = in[3]; | 639 | in += 4; out += 4; | 640 | } | 641 | while (in < end) { | 642 | *out++ = *in++; | 643 | } | 644 | } else { | 645 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ | 646 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 647 | while (in < unrolled_end) { | 648 | out[0] = SWAB2(in[0]); | 649 | out[1] = SWAB2(in[1]); | 650 | out[2] = SWAB2(in[2]); | 651 | out[3] = SWAB2(in[3]); | 652 | in += 4; out += 4; | 653 | } | 654 | while (in < end) { | 655 | Py_UCS4 ch = *in++; | 656 | *out++ = SWAB2((Py_UCS2)ch); | 657 | } | 658 | #undef SWAB2 | 659 | } | 660 | *outptr = out; | 661 | return len; | 662 | #else | 663 | if (native_ordering) { Branch (663:9): [True: 40, False: 22]
| 664 | #if STRINGLIB_MAX_CHAR < 0x10000 | 665 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 666 | while (in < unrolled_end) { | 667 | /* check if any character is a surrogate character */ | 668 | if (((in[0] ^ 0xd800) & | 669 | (in[1] ^ 0xd800) & | 670 | (in[2] ^ 0xd800) & | 671 | (in[3] ^ 0xd800) & 0xf800) == 0) | 672 | break; | 673 | out[0] = in[0]; | 674 | out[1] = in[1]; | 675 | out[2] = in[2]; | 676 | out[3] = in[3]; | 677 | in += 4; out += 4; | 678 | } | 679 | #endif | 680 | while (in < end) { Branch (680:16): [True: 923, False: 36]
| 681 | Py_UCS4 ch; | 682 | ch = *in++; | 683 | if (ch < 0xd800) Branch (683:17): [True: 850, False: 73]
| 684 | *out++ = ch; | 685 | else if (ch < 0xe000) Branch (685:22): [True: 4, False: 69]
| 686 | /* reject surrogate characters (U+D800-U+DFFF) */ | 687 | goto fail; | 688 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 689 | else if (ch >= 0x10000) { Branch (689:22): [True: 61, False: 8]
| 690 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); | 691 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); | 692 | out += 2; | 693 | } | 694 | #endif | 695 | else | 696 | *out++ = ch; | 697 | } | 698 | } else { | 699 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) | 700 | #if STRINGLIB_MAX_CHAR < 0x10000 | 701 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 702 | while (in < unrolled_end) { | 703 | /* check if any character is a surrogate character */ | 704 | if (((in[0] ^ 0xd800) & | 705 | (in[1] ^ 0xd800) & | 706 | (in[2] ^ 0xd800) & | 707 | (in[3] ^ 0xd800) & 0xf800) == 0) | 708 | break; | 709 | out[0] = SWAB2(in[0]); | 710 | out[1] = SWAB2(in[1]); | 711 | out[2] = SWAB2(in[2]); | 712 | out[3] = SWAB2(in[3]); | 713 | in += 4; out += 4; | 714 | } | 715 | #endif | 716 | while (in < end) { Branch (716:16): [True: 475, False: 20]
| 717 | Py_UCS4 ch = *in++; | 718 | if (ch < 0xd800) Branch (718:17): [True: 436, False: 39]
| 719 | *out++ = SWAB2((Py_UCS2)ch); | 720 | else if (ch < 0xe000) Branch (720:22): [True: 2, False: 37]
| 721 | /* reject surrogate characters (U+D800-U+DFFF) */ | 722 | goto fail; | 723 | #if STRINGLIB_MAX_CHAR >= 0x10000 | 724 | else if (ch >= 0x10000) { Branch (724:22): [True: 33, False: 4]
| 725 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); | 726 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | 727 | out[0] = SWAB2(ch1); | 728 | out[1] = SWAB2(ch2); | 729 | out += 2; | 730 | } | 731 | #endif | 732 | else | 733 | *out++ = SWAB2((Py_UCS2)ch); | 734 | } | 735 | #undef SWAB2 | 736 | } | 737 | *outptr = out; | 738 | return len; | 739 | fail: | 740 | *outptr = out; | 741 | return len - (end - in + 1); | 742 | #endif | 743 | } |
|
744 | |
745 | static inline uint32_t |
746 | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
747 | { |
748 | uint32_t word = ch; |
749 | #if STRINGLIB_SIZEOF_CHAR == 1 |
750 | /* high bytes are zero */ |
751 | return (word << 24); |
752 | #elif STRINGLIB_SIZEOF_CHAR == 2 |
753 | /* high bytes are zero */ |
754 | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
755 | #else |
756 | return _Py_bswap32(word); |
757 | #endif |
758 | } unicodeobject.c:ucs1lib_SWAB4 Line | Count | Source | 747 | { | 748 | uint32_t word = ch; | 749 | #if STRINGLIB_SIZEOF_CHAR == 1 | 750 | /* high bytes are zero */ | 751 | return (word << 24); | 752 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 753 | /* high bytes are zero */ | 754 | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); | 755 | #else | 756 | return _Py_bswap32(word); | 757 | #endif | 758 | } |
unicodeobject.c:ucs2lib_SWAB4 Line | Count | Source | 747 | { | 748 | uint32_t word = ch; | 749 | #if STRINGLIB_SIZEOF_CHAR == 1 | 750 | /* high bytes are zero */ | 751 | return (word << 24); | 752 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 753 | /* high bytes are zero */ | 754 | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); | 755 | #else | 756 | return _Py_bswap32(word); | 757 | #endif | 758 | } |
unicodeobject.c:ucs4lib_SWAB4 Line | Count | Source | 747 | { | 748 | uint32_t word = ch; | 749 | #if STRINGLIB_SIZEOF_CHAR == 1 | 750 | /* high bytes are zero */ | 751 | return (word << 24); | 752 | #elif STRINGLIB_SIZEOF_CHAR == 2 | 753 | /* high bytes are zero */ | 754 | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); | 755 | #else | 756 | return _Py_bswap32(word); | 757 | #endif | 758 | } |
|
759 | |
760 | Py_LOCAL_INLINE(Py_ssize_t) |
761 | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
762 | Py_ssize_t len, |
763 | uint32_t **outptr, |
764 | int native_ordering) |
765 | { |
766 | uint32_t *out = *outptr; |
767 | const STRINGLIB_CHAR *end = in + len; |
768 | if (native_ordering) { Branch (768:9): [True: 1.31k, False: 577]
Branch (768:9): [True: 480, False: 210]
Branch (768:9): [True: 22, False: 12]
|
769 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
770 | while (in < unrolled_end) { Branch (770:16): [True: 119k, False: 1.31k]
Branch (770:16): [True: 40.7k, False: 476]
Branch (770:16): [True: 12, False: 14]
|
771 | #if STRINGLIB_SIZEOF_CHAR > 1 |
772 | /* check if any character is a surrogate character */ |
773 | if (((in[0] ^ 0xd800) & Branch (773:17): [True: 4, False: 40.7k]
Branch (773:17): [True: 8, False: 4]
|
774 | (in[1] ^ 0xd800) & |
775 | (in[2] ^ 0xd800) & |
776 | (in[3] ^ 0xd800) & 0xf800) == 0) |
777 | break; |
778 | #endif |
779 | out[0] = in[0]; |
780 | out[1] = in[1]; |
781 | out[2] = in[2]; |
782 | out[3] = in[3]; |
783 | in += 4; out += 4; |
784 | } |
785 | while (in < end) { Branch (785:16): [True: 1.40k, False: 1.31k]
Branch (785:16): [True: 511, False: 395]
Branch (785:16): [True: 57, False: 18]
|
786 | Py_UCS4 ch; |
787 | ch = *in++; |
788 | #if STRINGLIB_SIZEOF_CHAR > 1 |
789 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (789:17): [True: 85, False: 426]
Branch (789:17): [True: 4, False: 53]
|
790 | /* reject surrogate characters (U+D800-U+DFFF) */ |
791 | goto fail; |
792 | } |
793 | #endif |
794 | *out++ = ch; |
795 | } |
796 | } else { |
797 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
798 | while (in < unrolled_end) { Branch (798:16): [True: 54.6k, False: 577]
Branch (798:16): [True: 18.8k, False: 208]
Branch (798:16): [True: 8, False: 8]
|
799 | #if STRINGLIB_SIZEOF_CHAR > 1 |
800 | /* check if any character is a surrogate character */ |
801 | if (((in[0] ^ 0xd800) & Branch (801:17): [True: 2, False: 18.8k]
Branch (801:17): [True: 4, False: 4]
|
802 | (in[1] ^ 0xd800) & |
803 | (in[2] ^ 0xd800) & |
804 | (in[3] ^ 0xd800) & 0xf800) == 0) |
805 | break; |
806 | #endif |
807 | out[0] = 18.8k STRINGLIB(SWAB4)(in[0]); |
808 | out[1] = STRINGLIB(SWAB4)(in[1]); |
809 | out[2] = STRINGLIB(SWAB4)(in[2]); |
810 | out[3] = STRINGLIB(SWAB4)(in[3]); |
811 | in += 4; out += 4; |
812 | } |
813 | while (in < end) { Branch (813:16): [True: 603, False: 577]
Branch (813:16): [True: 229, False: 192]
Branch (813:16): [True: 29, False: 10]
|
814 | Py_UCS4 ch = *in++; |
815 | #if STRINGLIB_SIZEOF_CHAR > 1 |
816 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (816:17): [True: 18, False: 211]
Branch (816:17): [True: 2, False: 27]
|
817 | /* reject surrogate characters (U+D800-U+DFFF) */ |
818 | goto fail; |
819 | } |
820 | #endif |
821 | *out++ = 238 STRINGLIB(SWAB4)(ch); |
822 | } |
823 | } |
824 | *outptr = out; |
825 | return len; |
826 | #if STRINGLIB_SIZEOF_CHAR > 1 |
827 | fail: |
828 | *outptr = out; |
829 | return len - (end - in + 1); |
830 | #endif |
831 | } unicodeobject.c:ucs1lib_utf32_encode Line | Count | Source | 765 | { | 766 | uint32_t *out = *outptr; | 767 | const STRINGLIB_CHAR *end = in + len; | 768 | if (native_ordering) { Branch (768:9): [True: 1.31k, False: 577]
| 769 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 770 | while (in < unrolled_end) { Branch (770:16): [True: 119k, False: 1.31k]
| 771 | #if STRINGLIB_SIZEOF_CHAR > 1 | 772 | /* check if any character is a surrogate character */ | 773 | if (((in[0] ^ 0xd800) & | 774 | (in[1] ^ 0xd800) & | 775 | (in[2] ^ 0xd800) & | 776 | (in[3] ^ 0xd800) & 0xf800) == 0) | 777 | break; | 778 | #endif | 779 | out[0] = in[0]; | 780 | out[1] = in[1]; | 781 | out[2] = in[2]; | 782 | out[3] = in[3]; | 783 | in += 4; out += 4; | 784 | } | 785 | while (in < end) { Branch (785:16): [True: 1.40k, False: 1.31k]
| 786 | Py_UCS4 ch; | 787 | ch = *in++; | 788 | #if STRINGLIB_SIZEOF_CHAR > 1 | 789 | if (Py_UNICODE_IS_SURROGATE(ch)) { | 790 | /* reject surrogate characters (U+D800-U+DFFF) */ | 791 | goto fail; | 792 | } | 793 | #endif | 794 | *out++ = ch; | 795 | } | 796 | } else { | 797 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 798 | while (in < unrolled_end) { Branch (798:16): [True: 54.6k, False: 577]
| 799 | #if STRINGLIB_SIZEOF_CHAR > 1 | 800 | /* check if any character is a surrogate character */ | 801 | if (((in[0] ^ 0xd800) & | 802 | (in[1] ^ 0xd800) & | 803 | (in[2] ^ 0xd800) & | 804 | (in[3] ^ 0xd800) & 0xf800) == 0) | 805 | break; | 806 | #endif | 807 | out[0] = STRINGLIB(SWAB4)(in[0]); | 808 | out[1] = STRINGLIB(SWAB4)(in[1]); | 809 | out[2] = STRINGLIB(SWAB4)(in[2]); | 810 | out[3] = STRINGLIB(SWAB4)(in[3]); | 811 | in += 4; out += 4; | 812 | } | 813 | while (in < end) { Branch (813:16): [True: 603, False: 577]
| 814 | Py_UCS4 ch = *in++; | 815 | #if STRINGLIB_SIZEOF_CHAR > 1 | 816 | if (Py_UNICODE_IS_SURROGATE(ch)) { | 817 | /* reject surrogate characters (U+D800-U+DFFF) */ | 818 | goto fail; | 819 | } | 820 | #endif | 821 | *out++ = STRINGLIB(SWAB4)(ch); | 822 | } | 823 | } | 824 | *outptr = out; | 825 | return len; | 826 | #if STRINGLIB_SIZEOF_CHAR > 1 | 827 | fail: | 828 | *outptr = out; | 829 | return len - (end - in + 1); | 830 | #endif | 831 | } |
unicodeobject.c:ucs2lib_utf32_encode Line | Count | Source | 765 | { | 766 | uint32_t *out = *outptr; | 767 | const STRINGLIB_CHAR *end = in + len; | 768 | if (native_ordering) { Branch (768:9): [True: 480, False: 210]
| 769 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 770 | while (in < unrolled_end) { Branch (770:16): [True: 40.7k, False: 476]
| 771 | #if STRINGLIB_SIZEOF_CHAR > 1 | 772 | /* check if any character is a surrogate character */ | 773 | if (((in[0] ^ 0xd800) & Branch (773:17): [True: 4, False: 40.7k]
| 774 | (in[1] ^ 0xd800) & | 775 | (in[2] ^ 0xd800) & | 776 | (in[3] ^ 0xd800) & 0xf800) == 0) | 777 | break; | 778 | #endif | 779 | out[0] = in[0]; | 780 | out[1] = in[1]; | 781 | out[2] = in[2]; | 782 | out[3] = in[3]; | 783 | in += 4; out += 4; | 784 | } | 785 | while (in < end) { Branch (785:16): [True: 511, False: 395]
| 786 | Py_UCS4 ch; | 787 | ch = *in++; | 788 | #if STRINGLIB_SIZEOF_CHAR > 1 | 789 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (789:17): [True: 85, False: 426]
| 790 | /* reject surrogate characters (U+D800-U+DFFF) */ | 791 | goto fail; | 792 | } | 793 | #endif | 794 | *out++ = ch; | 795 | } | 796 | } else { | 797 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 798 | while (in < unrolled_end) { Branch (798:16): [True: 18.8k, False: 208]
| 799 | #if STRINGLIB_SIZEOF_CHAR > 1 | 800 | /* check if any character is a surrogate character */ | 801 | if (((in[0] ^ 0xd800) & Branch (801:17): [True: 2, False: 18.8k]
| 802 | (in[1] ^ 0xd800) & | 803 | (in[2] ^ 0xd800) & | 804 | (in[3] ^ 0xd800) & 0xf800) == 0) | 805 | break; | 806 | #endif | 807 | out[0] = STRINGLIB(SWAB4)(in[0]); | 808 | out[1] = STRINGLIB(SWAB4)(in[1]); | 809 | out[2] = STRINGLIB(SWAB4)(in[2]); | 810 | out[3] = STRINGLIB(SWAB4)(in[3]); | 811 | in += 4; out += 4; | 812 | } | 813 | while (in < end) { Branch (813:16): [True: 229, False: 192]
| 814 | Py_UCS4 ch = *in++; | 815 | #if STRINGLIB_SIZEOF_CHAR > 1 | 816 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (816:17): [True: 18, False: 211]
| 817 | /* reject surrogate characters (U+D800-U+DFFF) */ | 818 | goto fail; | 819 | } | 820 | #endif | 821 | *out++ = STRINGLIB(SWAB4)(ch); | 822 | } | 823 | } | 824 | *outptr = out; | 825 | return len; | 826 | #if STRINGLIB_SIZEOF_CHAR > 1 | 827 | fail: | 828 | *outptr = out; | 829 | return len - (end - in + 1); | 830 | #endif | 831 | } |
unicodeobject.c:ucs4lib_utf32_encode Line | Count | Source | 765 | { | 766 | uint32_t *out = *outptr; | 767 | const STRINGLIB_CHAR *end = in + len; | 768 | if (native_ordering) { Branch (768:9): [True: 22, False: 12]
| 769 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 770 | while (in < unrolled_end) { Branch (770:16): [True: 12, False: 14]
| 771 | #if STRINGLIB_SIZEOF_CHAR > 1 | 772 | /* check if any character is a surrogate character */ | 773 | if (((in[0] ^ 0xd800) & Branch (773:17): [True: 8, False: 4]
| 774 | (in[1] ^ 0xd800) & | 775 | (in[2] ^ 0xd800) & | 776 | (in[3] ^ 0xd800) & 0xf800) == 0) | 777 | break; | 778 | #endif | 779 | out[0] = in[0]; | 780 | out[1] = in[1]; | 781 | out[2] = in[2]; | 782 | out[3] = in[3]; | 783 | in += 4; out += 4; | 784 | } | 785 | while (in < end) { Branch (785:16): [True: 57, False: 18]
| 786 | Py_UCS4 ch; | 787 | ch = *in++; | 788 | #if STRINGLIB_SIZEOF_CHAR > 1 | 789 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (789:17): [True: 4, False: 53]
| 790 | /* reject surrogate characters (U+D800-U+DFFF) */ | 791 | goto fail; | 792 | } | 793 | #endif | 794 | *out++ = ch; | 795 | } | 796 | } else { | 797 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 798 | while (in < unrolled_end) { Branch (798:16): [True: 8, False: 8]
| 799 | #if STRINGLIB_SIZEOF_CHAR > 1 | 800 | /* check if any character is a surrogate character */ | 801 | if (((in[0] ^ 0xd800) & Branch (801:17): [True: 4, False: 4]
| 802 | (in[1] ^ 0xd800) & | 803 | (in[2] ^ 0xd800) & | 804 | (in[3] ^ 0xd800) & 0xf800) == 0) | 805 | break; | 806 | #endif | 807 | out[0] = STRINGLIB(SWAB4)(in[0]); | 808 | out[1] = STRINGLIB(SWAB4)(in[1]); | 809 | out[2] = STRINGLIB(SWAB4)(in[2]); | 810 | out[3] = STRINGLIB(SWAB4)(in[3]); | 811 | in += 4; out += 4; | 812 | } | 813 | while (in < end) { Branch (813:16): [True: 29, False: 10]
| 814 | Py_UCS4 ch = *in++; | 815 | #if STRINGLIB_SIZEOF_CHAR > 1 | 816 | if (Py_UNICODE_IS_SURROGATE(ch)) { Branch (816:17): [True: 2, False: 27]
| 817 | /* reject surrogate characters (U+D800-U+DFFF) */ | 818 | goto fail; | 819 | } | 820 | #endif | 821 | *out++ = STRINGLIB(SWAB4)(ch); | 822 | } | 823 | } | 824 | *outptr = out; | 825 | return len; | 826 | #if STRINGLIB_SIZEOF_CHAR > 1 | 827 | fail: | 828 | *outptr = out; | 829 | return len - (end - in + 1); | 830 | #endif | 831 | } |
|
832 | |
833 | #endif |