Line data Source code
1 : /* stringlib: split implementation */
2 :
3 : #ifndef STRINGLIB_FASTSEARCH_H
4 : #error must include "stringlib/fastsearch.h" before including this module
5 : #endif
6 :
7 : /* Overallocate the initial list to reduce the number of reallocs for small
8 : split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
9 : resizes, to sizes 4, 8, then 16. Most observed string splits are for human
10 : text (roughly 11 words per line) and field delimited data (usually 1-10
11 : fields). For large strings the split algorithms are bandwidth limited
12 : so increasing the preallocation likely will not improve things.*/
13 :
14 : #define MAX_PREALLOC 12
15 :
16 : /* 5 splits gives 6 elements */
17 : #define PREALLOC_SIZE(maxsplit) \
18 : (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
19 :
20 : #define SPLIT_APPEND(data, left, right) \
21 : sub = STRINGLIB_NEW((data) + (left), \
22 : (right) - (left)); \
23 : if (sub == NULL) \
24 : goto onError; \
25 : if (PyList_Append(list, sub)) { \
26 : Py_DECREF(sub); \
27 : goto onError; \
28 : } \
29 : else \
30 : Py_DECREF(sub);
31 :
32 : #define SPLIT_ADD(data, left, right) { \
33 : sub = STRINGLIB_NEW((data) + (left), \
34 : (right) - (left)); \
35 : if (sub == NULL) \
36 : goto onError; \
37 : if (count < MAX_PREALLOC) { \
38 : PyList_SET_ITEM(list, count, sub); \
39 : } else { \
40 : if (PyList_Append(list, sub)) { \
41 : Py_DECREF(sub); \
42 : goto onError; \
43 : } \
44 : else \
45 : Py_DECREF(sub); \
46 : } \
47 : count++; }
48 :
49 :
50 : /* Always force the list to the expected size. */
51 : #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
52 :
53 : Py_LOCAL_INLINE(PyObject *)
54 478353 : STRINGLIB(split_whitespace)(PyObject* str_obj,
55 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
56 : Py_ssize_t maxcount)
57 : {
58 478353 : Py_ssize_t i, j, count=0;
59 478353 : PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
60 : PyObject *sub;
61 :
62 478353 : if (list == NULL)
63 0 : return NULL;
64 :
65 478353 : i = j = 0;
66 1679330 : while (maxcount-- > 0) {
67 3136264 : while (i < str_len && STRINGLIB_ISSPACE(str[i]))
68 1458644 : i++;
69 1677620 : if (i == str_len) break;
70 1279562 : j = i; i++;
71 9281332 : while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
72 8001770 : i++;
73 : #if !STRINGLIB_MUTABLE
74 1279375 : if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
75 : /* No whitespace in str_obj, so just use it as list[0] */
76 78584 : Py_INCREF(str_obj);
77 78584 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
78 78584 : count++;
79 78584 : break;
80 : }
81 : #endif
82 1200983 : SPLIT_ADD(str, j, i);
83 : }
84 :
85 478353 : if (i < str_len) {
86 : /* Only occurs when maxcount was reached */
87 : /* Skip any remaining whitespace and copy to end of string */
88 3428 : while (i < str_len && STRINGLIB_ISSPACE(str[i]))
89 1744 : i++;
90 1684 : if (i != str_len)
91 1678 : SPLIT_ADD(str, i, str_len);
92 : }
93 478353 : FIX_PREALLOC_SIZE(list);
94 478353 : return list;
95 :
96 0 : onError:
97 0 : Py_DECREF(list);
98 0 : return NULL;
99 : }
100 :
101 : Py_LOCAL_INLINE(PyObject *)
102 1324566 : STRINGLIB(split_char)(PyObject* str_obj,
103 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
104 : const STRINGLIB_CHAR ch,
105 : Py_ssize_t maxcount)
106 : {
107 1324566 : Py_ssize_t i, j, count=0;
108 1324566 : PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
109 : PyObject *sub;
110 :
111 1324566 : if (list == NULL)
112 0 : return NULL;
113 :
114 1324566 : i = j = 0;
115 3795035 : while ((j < str_len) && (maxcount-- > 0)) {
116 22279857 : for(; j < str_len; j++) {
117 : /* I found that using memchr makes no difference */
118 21043322 : if (str[j] == ch) {
119 1233983 : SPLIT_ADD(str, i, j);
120 1233983 : i = j = j + 1;
121 1233983 : break;
122 : }
123 : }
124 : }
125 : #if !STRINGLIB_MUTABLE
126 1322452 : if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
127 : /* ch not in str_obj, so just use str_obj as list[0] */
128 552017 : Py_INCREF(str_obj);
129 552017 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
130 552017 : count++;
131 : } else
132 : #endif
133 772546 : if (i <= str_len) {
134 772546 : SPLIT_ADD(str, i, str_len);
135 : }
136 1324566 : FIX_PREALLOC_SIZE(list);
137 1324566 : return list;
138 :
139 0 : onError:
140 0 : Py_DECREF(list);
141 0 : return NULL;
142 : }
143 :
144 : Py_LOCAL_INLINE(PyObject *)
145 1883980 : STRINGLIB(split)(PyObject* str_obj,
146 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
147 : const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
148 : Py_ssize_t maxcount)
149 : {
150 1883980 : Py_ssize_t i, j, pos, count=0;
151 : PyObject *list, *sub;
152 :
153 1883980 : if (sep_len == 0) {
154 9 : PyErr_SetString(PyExc_ValueError, "empty separator");
155 9 : return NULL;
156 : }
157 1883976 : else if (sep_len == 1)
158 1324566 : return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
159 :
160 559413 : list = PyList_New(PREALLOC_SIZE(maxcount));
161 559413 : if (list == NULL)
162 0 : return NULL;
163 :
164 559413 : i = j = 0;
165 1014616 : while (maxcount-- > 0) {
166 1014368 : pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
167 1014368 : if (pos < 0)
168 559163 : break;
169 455204 : j = i + pos;
170 455204 : SPLIT_ADD(str, i, j);
171 455204 : i = j + sep_len;
172 : }
173 : #if !STRINGLIB_MUTABLE
174 559390 : if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
175 : /* No match in str_obj, so just use it as list[0] */
176 141554 : Py_INCREF(str_obj);
177 141554 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
178 141554 : count++;
179 : } else
180 : #endif
181 : {
182 417859 : SPLIT_ADD(str, i, str_len);
183 : }
184 559413 : FIX_PREALLOC_SIZE(list);
185 559413 : return list;
186 :
187 0 : onError:
188 0 : Py_DECREF(list);
189 0 : return NULL;
190 : }
191 :
192 : Py_LOCAL_INLINE(PyObject *)
193 155 : STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
194 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
195 : Py_ssize_t maxcount)
196 : {
197 155 : Py_ssize_t i, j, count=0;
198 155 : PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
199 : PyObject *sub;
200 :
201 155 : if (list == NULL)
202 0 : return NULL;
203 :
204 155 : i = j = str_len - 1;
205 586 : while (maxcount-- > 0) {
206 1176 : while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
207 670 : i--;
208 506 : if (i < 0) break;
209 431 : j = i; i--;
210 812 : while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
211 381 : i--;
212 : #if !STRINGLIB_MUTABLE
213 322 : if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
214 : /* No whitespace in str_obj, so just use it as list[0] */
215 0 : Py_INCREF(str_obj);
216 0 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
217 0 : count++;
218 0 : break;
219 : }
220 : #endif
221 431 : SPLIT_ADD(str, i + 1, j + 1);
222 : }
223 :
224 155 : if (i >= 0) {
225 : /* Only occurs when maxcount was reached */
226 : /* Skip any remaining whitespace and copy to beginning of string */
227 168 : while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
228 112 : i--;
229 56 : if (i >= 0)
230 52 : SPLIT_ADD(str, 0, i + 1);
231 : }
232 155 : FIX_PREALLOC_SIZE(list);
233 155 : if (PyList_Reverse(list) < 0)
234 0 : goto onError;
235 155 : return list;
236 :
237 0 : onError:
238 0 : Py_DECREF(list);
239 0 : return NULL;
240 : }
241 :
242 : Py_LOCAL_INLINE(PyObject *)
243 3715 : STRINGLIB(rsplit_char)(PyObject* str_obj,
244 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
245 : const STRINGLIB_CHAR ch,
246 : Py_ssize_t maxcount)
247 : {
248 3715 : Py_ssize_t i, j, count=0;
249 3715 : PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
250 : PyObject *sub;
251 :
252 3715 : if (list == NULL)
253 0 : return NULL;
254 :
255 3715 : i = j = str_len - 1;
256 7641 : while ((i >= 0) && (maxcount-- > 0)) {
257 31802 : for(; i >= 0; i--) {
258 31740 : if (str[i] == ch) {
259 3864 : SPLIT_ADD(str, i + 1, j + 1);
260 3864 : j = i = i - 1;
261 3864 : break;
262 : }
263 : }
264 : }
265 : #if !STRINGLIB_MUTABLE
266 3695 : if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
267 : /* ch not in str_obj, so just use str_obj as list[0] */
268 48 : Py_INCREF(str_obj);
269 48 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
270 48 : count++;
271 : } else
272 : #endif
273 3667 : if (j >= -1) {
274 3667 : SPLIT_ADD(str, 0, j + 1);
275 : }
276 3715 : FIX_PREALLOC_SIZE(list);
277 3715 : if (PyList_Reverse(list) < 0)
278 0 : goto onError;
279 3715 : return list;
280 :
281 0 : onError:
282 0 : Py_DECREF(list);
283 0 : return NULL;
284 : }
285 :
286 : Py_LOCAL_INLINE(PyObject *)
287 3826 : STRINGLIB(rsplit)(PyObject* str_obj,
288 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
289 : const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
290 : Py_ssize_t maxcount)
291 : {
292 3826 : Py_ssize_t j, pos, count=0;
293 : PyObject *list, *sub;
294 :
295 3826 : if (sep_len == 0) {
296 8 : PyErr_SetString(PyExc_ValueError, "empty separator");
297 8 : return NULL;
298 : }
299 3818 : else if (sep_len == 1)
300 3715 : return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
301 :
302 103 : list = PyList_New(PREALLOC_SIZE(maxcount));
303 103 : if (list == NULL)
304 0 : return NULL;
305 :
306 103 : j = str_len;
307 444 : while (maxcount-- > 0) {
308 412 : pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
309 412 : if (pos < 0)
310 71 : break;
311 341 : SPLIT_ADD(str, pos + sep_len, j);
312 341 : j = pos;
313 : }
314 : #if !STRINGLIB_MUTABLE
315 80 : if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
316 : /* No match in str_obj, so just use it as list[0] */
317 17 : Py_INCREF(str_obj);
318 17 : PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
319 17 : count++;
320 : } else
321 : #endif
322 : {
323 86 : SPLIT_ADD(str, 0, j);
324 : }
325 103 : FIX_PREALLOC_SIZE(list);
326 103 : if (PyList_Reverse(list) < 0)
327 0 : goto onError;
328 103 : return list;
329 :
330 0 : onError:
331 0 : Py_DECREF(list);
332 0 : return NULL;
333 : }
334 :
335 : Py_LOCAL_INLINE(PyObject *)
336 331905 : STRINGLIB(splitlines)(PyObject* str_obj,
337 : const STRINGLIB_CHAR* str, Py_ssize_t str_len,
338 : int keepends)
339 : {
340 : /* This does not use the preallocated list because splitlines is
341 : usually run with hundreds of newlines. The overhead of
342 : switching between PyList_SET_ITEM and append causes about a
343 : 2-3% slowdown for that common case. A smarter implementation
344 : could move the if check out, so the SET_ITEMs are done first
345 : and the appends only done when the prealloc buffer is full.
346 : That's too much work for little gain.*/
347 :
348 : Py_ssize_t i;
349 : Py_ssize_t j;
350 331905 : PyObject *list = PyList_New(0);
351 : PyObject *sub;
352 :
353 331905 : if (list == NULL)
354 0 : return NULL;
355 :
356 1772376 : for (i = j = 0; i < str_len; ) {
357 : Py_ssize_t eol;
358 :
359 : /* Find a line and append it */
360 56811404 : while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
361 55190593 : i++;
362 :
363 : /* Skip the line break reading CRLF as one line break */
364 1620773 : eol = i;
365 1620773 : if (i < str_len) {
366 1407887 : if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
367 33858 : i += 2;
368 : else
369 1374029 : i++;
370 1407887 : if (keepends)
371 1217421 : eol = i;
372 : }
373 : #if !STRINGLIB_MUTABLE
374 1620513 : if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
375 : /* No linebreak in str_obj, so just use it as list[0] */
376 180302 : if (PyList_Append(list, str_obj))
377 0 : goto onError;
378 180302 : break;
379 : }
380 : #endif
381 1440471 : SPLIT_APPEND(str, j, eol);
382 1440471 : j = i;
383 : }
384 331905 : return list;
385 :
386 0 : onError:
387 0 : Py_DECREF(list);
388 0 : return NULL;
389 : }
390 :
|