LCOV - code coverage report
Current view: top level - Objects/stringlib - split.h (source / functions) Hit Total Coverage
Test: CPython lcov report Lines: 153 189 81.0 %
Date: 2022-07-07 18:19:46 Functions: 31 35 88.6 %

          Line data    Source code
       1             : /* stringlib: split implementation */
       2             : 
       3             : #ifndef STRINGLIB_FASTSEARCH_H
       4             : #error must include "stringlib/fastsearch.h" before including this module
       5             : #endif
       6             : 
       7             : /* Overallocate the initial list to reduce the number of reallocs for small
       8             :    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
       9             :    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
      10             :    text (roughly 11 words per line) and field delimited data (usually 1-10
      11             :    fields).  For large strings the split algorithms are bandwidth limited
      12             :    so increasing the preallocation likely will not improve things.*/
      13             : 
      14             : #define MAX_PREALLOC 12
      15             : 
      16             : /* 5 splits gives 6 elements */
      17             : #define PREALLOC_SIZE(maxsplit) \
      18             :     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
      19             : 
      20             : #define SPLIT_APPEND(data, left, right)         \
      21             :     sub = STRINGLIB_NEW((data) + (left),        \
      22             :                         (right) - (left));      \
      23             :     if (sub == NULL)                            \
      24             :         goto onError;                           \
      25             :     if (PyList_Append(list, sub)) {             \
      26             :         Py_DECREF(sub);                         \
      27             :         goto onError;                           \
      28             :     }                                           \
      29             :     else                                        \
      30             :         Py_DECREF(sub);
      31             : 
      32             : #define SPLIT_ADD(data, left, right) {          \
      33             :     sub = STRINGLIB_NEW((data) + (left),        \
      34             :                         (right) - (left));      \
      35             :     if (sub == NULL)                            \
      36             :         goto onError;                           \
      37             :     if (count < MAX_PREALLOC) {                 \
      38             :         PyList_SET_ITEM(list, count, sub);      \
      39             :     } else {                                    \
      40             :         if (PyList_Append(list, sub)) {         \
      41             :             Py_DECREF(sub);                     \
      42             :             goto onError;                       \
      43             :         }                                       \
      44             :         else                                    \
      45             :             Py_DECREF(sub);                     \
      46             :     }                                           \
      47             :     count++; }
      48             : 
      49             : 
      50             : /* Always force the list to the expected size. */
      51             : #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
      52             : 
      53             : Py_LOCAL_INLINE(PyObject *)
      54      478353 : STRINGLIB(split_whitespace)(PyObject* str_obj,
      55             :                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
      56             :                            Py_ssize_t maxcount)
      57             : {
      58      478353 :     Py_ssize_t i, j, count=0;
      59      478353 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
      60             :     PyObject *sub;
      61             : 
      62      478353 :     if (list == NULL)
      63           0 :         return NULL;
      64             : 
      65      478353 :     i = j = 0;
      66     1679330 :     while (maxcount-- > 0) {
      67     3136264 :         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      68     1458644 :             i++;
      69     1677620 :         if (i == str_len) break;
      70     1279562 :         j = i; i++;
      71     9281332 :         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
      72     8001770 :             i++;
      73             : #if !STRINGLIB_MUTABLE
      74     1279375 :         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
      75             :             /* No whitespace in str_obj, so just use it as list[0] */
      76       78584 :             Py_INCREF(str_obj);
      77       78584 :             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
      78       78584 :             count++;
      79       78584 :             break;
      80             :         }
      81             : #endif
      82     1200983 :         SPLIT_ADD(str, j, i);
      83             :     }
      84             : 
      85      478353 :     if (i < str_len) {
      86             :         /* Only occurs when maxcount was reached */
      87             :         /* Skip any remaining whitespace and copy to end of string */
      88        3428 :         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      89        1744 :             i++;
      90        1684 :         if (i != str_len)
      91        1678 :             SPLIT_ADD(str, i, str_len);
      92             :     }
      93      478353 :     FIX_PREALLOC_SIZE(list);
      94      478353 :     return list;
      95             : 
      96           0 :   onError:
      97           0 :     Py_DECREF(list);
      98           0 :     return NULL;
      99             : }
     100             : 
     101             : Py_LOCAL_INLINE(PyObject *)
     102     1324566 : STRINGLIB(split_char)(PyObject* str_obj,
     103             :                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     104             :                      const STRINGLIB_CHAR ch,
     105             :                      Py_ssize_t maxcount)
     106             : {
     107     1324566 :     Py_ssize_t i, j, count=0;
     108     1324566 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     109             :     PyObject *sub;
     110             : 
     111     1324566 :     if (list == NULL)
     112           0 :         return NULL;
     113             : 
     114     1324566 :     i = j = 0;
     115     3795035 :     while ((j < str_len) && (maxcount-- > 0)) {
     116    22279857 :         for(; j < str_len; j++) {
     117             :             /* I found that using memchr makes no difference */
     118    21043322 :             if (str[j] == ch) {
     119     1233983 :                 SPLIT_ADD(str, i, j);
     120     1233983 :                 i = j = j + 1;
     121     1233983 :                 break;
     122             :             }
     123             :         }
     124             :     }
     125             : #if !STRINGLIB_MUTABLE
     126     1322452 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     127             :         /* ch not in str_obj, so just use str_obj as list[0] */
     128      552017 :         Py_INCREF(str_obj);
     129      552017 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     130      552017 :         count++;
     131             :     } else
     132             : #endif
     133      772546 :     if (i <= str_len) {
     134      772546 :         SPLIT_ADD(str, i, str_len);
     135             :     }
     136     1324566 :     FIX_PREALLOC_SIZE(list);
     137     1324566 :     return list;
     138             : 
     139           0 :   onError:
     140           0 :     Py_DECREF(list);
     141           0 :     return NULL;
     142             : }
     143             : 
     144             : Py_LOCAL_INLINE(PyObject *)
     145     1883980 : STRINGLIB(split)(PyObject* str_obj,
     146             :                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     147             :                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     148             :                 Py_ssize_t maxcount)
     149             : {
     150     1883980 :     Py_ssize_t i, j, pos, count=0;
     151             :     PyObject *list, *sub;
     152             : 
     153     1883980 :     if (sep_len == 0) {
     154           9 :         PyErr_SetString(PyExc_ValueError, "empty separator");
     155           9 :         return NULL;
     156             :     }
     157     1883976 :     else if (sep_len == 1)
     158     1324566 :         return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
     159             : 
     160      559413 :     list = PyList_New(PREALLOC_SIZE(maxcount));
     161      559413 :     if (list == NULL)
     162           0 :         return NULL;
     163             : 
     164      559413 :     i = j = 0;
     165     1014616 :     while (maxcount-- > 0) {
     166     1014368 :         pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
     167     1014368 :         if (pos < 0)
     168      559163 :             break;
     169      455204 :         j = i + pos;
     170      455204 :         SPLIT_ADD(str, i, j);
     171      455204 :         i = j + sep_len;
     172             :     }
     173             : #if !STRINGLIB_MUTABLE
     174      559390 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     175             :         /* No match in str_obj, so just use it as list[0] */
     176      141554 :         Py_INCREF(str_obj);
     177      141554 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     178      141554 :         count++;
     179             :     } else
     180             : #endif
     181             :     {
     182      417859 :         SPLIT_ADD(str, i, str_len);
     183             :     }
     184      559413 :     FIX_PREALLOC_SIZE(list);
     185      559413 :     return list;
     186             : 
     187           0 :   onError:
     188           0 :     Py_DECREF(list);
     189           0 :     return NULL;
     190             : }
     191             : 
     192             : Py_LOCAL_INLINE(PyObject *)
     193         155 : STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
     194             :                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     195             :                             Py_ssize_t maxcount)
     196             : {
     197         155 :     Py_ssize_t i, j, count=0;
     198         155 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     199             :     PyObject *sub;
     200             : 
     201         155 :     if (list == NULL)
     202           0 :         return NULL;
     203             : 
     204         155 :     i = j = str_len - 1;
     205         586 :     while (maxcount-- > 0) {
     206        1176 :         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     207         670 :             i--;
     208         506 :         if (i < 0) break;
     209         431 :         j = i; i--;
     210         812 :         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
     211         381 :             i--;
     212             : #if !STRINGLIB_MUTABLE
     213         322 :         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     214             :             /* No whitespace in str_obj, so just use it as list[0] */
     215           0 :             Py_INCREF(str_obj);
     216           0 :             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     217           0 :             count++;
     218           0 :             break;
     219             :         }
     220             : #endif
     221         431 :         SPLIT_ADD(str, i + 1, j + 1);
     222             :     }
     223             : 
     224         155 :     if (i >= 0) {
     225             :         /* Only occurs when maxcount was reached */
     226             :         /* Skip any remaining whitespace and copy to beginning of string */
     227         168 :         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     228         112 :             i--;
     229          56 :         if (i >= 0)
     230          52 :             SPLIT_ADD(str, 0, i + 1);
     231             :     }
     232         155 :     FIX_PREALLOC_SIZE(list);
     233         155 :     if (PyList_Reverse(list) < 0)
     234           0 :         goto onError;
     235         155 :     return list;
     236             : 
     237           0 :   onError:
     238           0 :     Py_DECREF(list);
     239           0 :     return NULL;
     240             : }
     241             : 
     242             : Py_LOCAL_INLINE(PyObject *)
     243        3715 : STRINGLIB(rsplit_char)(PyObject* str_obj,
     244             :                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     245             :                       const STRINGLIB_CHAR ch,
     246             :                       Py_ssize_t maxcount)
     247             : {
     248        3715 :     Py_ssize_t i, j, count=0;
     249        3715 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     250             :     PyObject *sub;
     251             : 
     252        3715 :     if (list == NULL)
     253           0 :         return NULL;
     254             : 
     255        3715 :     i = j = str_len - 1;
     256        7641 :     while ((i >= 0) && (maxcount-- > 0)) {
     257       31802 :         for(; i >= 0; i--) {
     258       31740 :             if (str[i] == ch) {
     259        3864 :                 SPLIT_ADD(str, i + 1, j + 1);
     260        3864 :                 j = i = i - 1;
     261        3864 :                 break;
     262             :             }
     263             :         }
     264             :     }
     265             : #if !STRINGLIB_MUTABLE
     266        3695 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     267             :         /* ch not in str_obj, so just use str_obj as list[0] */
     268          48 :         Py_INCREF(str_obj);
     269          48 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     270          48 :         count++;
     271             :     } else
     272             : #endif
     273        3667 :     if (j >= -1) {
     274        3667 :         SPLIT_ADD(str, 0, j + 1);
     275             :     }
     276        3715 :     FIX_PREALLOC_SIZE(list);
     277        3715 :     if (PyList_Reverse(list) < 0)
     278           0 :         goto onError;
     279        3715 :     return list;
     280             : 
     281           0 :   onError:
     282           0 :     Py_DECREF(list);
     283           0 :     return NULL;
     284             : }
     285             : 
     286             : Py_LOCAL_INLINE(PyObject *)
     287        3826 : STRINGLIB(rsplit)(PyObject* str_obj,
     288             :                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     289             :                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     290             :                  Py_ssize_t maxcount)
     291             : {
     292        3826 :     Py_ssize_t j, pos, count=0;
     293             :     PyObject *list, *sub;
     294             : 
     295        3826 :     if (sep_len == 0) {
     296           8 :         PyErr_SetString(PyExc_ValueError, "empty separator");
     297           8 :         return NULL;
     298             :     }
     299        3818 :     else if (sep_len == 1)
     300        3715 :         return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
     301             : 
     302         103 :     list = PyList_New(PREALLOC_SIZE(maxcount));
     303         103 :     if (list == NULL)
     304           0 :         return NULL;
     305             : 
     306         103 :     j = str_len;
     307         444 :     while (maxcount-- > 0) {
     308         412 :         pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
     309         412 :         if (pos < 0)
     310          71 :             break;
     311         341 :         SPLIT_ADD(str, pos + sep_len, j);
     312         341 :         j = pos;
     313             :     }
     314             : #if !STRINGLIB_MUTABLE
     315          80 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     316             :         /* No match in str_obj, so just use it as list[0] */
     317          17 :         Py_INCREF(str_obj);
     318          17 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     319          17 :         count++;
     320             :     } else
     321             : #endif
     322             :     {
     323          86 :         SPLIT_ADD(str, 0, j);
     324             :     }
     325         103 :     FIX_PREALLOC_SIZE(list);
     326         103 :     if (PyList_Reverse(list) < 0)
     327           0 :         goto onError;
     328         103 :     return list;
     329             : 
     330           0 :   onError:
     331           0 :     Py_DECREF(list);
     332           0 :     return NULL;
     333             : }
     334             : 
     335             : Py_LOCAL_INLINE(PyObject *)
     336      331905 : STRINGLIB(splitlines)(PyObject* str_obj,
     337             :                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     338             :                      int keepends)
     339             : {
     340             :     /* This does not use the preallocated list because splitlines is
     341             :        usually run with hundreds of newlines.  The overhead of
     342             :        switching between PyList_SET_ITEM and append causes about a
     343             :        2-3% slowdown for that common case.  A smarter implementation
     344             :        could move the if check out, so the SET_ITEMs are done first
     345             :        and the appends only done when the prealloc buffer is full.
     346             :        That's too much work for little gain.*/
     347             : 
     348             :     Py_ssize_t i;
     349             :     Py_ssize_t j;
     350      331905 :     PyObject *list = PyList_New(0);
     351             :     PyObject *sub;
     352             : 
     353      331905 :     if (list == NULL)
     354           0 :         return NULL;
     355             : 
     356     1772376 :     for (i = j = 0; i < str_len; ) {
     357             :         Py_ssize_t eol;
     358             : 
     359             :         /* Find a line and append it */
     360    56811404 :         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
     361    55190593 :             i++;
     362             : 
     363             :         /* Skip the line break reading CRLF as one line break */
     364     1620773 :         eol = i;
     365     1620773 :         if (i < str_len) {
     366     1407887 :             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
     367       33858 :                 i += 2;
     368             :             else
     369     1374029 :                 i++;
     370     1407887 :             if (keepends)
     371     1217421 :                 eol = i;
     372             :         }
     373             : #if !STRINGLIB_MUTABLE
     374     1620513 :         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
     375             :             /* No linebreak in str_obj, so just use it as list[0] */
     376      180302 :             if (PyList_Append(list, str_obj))
     377           0 :                 goto onError;
     378      180302 :             break;
     379             :         }
     380             : #endif
     381     1440471 :         SPLIT_APPEND(str, j, eol);
     382     1440471 :         j = i;
     383             :     }
     384      331905 :     return list;
     385             : 
     386           0 :   onError:
     387           0 :     Py_DECREF(list);
     388           0 :     return NULL;
     389             : }
     390             : 

Generated by: LCOV version 1.14