New submission from Skip Montanaro:

The topic of avoiding string copies in certain string methods came up on
the ChiPy list:

  http://mail.python.org/pipermail/chicago/2007-December/002975.html

The attached patch modifies the split and rsplit implementations to avoid
making a copy of self when the split fails to find anything to split on:

    >>> s = "abc def"
    >>> x = s.split(';')
    >>> x[0] is s
    True
    >>> y = s.rsplit('-')
    >>> y[0] is s
    True
    >>> t = "abcdef"
    >>> x = t.split()
    >>> x[0] is t
    True
    >>> y = t.rsplit()
    >>> y[0] is t
    True

All tests pass.  Given that this is just a small optimization I
don't believe any changes to the docs or the existing tests are
necessary.

----------
components: Interpreter Core
files: string-split.patch
keywords: patch
messages: 58081
nosy: skip.montanaro
priority: normal
severity: normal
status: open
title: Avoid string copy when split char doesn't match
type: rfe
versions: Python 2.6
Added file: http://bugs.python.org/file8851/string-split.patch

__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1538>
__________________________________
*** /tmp/skip/ediffWFAoxm       Sun Dec  2 01:28:32 2007
--- /Users/skip/src/python/trunk/Objects/stringobject.c Sun Dec  2 01:27:56 2007
***************
*** 1403,1410 ****
  #define RSKIP_NONSPACE(s, i)     { while (i>=0  && !isspace(Py_CHARMASK(s[i]))) i--; }
  
  Py_LOCAL_INLINE(PyObject *)
! split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
  {
        Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
--- 1403,1411 ----
  #define RSKIP_NONSPACE(s, i)     { while (i>=0  && !isspace(Py_CHARMASK(s[i]))) i--; }
  
  Py_LOCAL_INLINE(PyObject *)
! split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
  {
+       const char *s = PyString_AS_STRING(self);
        Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
***************
*** 1419,1424 ****
--- 1420,1432 ----
                if (i==len) break;
                j = i; i++;
                SKIP_NONSPACE(s, i, len);
+               if (j == 0 && i == len) {
+                       /* No whitespace in self, so just use it as list[0] */
+                       Py_INCREF(self);
+                       PyList_SET_ITEM(list, 0, (PyObject *)self);
+                       count++;
+                       break;
+               }
                SPLIT_ADD(s, j, i);
        }
  
***************
*** 1437,1444 ****
  }
  
  Py_LOCAL_INLINE(PyObject *)
! split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
  {
        register Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
--- 1445,1453 ----
  }
  
  Py_LOCAL_INLINE(PyObject *)
! split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
  {
+       const char *s = PyString_AS_STRING(self);
        register Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
***************
*** 1457,1463 ****
                        }
                }
        }
!       if (i <= len) {
                SPLIT_ADD(s, i, len);
        }
        FIX_PREALLOC_SIZE(list);
--- 1466,1478 ----
                        }
                }
        }
!       if (i == 0 && count == 0) {
!               /* ch not in self, so just use self as list[0] */
!               Py_INCREF(self);
!               PyList_SET_ITEM(list, 0, (PyObject *)self);
!               count++;
!       }
!       else if (i <= len) {
                SPLIT_ADD(s, i, len);
        }
        FIX_PREALLOC_SIZE(list);
***************
*** 1492,1498 ****
        if (maxsplit < 0)
                maxsplit = PY_SSIZE_T_MAX;
        if (subobj == Py_None)
!               return split_whitespace(s, len, maxsplit);
        if (PyString_Check(subobj)) {
                sub = PyString_AS_STRING(subobj);
                n = PyString_GET_SIZE(subobj);
--- 1507,1513 ----
        if (maxsplit < 0)
                maxsplit = PY_SSIZE_T_MAX;
        if (subobj == Py_None)
!               return split_whitespace(self, len, maxsplit);
        if (PyString_Check(subobj)) {
                sub = PyString_AS_STRING(subobj);
                n = PyString_GET_SIZE(subobj);
***************
*** 1509,1515 ****
                return NULL;
        }
        else if (n == 1)
!               return split_char(s, len, sub[0], maxsplit);
  
        list = PyList_New(PREALLOC_SIZE(maxsplit));
        if (list == NULL)
--- 1524,1530 ----
                return NULL;
        }
        else if (n == 1)
!               return split_char(self, len, sub[0], maxsplit);
  
        list = PyList_New(PREALLOC_SIZE(maxsplit));
        if (list == NULL)
***************
*** 1609,1616 ****
  }
  
  Py_LOCAL_INLINE(PyObject *)
! rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
  {
        Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
--- 1624,1632 ----
  }
  
  Py_LOCAL_INLINE(PyObject *)
! rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
  {
+       const char *s = PyString_AS_STRING(self);
        Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
***************
*** 1625,1630 ****
--- 1641,1653 ----
                if (i<0) break;
                j = i; i--;
                RSKIP_NONSPACE(s, i);
+               if (j == len-1 && i < 0) {
+                       /* No whitespace in self, so just use it as list[0] */
+                       Py_INCREF(self);
+                       PyList_SET_ITEM(list, 0, (PyObject *)self);
+                       count++;
+                       break;
+               }
                SPLIT_ADD(s, i + 1, j + 1);
        }
        if (i >= 0) {
***************
*** 1645,1652 ****
  }
  
  Py_LOCAL_INLINE(PyObject *)
! rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
  {
        register Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
--- 1668,1676 ----
  }
  
  Py_LOCAL_INLINE(PyObject *)
! rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
  {
+       const char *s = PyString_AS_STRING(self);
        register Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
***************
*** 1664,1670 ****
                        }
                }
        }
!       if (j >= -1) {
                SPLIT_ADD(s, 0, j + 1);
        }
        FIX_PREALLOC_SIZE(list);
--- 1688,1700 ----
                        }
                }
        }
!       if (i < 0 && count == 0) {
!               /* ch not in self, so just use self as list[0] */
!               Py_INCREF(self);
!               PyList_SET_ITEM(list, 0, (PyObject *)self);
!               count++;
!       }
!       else if (j >= -1) {
                SPLIT_ADD(s, 0, j + 1);
        }
        FIX_PREALLOC_SIZE(list);
***************
*** 1691,1697 ****
  {
        Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
        Py_ssize_t maxsplit = -1, count=0;
!       const char *s = PyString_AS_STRING(self), *sub;
        PyObject *list, *str, *subobj = Py_None;
  
        if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
--- 1721,1727 ----
  {
        Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
        Py_ssize_t maxsplit = -1, count=0;
!       const char *s, *sub;
        PyObject *list, *str, *subobj = Py_None;
  
        if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
***************
*** 1699,1705 ****
        if (maxsplit < 0)
                maxsplit = PY_SSIZE_T_MAX;
        if (subobj == Py_None)
!               return rsplit_whitespace(s, len, maxsplit);
        if (PyString_Check(subobj)) {
                sub = PyString_AS_STRING(subobj);
                n = PyString_GET_SIZE(subobj);
--- 1729,1735 ----
        if (maxsplit < 0)
                maxsplit = PY_SSIZE_T_MAX;
        if (subobj == Py_None)
!               return rsplit_whitespace(self, len, maxsplit);
        if (PyString_Check(subobj)) {
                sub = PyString_AS_STRING(subobj);
                n = PyString_GET_SIZE(subobj);
***************
*** 1716,1722 ****
                return NULL;
        }
        else if (n == 1)
!               return rsplit_char(s, len, sub[0], maxsplit);
  
        list = PyList_New(PREALLOC_SIZE(maxsplit));
        if (list == NULL)
--- 1746,1752 ----
                return NULL;
        }
        else if (n == 1)
!               return rsplit_char(self, len, sub[0], maxsplit);
  
        list = PyList_New(PREALLOC_SIZE(maxsplit));
        if (list == NULL)
***************
*** 1725,1730 ****
--- 1755,1761 ----
        j = len;
        i = j - n;
  
+       s = PyString_AS_STRING(self);
        while ( (i >= 0) && (maxsplit-- > 0) ) {
                for (; i>=0; i--) {
                        if (Py_STRING_MATCH(s, i, sub, n)) {
_______________________________________________
Python-bugs-list mailing list 
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to