Serhiy Storchaka added the comment:
After much experimentation, I suggest the new patch.
Benchmark results (time of replacing 1 of n character (ch1 to ch2) in 100000-
char string).
Py3.2 Py3.3 patch n ch1 ch2 fill
231 (-13%) 3025 (-93%) 200 1 'a' 'b' 'c'
626 (-18%) 2035 (-75%) 511 2 'a' 'b' 'c'
444 (-26%) 957 (-66%) 327 5 'a' 'b' 'c'
349 (-30%) 530 (-54%) 243 10 'a' 'b' 'c'
306 (-40%) 300 (-38%) 185 20 'a' 'b' 'c'
280 (-54%) 169 (-23%) 130 50 'a' 'b' 'c'
273 (-62%) 123 (-15%) 105 100 'a' 'b' 'c'
265 (-70%) 82 (-4%) 79 1000 'a' 'b' 'c'
230 (+4%) 3012 (-92%) 239 1 '\u010a' '\u010b' '\u010c'
624 (-17%) 1907 (-73%) 518 2 '\u010a' '\u010b' '\u010c'
442 (-16%) 962 (-62%) 370 5 '\u010a' '\u010b' '\u010c'
347 (-5%) 566 (-42%) 330 10 '\u010a' '\u010b' '\u010c'
305 (-10%) 357 (-23%) 275 20 '\u010a' '\u010b' '\u010c'
285 (-26%) 241 (-12%) 212 50 '\u010a' '\u010b' '\u010c'
280 (-33%) 190 (-2%) 187 100 '\u010a' '\u010b' '\u010c'
263 (-41%) 170 (-8%) 156 1000 '\u010a' '\u010b' '\u010c'
3355 (-85%) 3309 (-85%) 498 1 '\U0001000a' '\U0001000b' '\U0001000c'
2290 (-65%) 2267 (-65%) 800 2 '\U0001000a' '\U0001000b' '\U0001000c'
1598 (-62%) 1279 (-52%) 612 5 '\U0001000a' '\U0001000b' '\U0001000c'
1313 (-60%) 950 (-45%) 519 10 '\U0001000a' '\U0001000b' '\U0001000c'
1195 (-61%) 824 (-44%) 464 20 '\U0001000a' '\U0001000b' '\U0001000c'
1055 (-59%) 640 (-32%) 434 50 '\U0001000a' '\U0001000b' '\U0001000c'
982 (-55%) 549 (-20%) 439 100 '\U0001000a' '\U0001000b' '\U0001000c'
941 (-56%) 473 (-12%) 417 1000 '\U0001000a' '\U0001000b' '\U0001000c'
On other platforms other numbers are possible. Especially I'm interested in
the results on Windows and on 64-bit. For the test I used the script
replacebench2.py, and compared the results with the help of script
https://bitbucket.org/storchaka/cpython-stuff/raw/default/bench/bench-diff.py
.
----------
Added file: http://bugs.python.org/file27553/str_replace_1char.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16061>
_______________________________________
diff -r 6fb1fb550319 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Thu Oct 11 18:59:34 2012 -0700
+++ b/Objects/unicodeobject.c Sat Oct 13 20:19:47 2012 +0300
@@ -9881,6 +9881,80 @@
return 0;
}
+static void
+replace_1char_inplace(PyObject *u, Py_ssize_t pos, Py_UCS4 u1, Py_UCS4 u2,
Py_ssize_t maxcount)
+{
+ int kind = PyUnicode_KIND(u);
+ void *data = PyUnicode_DATA(u);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(u);
+ if (kind == PyUnicode_1BYTE_KIND) {
+ Py_UCS1 *s = (Py_UCS1 *)data;
+ Py_UCS1 *end = s + len;
+ s += pos;
+ *s = u2;
+ while (--maxcount && ++s != end) {
+ if (*s != u1) {
+ int attempts = 10;
+ while (1) {
+ if (++s == end)
+ return;
+ if (*s == u1)
+ break;
+ if (!--attempts) {
+ s++;
+ s = memchr(s, u1, end - s);
+ if (s == NULL)
+ return;
+ break;
+ }
+ }
+ }
+ *s = u2;
+ }
+ }
+ else if (kind == PyUnicode_2BYTE_KIND) {
+ Py_UCS2 *s = (Py_UCS2 *)data;
+ Py_UCS2 *end = s + len;
+ s += pos;
+ *s = u2;
+ while (--maxcount && ++s != end) {
+ if (*s != u1) {
+ int attempts = 10;
+ while (1) {
+ if (++s == end)
+ return;
+ if (*s == u1)
+ break;
+ if (!--attempts) {
+ Py_ssize_t i;
+ s++;
+ i = findchar(s, PyUnicode_2BYTE_KIND, end - s, u1, 1);
+ if (i < 0)
+ return;
+ s += i;
+ break;
+ }
+ }
+ }
+ *s = u2;
+ }
+ }
+ else {
+ Py_UCS4 *s = (Py_UCS4 *)data;
+ Py_UCS4 *end = s + len;
+ s += pos;
+ while (1) {
+ *s = u2;
+ if (!--maxcount)
+ return;
+ do {
+ if (++s == end)
+ return;
+ } while (*s != u1);
+ }
+ }
+}
+
static PyObject *
replace(PyObject *self, PyObject *str1,
PyObject *str2, Py_ssize_t maxcount)
@@ -9897,7 +9971,7 @@
Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
int mayshrink;
- Py_UCS4 maxchar, maxchar_str2;
+ Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
@@ -9906,15 +9980,16 @@
if (str1 == str2)
goto nothing;
- if (skind < kind1)
+
+ maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+ maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
+ if (maxchar < maxchar_str1)
/* substring too wide to be present */
goto nothing;
-
- maxchar = PyUnicode_MAX_CHAR_VALUE(self);
maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
/* Replacing str1 with str2 may cause a maxchar reduction in the
result string. */
- mayshrink = (maxchar_str2 < maxchar);
+ mayshrink = (maxchar_str2 < maxchar_str1);
maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
if (len1 == len2) {
@@ -9924,35 +9999,19 @@
if (len1 == 1) {
/* replace characters */
Py_UCS4 u1, u2;
- int rkind;
- Py_ssize_t index, pos;
- char *src;
+ Py_ssize_t pos;
u1 = PyUnicode_READ_CHAR(str1, 0);
- pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
+ pos = findchar(sbuf, skind, slen, u1, 1);
if (pos < 0)
goto nothing;
u2 = PyUnicode_READ_CHAR(str2, 0);
u = PyUnicode_New(slen, maxchar);
if (!u)
goto error;
+
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
- rkind = PyUnicode_KIND(u);
-
- PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
- index = 0;
- src = sbuf;
- while (--maxcount)
- {
- pos++;
- src += pos * PyUnicode_KIND(self);
- slen -= pos;
- index += pos;
- pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
- if (pos < 0)
- break;
- PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
- }
+ replace_1char_inplace(u, pos, u1, u2, maxcount);
}
else {
int rkind = skind;
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com