On Thu, Jul 17, 2014 at 03:44:23AM +0600, Mikhail Korobov wrote: > So making code 3.x compatible by ditching cStringIO can cause a serious > performance/memory regressions. One can change the code to build the data > using BytesIO (without creating bytes objects in the first place), but that is > not always possible or convenient. > > I believe this problem affects tornado (https://github.com/tornadoweb/tornado/ > Do you know if there a workaround? Maybe there is some stdlib part that I'm > missing, or a module on PyPI? It is not that hard to write an own wrapper that > won't do copies (or to port [c]StringIO to 3.x), but I wonder if there is an > existing solution or plans to fix it in Python itself - this BytesIO use case > looks quite important.
Regarding a fix, the problem seems mostly that the StringI/StringO specializations were removed, and the new implementation is basically just a StringO. At a small cost to memory, it is easy to add a Py_buffer source and flags variable to the bytesio struct, with the buffers initially setup for reading, and if a mutation method is called, check for a copy-on-write flag, duplicate the source object into private memory, then continue operating as it does now. Attached is a (rough) patch implementing this, feel free to try it with hg tip. [23:03:44 k2!124 cpython] cat i.py import io buf = b'x' * (1048576 * 16) def x(): io.BytesIO(buf) [23:03:51 k2!125 cpython] ./python -m timeit -s 'import i' 'i.x()' 100 loops, best of 3: 2.9 msec per loop [23:03:57 k2!126 cpython] ./python-cow -m timeit -s 'import i' 'i.x()' 1000000 loops, best of 3: 0.364 usec per loop David diff --git a/Modules/_io/bytesio.c b/Modules/_io/bytesio.c --- a/Modules/_io/bytesio.c +++ b/Modules/_io/bytesio.c @@ -2,6 +2,12 @@ #include "structmember.h" /* for offsetof() */ #include "_iomodule.h" +enum io_flags { + /* initvalue describes a borrowed buffer we cannot modify and must later + * release */ + IO_SHARED = 1 +}; + typedef struct { PyObject_HEAD char *buf; @@ -11,6 +17,10 @@ PyObject *dict; PyObject *weakreflist; Py_ssize_t exports; + Py_buffer initvalue; + /* If IO_SHARED, indicates PyBuffer_release(initvalue) required, and that + * we don't own buf. */ + enum io_flags flags; } bytesio; typedef struct { @@ -33,6 +43,47 @@ return NULL; \ } +/* Unshare our buffer in preparation for writing, in the case that an + * initvalue object was provided, and we're currently borrowing its buffer. + * size indicates the total reserved buffer size allocated as part of + * unsharing, to avoid a potentially redundant allocation in the subsequent + * mutation. + */ +static int +unshare(bytesio *self, size_t size) +{ + Py_ssize_t new_size = size; + Py_ssize_t copy_size = size; + char *new_buf; + + /* Do nothing if buffer wasn't shared */ + if (! (self->flags & IO_SHARED)) { + return 0; + } + + /* If hint provided, adjust our new buffer size and truncate the amount of + * source buffer we copy as necessary. */ + if (size > copy_size) { + copy_size = size; + } + + /* Allocate or fail. */ + new_buf = (char *)PyMem_Malloc(new_size); + if (new_buf == NULL) { + PyErr_NoMemory(); + return -1; + } + + /* Copy the (possibly now truncated) source string to the new buffer, and + * forget any reference used to keep the source buffer alive. */ + memcpy(new_buf, self->buf, copy_size); + PyBuffer_Release(&self->initvalue); + self->flags &= ~IO_SHARED; + self->buf = new_buf; + self->buf_size = new_size; + self->string_size = (Py_ssize_t) copy_size; + return 0; +} /* Internal routine to get a line from the buffer of a BytesIO object. Returns the length between the current position to the @@ -125,11 +176,18 @@ static Py_ssize_t write_bytes(bytesio *self, const char *bytes, Py_ssize_t len) { + size_t desired; + assert(self->buf != NULL); assert(self->pos >= 0); assert(len >= 0); - if ((size_t)self->pos + len > self->buf_size) { + desired = (size_t)self->pos + len; + if (unshare(self, desired)) { + return -1; + } + + if (desired > self->buf_size) { if (resize_buffer(self, (size_t)self->pos + len) < 0) return -1; } @@ -502,6 +560,10 @@ return NULL; } + if (unshare(self, size)) { + return NULL; + } + if (size < self->string_size) { self->string_size = size; if (resize_buffer(self, size) < 0) @@ -655,10 +717,13 @@ static PyObject * bytesio_close(bytesio *self) { - if (self->buf != NULL) { + if (self->flags & IO_SHARED) { + PyBuffer_Release(&self->initvalue); + self->flags &= ~IO_SHARED; + } else if (self->buf != NULL) { PyMem_Free(self->buf); - self->buf = NULL; } + self->buf = NULL; Py_RETURN_NONE; } @@ -788,10 +853,17 @@ "deallocated BytesIO object has exported buffers"); PyErr_Print(); } - if (self->buf != NULL) { + + if (self->flags & IO_SHARED) { + /* We borrowed buf from another object */ + PyBuffer_Release(&self->initvalue); + self->flags &= ~IO_SHARED; + } else if (self->buf != NULL) { + /* We owned buf */ PyMem_Free(self->buf); - self->buf = NULL; } + self->buf = NULL; + Py_CLEAR(self->dict); if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self); @@ -811,12 +883,6 @@ /* tp_alloc initializes all the fields to zero. So we don't have to initialize them here. */ - self->buf = (char *)PyMem_Malloc(0); - if (self->buf == NULL) { - Py_DECREF(self); - return PyErr_NoMemory(); - } - return (PyObject *)self; } @@ -834,13 +900,32 @@ self->string_size = 0; self->pos = 0; + /* Release any previous initvalue. */ + if (self->flags & IO_SHARED) { + PyBuffer_Release(&self->initvalue); + self->buf = NULL; + self->flags &= ~IO_SHARED; + } + if (initvalue && initvalue != Py_None) { - PyObject *res; - res = bytesio_write(self, initvalue); - if (res == NULL) + Py_buffer *buf = &self->initvalue; + if (PyObject_GetBuffer(initvalue, buf, PyBUF_CONTIG_RO) < 0) { return -1; - Py_DECREF(res); - self->pos = 0; + } + self->buf = self->initvalue.buf; + self->buf_size = (size_t)self->initvalue.len; + self->string_size = self->initvalue.len; + self->flags |= IO_SHARED; + } + + /* If no initvalue provided, prepare a private buffer now. */ + if (self->buf == NULL) { + self->buf = (char *)PyMem_Malloc(0); + if (self->buf == NULL) { + Py_DECREF(self); + PyErr_NoMemory(); + return -1; + } } return 0; _______________________________________________ Python-Dev mailing list Python-Dev@python.org https://mail.python.org/mailman/listinfo/python-dev Unsubscribe: https://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com