https://github.com/python/cpython/commit/f262297d525e87906c5e4ab28e80284189641c9e
commit: f262297d525e87906c5e4ab28e80284189641c9e
branch: main
author: Emma Smith <[email protected]>
committer: emmatyping <[email protected]>
date: 2025-10-14T10:03:55-07:00
summary:

gh-139877: Use PyBytesWriter in pycore_blocks_output_buffer.h (#139976)

Previously, the _BlocksOutputBuffer code created a list of bytes objects to 
handle the output data from compression libraries. This ended up being slow 
because the output buffer code needed to copy each bytes element of the list 
into the final bytes object buffer at the end of compression.

The new PyBytesWriter API introduced in PEP 782 is an ergonomic and fast method 
of writing data into a buffer that will later turn into a bytes object. 
Benchmarks show that using the PyBytesWriter API is 10-30% faster for 
decompression across a variety of settings. The performance gains are greatest 
when the decompressor is very performant, such as for Zstandard (and likely 
zlib-ng). Otherwise the decompressor can bottleneck decompression and the gains 
are more modest, but still sizable (e.g. 10% faster for zlib)!

Co-authored-by: Bénédikt Tran <[email protected]>

files:
M Include/internal/pycore_blocks_output_buffer.h
M Modules/_bz2module.c
M Modules/_lzmamodule.c
M Modules/_zstd/buffer.h
M Modules/_zstd/compressor.c
M Modules/_zstd/decompressor.c
M Modules/zlibmodule.c

diff --git a/Include/internal/pycore_blocks_output_buffer.h 
b/Include/internal/pycore_blocks_output_buffer.h
index 573e10359b7bd2..016e7a18665859 100644
--- a/Include/internal/pycore_blocks_output_buffer.h
+++ b/Include/internal/pycore_blocks_output_buffer.h
@@ -45,12 +45,14 @@ extern "C" {
 #endif
 
 typedef struct {
-    // List of bytes objects
-    PyObject *list;
+    // Bytes writer managing output buffer
+    PyBytesWriter *writer;
     // Number of whole allocated size
     Py_ssize_t allocated;
-    // Max length of the buffer, negative number means unlimited length.
+    // Max length of the buffer, negative number means unlimited length
     Py_ssize_t max_length;
+    // Number of blocks of bytes. Used to calculate next allocation size
+    size_t num_blocks;
 } _BlocksOutputBuffer;
 
 static const char unable_allocate_msg[] = "Unable to allocate output buffer.";
@@ -107,11 +109,10 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer 
*buffer,
                                 const Py_ssize_t max_length,
                                 void **next_out)
 {
-    PyObject *b;
     Py_ssize_t block_size;
 
-    // ensure .list was set to NULL
-    assert(buffer->list == NULL);
+    // ensure .writer was set to NULL
+    assert(buffer->writer == NULL);
 
     // get block size
     if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) {
@@ -120,25 +121,17 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer 
*buffer,
         block_size = BUFFER_BLOCK_SIZE[0];
     }
 
-    // the first block
-    b = PyBytes_FromStringAndSize(NULL, block_size);
-    if (b == NULL) {
+    buffer->writer = PyBytesWriter_Create(block_size);
+    if (buffer->writer == NULL) {
         return -1;
     }
 
-    // create the list
-    buffer->list = PyList_New(1);
-    if (buffer->list == NULL) {
-        Py_DECREF(b);
-        return -1;
-    }
-    PyList_SET_ITEM(buffer->list, 0, b);
-
     // set variables
     buffer->allocated = block_size;
     buffer->max_length = max_length;
+    buffer->num_blocks = 1;
 
-    *next_out = PyBytes_AS_STRING(b);
+    *next_out = PyBytesWriter_GetData(buffer->writer);
     return block_size;
 }
 
@@ -155,31 +148,21 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer 
*buffer,
                                  const Py_ssize_t init_size,
                                  void **next_out)
 {
-    PyObject *b;
 
-    // ensure .list was set to NULL
-    assert(buffer->list == NULL);
+    // ensure .writer was set to NULL
+    assert(buffer->writer == NULL);
 
-    // the first block
-    b = PyBytes_FromStringAndSize(NULL, init_size);
-    if (b == NULL) {
-        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
+    buffer->writer = PyBytesWriter_Create(init_size);
+    if (buffer->writer == NULL) {
         return -1;
     }
 
-    // create the list
-    buffer->list = PyList_New(1);
-    if (buffer->list == NULL) {
-        Py_DECREF(b);
-        return -1;
-    }
-    PyList_SET_ITEM(buffer->list, 0, b);
-
     // set variables
     buffer->allocated = init_size;
     buffer->max_length = -1;
+    buffer->num_blocks = 1;
 
-    *next_out = PyBytes_AS_STRING(b);
+    *next_out = PyBytesWriter_GetData(buffer->writer);
     return init_size;
 }
 
@@ -193,8 +176,6 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
                          void **next_out,
                          const Py_ssize_t avail_out)
 {
-    PyObject *b;
-    const Py_ssize_t list_len = Py_SIZE(buffer->list);
     Py_ssize_t block_size;
 
     // ensure no gaps in the data
@@ -205,11 +186,10 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
     }
 
     // get block size
-    if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) {
-        block_size = BUFFER_BLOCK_SIZE[list_len];
-    } else {
-        block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1];
-    }
+    size_t maxblock = Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE);
+    assert(maxblock >= 1);
+    size_t block_index = Py_MIN(buffer->num_blocks, maxblock - 1);
+    block_size = BUFFER_BLOCK_SIZE[block_index];
 
     // check max_length
     if (buffer->max_length >= 0) {
@@ -229,22 +209,19 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
         return -1;
     }
 
-    // create the block
-    b = PyBytes_FromStringAndSize(NULL, block_size);
-    if (b == NULL) {
+    if (PyBytesWriter_Grow(buffer->writer, block_size)) {
         PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
         return -1;
     }
-    if (PyList_Append(buffer->list, b) < 0) {
-        Py_DECREF(b);
-        return -1;
-    }
-    Py_DECREF(b);
+
+    Py_ssize_t current_size = buffer->allocated;
 
     // set variables
     buffer->allocated += block_size;
+    buffer->num_blocks += 1;
 
-    *next_out = PyBytes_AS_STRING(b);
+    char *data = PyBytesWriter_GetData(buffer->writer);
+    *next_out = data + current_size;
     return block_size;
 }
 
@@ -265,54 +242,17 @@ static inline PyObject *
 _BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer,
                            const Py_ssize_t avail_out)
 {
-    PyObject *result, *block;
-    const Py_ssize_t list_len = Py_SIZE(buffer->list);
-
-    // fast path for single block
-    if ((list_len == 1 && avail_out == 0) ||
-        (list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == 
avail_out))
-    {
-        block = PyList_GET_ITEM(buffer->list, 0);
-        Py_INCREF(block);
-
-        Py_CLEAR(buffer->list);
-        return block;
-    }
-
-    // final bytes object
-    result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out);
-    if (result == NULL) {
-        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
-        return NULL;
-    }
-
-    // memory copy
-    if (list_len > 0) {
-        char *posi = PyBytes_AS_STRING(result);
-
-        // blocks except the last one
-        Py_ssize_t i = 0;
-        for (; i < list_len-1; i++) {
-            block = PyList_GET_ITEM(buffer->list, i);
-            memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block));
-            posi += Py_SIZE(block);
-        }
-        // the last block
-        block = PyList_GET_ITEM(buffer->list, i);
-        memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out);
-    } else {
-        assert(Py_SIZE(result) == 0);
-    }
-
-    Py_CLEAR(buffer->list);
-    return result;
+    assert(buffer->writer != NULL);
+    return PyBytesWriter_FinishWithSize(buffer->writer,
+                                        buffer->allocated - avail_out);
 }
 
 /* Clean up the buffer when an error occurred. */
 static inline void
 _BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
 {
-    Py_CLEAR(buffer->list);
+    PyBytesWriter_Discard(buffer->writer);
+    buffer->writer = NULL;
 }
 
 #ifdef __cplusplus
diff --git a/Modules/_bz2module.c b/Modules/_bz2module.c
index 2e4cc43a2c3f11..9721b493a19956 100644
--- a/Modules/_bz2module.c
+++ b/Modules/_bz2module.c
@@ -190,7 +190,7 @@ static PyObject *
 compress(BZ2Compressor *c, char *data, size_t len, int action)
 {
     PyObject *result;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
 
     if (OutputBuffer_InitAndGrow(&buffer, -1, &c->bzs.next_out, 
&c->bzs.avail_out) < 0) {
         goto error;
@@ -429,7 +429,7 @@ decompress_buf(BZ2Decompressor *d, Py_ssize_t max_length)
        compare against max_length and PyBytes_GET_SIZE we declare it as
        signed */
     PyObject *result;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     bz_stream *bzs = &d->bzs;
 
     if (OutputBuffer_InitAndGrow(&buffer, max_length, &bzs->next_out, 
&bzs->avail_out) < 0) {
diff --git a/Modules/_lzmamodule.c b/Modules/_lzmamodule.c
index 3e8e37096ba6b4..6fc072f6d0a382 100644
--- a/Modules/_lzmamodule.c
+++ b/Modules/_lzmamodule.c
@@ -554,7 +554,7 @@ static PyObject *
 compress(Compressor *c, uint8_t *data, size_t len, lzma_action action)
 {
     PyObject *result;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     _lzma_state *state = PyType_GetModuleState(Py_TYPE(c));
     assert(state != NULL);
 
@@ -940,7 +940,7 @@ decompress_buf(Decompressor *d, Py_ssize_t max_length)
 {
     PyObject *result;
     lzma_stream *lzs = &d->lzs;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     _lzma_state *state = PyType_GetModuleState(Py_TYPE(d));
     assert(state != NULL);
 
diff --git a/Modules/_zstd/buffer.h b/Modules/_zstd/buffer.h
index 0ac7bcb4ddc416..807c72c80dde8b 100644
--- a/Modules/_zstd/buffer.h
+++ b/Modules/_zstd/buffer.h
@@ -16,8 +16,8 @@ static inline int
 _OutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob,
                         Py_ssize_t max_length)
 {
-    /* Ensure .list was set to NULL */
-    assert(buffer->list == NULL);
+    /* Ensure .writer was set to NULL */
+    assert(buffer->writer == NULL);
 
     Py_ssize_t res = _BlocksOutputBuffer_InitAndGrow(buffer, max_length,
                                                      &ob->dst);
@@ -39,8 +39,8 @@ _OutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, 
ZSTD_outBuffer *ob,
 {
     Py_ssize_t block_size;
 
-    /* Ensure .list was set to NULL */
-    assert(buffer->list == NULL);
+    /* Ensure .writer was set to NULL */
+    assert(buffer->writer == NULL);
 
     /* Get block size */
     if (0 <= max_length && max_length < init_size) {
diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c
index 029c07113d4f45..f90bc9c5ab58b1 100644
--- a/Modules/_zstd/compressor.c
+++ b/Modules/_zstd/compressor.c
@@ -446,7 +446,7 @@ compress_lock_held(ZstdCompressor *self, Py_buffer *data,
     assert(PyMutex_IsLocked(&self->lock));
     ZSTD_inBuffer in;
     ZSTD_outBuffer out;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     size_t zstd_ret;
     PyObject *ret;
 
@@ -527,7 +527,7 @@ compress_mt_continue_lock_held(ZstdCompressor *self, 
Py_buffer *data)
     assert(PyMutex_IsLocked(&self->lock));
     ZSTD_inBuffer in;
     ZSTD_outBuffer out;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     size_t zstd_ret;
     PyObject *ret;
 
diff --git a/Modules/_zstd/decompressor.c b/Modules/_zstd/decompressor.c
index 6592cad6690d49..13071b7a2bacf0 100644
--- a/Modules/_zstd/decompressor.c
+++ b/Modules/_zstd/decompressor.c
@@ -216,7 +216,7 @@ decompress_lock_held(ZstdDecompressor *self, ZSTD_inBuffer 
*in,
 {
     size_t zstd_ret;
     ZSTD_outBuffer out;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     PyObject *ret;
 
     /* Initialize the output buffer */
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index f1312e687da71c..36c933bf618af0 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -344,7 +344,7 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int 
level, int wbits)
     PyObject *return_value;
     int flush;
     z_stream zst;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
 
     zlibstate *state = get_zlib_state(module);
 
@@ -445,7 +445,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int 
wbits,
     Py_ssize_t ibuflen;
     int err, flush;
     z_stream zst;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     _Uint32Window window;  // output buffer's UINT32_MAX sliding window
 
     zlibstate *state = get_zlib_state(module);
@@ -774,7 +774,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject 
*cls,
 {
     PyObject *return_value;
     int err;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     zlibstate *state = PyType_GetModuleState(cls);
 
     ENTER_ZLIB(self);
@@ -898,7 +898,7 @@ zlib_Decompress_decompress_impl(compobject *self, 
PyTypeObject *cls,
     int err = Z_OK;
     Py_ssize_t ibuflen;
     PyObject *return_value;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
 
     PyObject *module = PyType_GetModule(cls);
     if (module == NULL)
@@ -1005,7 +1005,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject 
*cls, int mode)
 {
     int err;
     PyObject *return_value;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
 
     zlibstate *state = PyType_GetModuleState(cls);
     /* Flushing with Z_NO_FLUSH is a no-op, so there's no point in
@@ -1267,7 +1267,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject 
*cls,
     Py_buffer data;
     PyObject *return_value;
     Py_ssize_t ibuflen;
-    _BlocksOutputBuffer buffer = {.list = NULL};
+    _BlocksOutputBuffer buffer = {.writer = NULL};
     _Uint32Window window;  // output buffer's UINT32_MAX sliding window
 
     PyObject *module = PyType_GetModule(cls);

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

Reply via email to