This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 66d9a30 ARROW-3514: [C++] Work around insufficient output size
estimate on old zlibs
66d9a30 is described below
commit 66d9a30a26e1659d9e992037339515e59a6ae518
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 16 13:32:29 2018 +0200
ARROW-3514: [C++] Work around insufficient output size estimate on old zlibs
With a manylinux1 zlib (1.2.3.x), one could get the following error when
writing a Parquet table with gzip compression:
"zlib deflate failed, output buffer too small"
Author: Antoine Pitrou <[email protected]>
Closes #2771 from pitrou/ARROW-3514-zlib-compression-bug and squashes the
following commits:
5b607327 <Antoine Pitrou> ARROW-3514: Work around insufficient output size
estimate on old zlibs
---
cpp/src/arrow/util/compression_zlib.cc | 10 ++++++----
python/manylinux1/build_arrow.sh | 1 -
python/pyarrow/tests/test_parquet.py | 13 +++++++++++++
3 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/util/compression_zlib.cc
b/cpp/src/arrow/util/compression_zlib.cc
index cb3baff..9fe163e 100644
--- a/cpp/src/arrow/util/compression_zlib.cc
+++ b/cpp/src/arrow/util/compression_zlib.cc
@@ -438,13 +438,15 @@ class GZipCodec::GZipCodecImpl {
}
int64_t MaxCompressedLen(int64_t input_length, const uint8_t*
ARROW_ARG_UNUSED(input)) {
- // Most be in compression mode
+ // Must be in compression mode
if (!compressor_initialized_) {
Status s = InitCompressor();
DCHECK(s.ok());
}
- // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
- return deflateBound(&stream_, static_cast<uLong>(input_length));
+ int64_t max_len = deflateBound(&stream_, static_cast<uLong>(input_length));
+ // ARROW-3514: return a more pessimistic estimate to account for bugs
+ // in old zlib versions.
+ return max_len + 12;
}
Status Compress(int64_t input_length, const uint8_t* input, int64_t
output_buffer_len,
@@ -460,7 +462,7 @@ class GZipCodec::GZipCodecImpl {
int64_t ret = 0;
if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
if (ret == Z_OK) {
- // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
+ // Will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
// small
return Status::IOError("zlib deflate failed, output buffer too small");
}
diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh
index 8c37952..d99f072 100755
--- a/python/manylinux1/build_arrow.sh
+++ b/python/manylinux1/build_arrow.sh
@@ -101,7 +101,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do
echo "=== (${PYTHON}) Building wheel ==="
PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py build_ext \
--inplace \
- --with-parquet \
--bundle-arrow-cpp \
--bundle-boost \
--boost-namespace=arrow_boost
diff --git a/python/pyarrow/tests/test_parquet.py
b/python/pyarrow/tests/test_parquet.py
index f3391ce..78677a0 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2225,3 +2225,16 @@ def
test_parquet_writer_context_obj_with_exception(tempdir):
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
+
+
+def test_zlib_compression_bug():
+ # ARROW-3514: "zlib deflate failed, output buffer too small"
+ import pyarrow.parquet as pq
+
+ table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
+ f = io.BytesIO()
+ pq.write_table(table, f, compression='gzip')
+
+ f.seek(0)
+ roundtrip = pq.read_table(f)
+ tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())