This is an automated email from the ASF dual-hosted git repository.
wgtmac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 136b4683 fix(parquet): check compression codec availability (#656)
136b4683 is described below
commit 136b4683f70bb48913bf236644100485e964356d
Author: Gang Wu <[email protected]>
AuthorDate: Thu May 21 16:19:29 2026 +0800
fix(parquet): check compression codec availability (#656)
## Summary
- Check Parquet compression codec availability with Arrow before opening
the file writer.
- Return a clear `InvalidArgument` when a requested Parquet codec is not
built into Arrow.
- Add a regression test that exercises an unavailable codec in the
current Arrow build.
## Test Plan
- `cmake --build build --target parquet_test data_test && ctest --test-dir build -R 'parquet_test|data_test' --output-on-failure`
- `git diff --check`
---
src/iceberg/parquet/parquet_writer.cc | 13 ++++++++++
src/iceberg/test/parquet_test.cc | 48 +++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc
index 7e2d3d15..c70d3310 100644
--- a/src/iceberg/parquet/parquet_writer.cc
+++ b/src/iceberg/parquet/parquet_writer.cc
@@ -20,9 +20,11 @@
#include "iceberg/parquet/parquet_writer.h"
#include <memory>
+#include <string_view>
#include <arrow/c/bridge.h>
#include <arrow/record_batch.h>
+#include <arrow/util/compression.h>
#include <arrow/util/key_value_metadata.h>
#include <parquet/arrow/schema.h>
#include <parquet/arrow/writer.h>
@@ -62,6 +64,14 @@ Result<::arrow::Compression::type> ParseCompression(const WriterProperties& prop
}
}
+Status CheckCompressionAvailable(std::string_view compression_name,
+ ::arrow::Compression::type compression) {
+ ICEBERG_PRECHECK(::arrow::util::Codec::IsAvailable(compression),
+ "Parquet compression codec {} is not available in the current build",
+ compression_name);
+ return {};
+}
+
Result<std::optional<int32_t>> ParseCodecLevel(const WriterProperties& properties) {
auto level_str = properties.Get(WriterProperties::kParquetCompressionLevel);
if (level_str.empty()) {
@@ -98,6 +108,9 @@ class ParquetWriter::Impl {
auto schema_node = std::static_pointer_cast<::parquet::schema::GroupNode>(
schema_descriptor->schema_root());
+ ICEBERG_RETURN_UNEXPECTED(CheckCompressionAvailable(
+ options.properties.Get(WriterProperties::kParquetCompression), compression));
+
ICEBERG_ASSIGN_OR_RAISE(output_stream_, OpenOutputStream(options));
auto file_writer = ::parquet::ParquetFileWriter::Open(
output_stream_, std::move(schema_node), std::move(writer_properties),
diff --git a/src/iceberg/test/parquet_test.cc b/src/iceberg/test/parquet_test.cc
index 65a4602d..70fb9880 100644
--- a/src/iceberg/test/parquet_test.cc
+++ b/src/iceberg/test/parquet_test.cc
@@ -18,6 +18,9 @@
*/
#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
#include <arrow/array.h>
#include <arrow/c/bridge.h>
@@ -26,6 +29,7 @@
#include <arrow/record_batch.h>
#include <arrow/table.h>
#include <arrow/type.h>
+#include <arrow/util/compression.h>
#include <arrow/util/key_value_metadata.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
@@ -124,6 +128,27 @@ void DoRoundtrip(std::shared_ptr<::arrow::Array> data, std::shared_ptr<Schema> s
ASSERT_TRUE(out != nullptr) << "Reader.Next() returned no data";
}
+struct ParquetCodec {
+ std::string name;
+ ::arrow::Compression::type compression;
+};
+
+std::optional<ParquetCodec> FirstUnavailableParquetCodec() {
+ const std::vector<ParquetCodec> codecs = {
+ {.name = "snappy", .compression = ::arrow::Compression::SNAPPY},
+ {.name = "gzip", .compression = ::arrow::Compression::GZIP},
+ {.name = "brotli", .compression = ::arrow::Compression::BROTLI},
+ {.name = "lz4", .compression = ::arrow::Compression::LZ4},
+ {.name = "zstd", .compression = ::arrow::Compression::ZSTD},
+ };
+ for (const auto& codec : codecs) {
+ if (!::arrow::util::Codec::IsAvailable(codec.compression)) {
+ return codec;
+ }
+ }
+ return std::nullopt;
+}
+
} // namespace
class ParquetReaderTest : public TempFileTestBase {
@@ -461,6 +486,29 @@ TEST_F(ParquetReadWrite, EmptyStruct) {
IsError(ErrorKind::kNotImplemented));
}
+TEST_F(ParquetReadWrite, RejectsUnavailableCompressionCodec) {
+ auto unavailable_codec = FirstUnavailableParquetCodec();
+ if (!unavailable_codec.has_value()) {
+ GTEST_SKIP() << "All optional Parquet compression codecs are available";
+ }
+
+ auto schema = std::make_shared<Schema>(
+ std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int32())});
+ WriterProperties writer_properties;
+ writer_properties.Set(WriterProperties::kParquetCompression, unavailable_codec->name);
+
+ auto writer = WriterFactoryRegistry::Open(
+ FileFormatType::kParquet, {.path = "unavailable_codec.parquet",
+ .schema = schema,
+ .io = arrow::ArrowFileSystemFileIO::MakeMockFileIO(),
+ .properties = std::move(writer_properties)});
+
+ EXPECT_THAT(writer, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(writer,
+ HasErrorMessage("Parquet compression codec " + unavailable_codec->name +
+ " is not available in the current build"));
+}
+
TEST_F(ParquetReadWrite, SimpleStructRoundTrip) {
auto schema = std::make_shared<Schema>(std::vector<SchemaField>{
SchemaField::MakeOptional(1, "a",