This is an automated email from the ASF dual-hosted git repository.

wgtmac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 136b4683 fix(parquet): check compression codec availability (#656)
136b4683 is described below

commit 136b4683f70bb48913bf236644100485e964356d
Author: Gang Wu <[email protected]>
AuthorDate: Thu May 21 16:19:29 2026 +0800

    fix(parquet): check compression codec availability (#656)
    
    ## Summary
    - Check Parquet compression codec availability with Arrow before opening
    the file writer.
    - Return a clear `InvalidArgument` when a requested Parquet codec is not
    built into Arrow.
    - Add a regression test that exercises an unavailable codec in the
    current Arrow build.
    
    ## Test Plan
    - `cmake --build build --target parquet_test data_test && ctest
    --test-dir build -R 'parquet_test|data_test' --output-on-failure`
    - `git diff --check`
---
 src/iceberg/parquet/parquet_writer.cc | 13 ++++++++++
 src/iceberg/test/parquet_test.cc      | 48 +++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc
index 7e2d3d15..c70d3310 100644
--- a/src/iceberg/parquet/parquet_writer.cc
+++ b/src/iceberg/parquet/parquet_writer.cc
@@ -20,9 +20,11 @@
 #include "iceberg/parquet/parquet_writer.h"
 
 #include <memory>
+#include <string_view>
 
 #include <arrow/c/bridge.h>
 #include <arrow/record_batch.h>
+#include <arrow/util/compression.h>
 #include <arrow/util/key_value_metadata.h>
 #include <parquet/arrow/schema.h>
 #include <parquet/arrow/writer.h>
@@ -62,6 +64,14 @@ Result<::arrow::Compression::type> ParseCompression(const WriterProperties& prop
   }
 }
 
+Status CheckCompressionAvailable(std::string_view compression_name,  // Fails the precheck unless Arrow reports the codec as available; the name only feeds the error message.
+                                 ::arrow::Compression::type compression) {  // Arrow codec enum value queried through Codec::IsAvailable.
+  ICEBERG_PRECHECK(::arrow::util::Codec::IsAvailable(compression),  // Per the commit summary, a failed check surfaces as InvalidArgument.
+                   "Parquet compression codec {} is not available in the 
current build",
+                   compression_name);
+  return {};
+}
+
 Result<std::optional<int32_t>> ParseCodecLevel(const WriterProperties& 
properties) {
   auto level_str = properties.Get(WriterProperties::kParquetCompressionLevel);
   if (level_str.empty()) {
@@ -98,6 +108,9 @@ class ParquetWriter::Impl {
     auto schema_node = std::static_pointer_cast<::parquet::schema::GroupNode>(
         schema_descriptor->schema_root());
 
+    ICEBERG_RETURN_UNEXPECTED(CheckCompressionAvailable(
+        options.properties.Get(WriterProperties::kParquetCompression), 
compression));
+
     ICEBERG_ASSIGN_OR_RAISE(output_stream_, OpenOutputStream(options));
     auto file_writer = ::parquet::ParquetFileWriter::Open(
         output_stream_, std::move(schema_node), std::move(writer_properties),
diff --git a/src/iceberg/test/parquet_test.cc b/src/iceberg/test/parquet_test.cc
index 65a4602d..70fb9880 100644
--- a/src/iceberg/test/parquet_test.cc
+++ b/src/iceberg/test/parquet_test.cc
@@ -18,6 +18,9 @@
  */
 
 #include <optional>
+#include <string>
+#include <utility>
+#include <vector>
 
 #include <arrow/array.h>
 #include <arrow/c/bridge.h>
@@ -26,6 +29,7 @@
 #include <arrow/record_batch.h>
 #include <arrow/table.h>
 #include <arrow/type.h>
+#include <arrow/util/compression.h>
 #include <arrow/util/key_value_metadata.h>
 #include <parquet/arrow/reader.h>
 #include <parquet/arrow/writer.h>
@@ -124,6 +128,27 @@ void DoRoundtrip(std::shared_ptr<::arrow::Array> data, std::shared_ptr<Schema> s
   ASSERT_TRUE(out != nullptr) << "Reader.Next() returned no data";
 }
 
+struct ParquetCodec {  // Pairs a codec name string with the matching Arrow compression enum value.
+  std::string name;  // Name the test passes as the kParquetCompression writer property.
+  ::arrow::Compression::type compression;  // Enum value handed to Codec::IsAvailable.
+};
+
+std::optional<ParquetCodec> FirstUnavailableParquetCodec() {  // Returns the first listed codec that Arrow reports as unavailable, or nullopt when all are available.
+  const std::vector<ParquetCodec> codecs = {  // Candidates are probed in the order listed here.
+      {.name = "snappy", .compression = ::arrow::Compression::SNAPPY},
+      {.name = "gzip", .compression = ::arrow::Compression::GZIP},
+      {.name = "brotli", .compression = ::arrow::Compression::BROTLI},
+      {.name = "lz4", .compression = ::arrow::Compression::LZ4},
+      {.name = "zstd", .compression = ::arrow::Compression::ZSTD},
+  };
+  for (const auto& codec : codecs) {
+    if (!::arrow::util::Codec::IsAvailable(codec.compression)) {  // Stop at the first codec Arrow reports as unavailable.
+      return codec;
+    }
+  }
+  return std::nullopt;  // Every listed codec is available.
+}
+
 }  // namespace
 
 class ParquetReaderTest : public TempFileTestBase {
@@ -461,6 +486,29 @@ TEST_F(ParquetReadWrite, EmptyStruct) {
               IsError(ErrorKind::kNotImplemented));
 }
 
+TEST_F(ParquetReadWrite, RejectsUnavailableCompressionCodec) {  // Opening a Parquet writer with an unavailable codec is expected to fail with InvalidArgument.
+  auto unavailable_codec = FirstUnavailableParquetCodec();
+  if (!unavailable_codec.has_value()) {  // Nothing to exercise when every probed codec is available.
+    GTEST_SKIP() << "All optional Parquet compression codecs are available";
+  }
+
+  auto schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int32())});
+  WriterProperties writer_properties;
+  writer_properties.Set(WriterProperties::kParquetCompression, 
unavailable_codec->name);
+
+  auto writer = WriterFactoryRegistry::Open(  // The result is inspected for an error instead of being used to write data.
+      FileFormatType::kParquet, {.path = "unavailable_codec.parquet",
+                                 .schema = schema,
+                                 .io = 
arrow::ArrowFileSystemFileIO::MakeMockFileIO(),
+                                 .properties = std::move(writer_properties)});
+
+  EXPECT_THAT(writer, IsError(ErrorKind::kInvalidArgument));  // Checks the error kind.
+  EXPECT_THAT(writer,  // Checks that the message names the requested codec.
+              HasErrorMessage("Parquet compression codec " + 
unavailable_codec->name +
+                              " is not available in the current build"));
+}
+
 TEST_F(ParquetReadWrite, SimpleStructRoundTrip) {
   auto schema = std::make_shared<Schema>(std::vector<SchemaField>{
       SchemaField::MakeOptional(1, "a",

Reply via email to