This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1fcc89240d GH-45227: [C++][Parquet] Enable Size Stats and Page Index by default (#45249)
1fcc89240d is described below

commit 1fcc89240db4fe0ad798498e7410668423846118
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 17:34:44 2025 +0800

    GH-45227: [C++][Parquet] Enable Size Stats and Page Index by default (#45249)
    
    ### Rationale for this change
    
    Benchmark data shows that enabling page index and size stats by default does not incur a significant penalty.
    
    ### What changes are included in this PR?
    
    Enable the Parquet writer to generate page index and size stats by default.
    
    ### Are these changes tested?
    
    Pass CIs.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #45227
    
    Authored-by: Gang Wu <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc |  9 +++-
 cpp/src/parquet/arrow/size_stats_benchmark.cc     | 56 ++++++++++++++---------
 cpp/src/parquet/properties.h                      |  6 ++-
 3 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index cedcebbfb6..47a00016b9 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4371,6 +4371,7 @@ TEST_P(TestArrowWriteDictionary, Statistics) {
             ->data_page_version(this->GetParquetDataPageVersion())
             ->write_batch_size(2)
             ->data_pagesize(2)
+            ->disable_write_page_index()
             ->build();
     std::unique_ptr<FileWriter> writer;
     ASSERT_OK_AND_ASSIGN(
@@ -4476,6 +4477,7 @@ TEST_P(TestArrowWriteDictionary, 
StatisticsUnifiedDictionary) {
             ->data_page_version(this->GetParquetDataPageVersion())
             ->write_batch_size(3)
             ->data_pagesize(3)
+            ->disable_write_page_index()
             ->build();
     std::unique_ptr<FileWriter> writer;
     ASSERT_OK_AND_ASSIGN(
@@ -5290,7 +5292,10 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
   auto pool = ::arrow::default_memory_pool();
   auto sink = CreateOutputStream();
   // Limit the max number of rows in a row group to 10
-  auto writer_properties = 
WriterProperties::Builder().max_row_group_length(10)->build();
+  auto writer_properties = WriterProperties::Builder()
+                               .max_row_group_length(10)
+                               ->disable_write_page_index()
+                               ->build();
   auto arrow_writer_properties = default_arrow_writer_properties();
 
   // Prepare schema
@@ -5346,7 +5351,7 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
   ASSERT_EQ(10, file_metadata->RowGroup(0)->num_rows());
   ASSERT_EQ(2, file_metadata->RowGroup(1)->num_rows());
 
-  // Verify that page index is not written by default.
+  // Verify that page index is not written.
   for (int i = 0; i < num_row_groups; ++i) {
     auto row_group_metadata = file_metadata->RowGroup(i);
     for (int j = 0; j < row_group_metadata->num_columns(); ++j) {
diff --git a/cpp/src/parquet/arrow/size_stats_benchmark.cc 
b/cpp/src/parquet/arrow/size_stats_benchmark.cc
index d43a3737b1..c5c95fc614 100644
--- a/cpp/src/parquet/arrow/size_stats_benchmark.cc
+++ b/cpp/src/parquet/arrow/size_stats_benchmark.cc
@@ -80,12 +80,16 @@ int64_t GetTotalPageIndexSize(const 
std::shared_ptr<::parquet::FileMetaData>& me
 }
 
 void WriteColumn(::benchmark::State& state, const 
std::shared_ptr<::arrow::Table>& table,
-                 SizeStatisticsLevel stats_level) {
+                 SizeStatisticsLevel stats_level, bool enable_page_index) {
   // Use the fastest possible encoding and compression settings, to better 
exhibit
   // the size statistics overhead.
-  auto properties = WriterProperties::Builder()
-                        .enable_statistics()
-                        ->enable_write_page_index()
+  auto builder = WriterProperties::Builder();
+  if (enable_page_index) {
+    builder.enable_write_page_index();
+  } else {
+    builder.disable_write_page_index();
+  }
+  auto properties = builder.enable_statistics()
                         ->disable_dictionary()
                         ->encoding(Encoding::PLAIN)
                         ->set_size_statistics_level(stats_level)
@@ -113,17 +117,17 @@ void WriteColumn(::benchmark::State& state, const 
std::shared_ptr<::arrow::Table
   state.SetBytesProcessed(state.iterations() * GetTotalBytes(table));
 }
 
-template <SizeStatisticsLevel level, typename ArrowType>
+template <SizeStatisticsLevel level, typename ArrowType, bool 
enable_page_index>
 void BM_WritePrimitiveColumn(::benchmark::State& state) {
   ::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
   auto type = std::make_shared<ArrowType>();
   auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability);
   auto table = ::arrow::Table::Make(
       ::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}), 
{array});
-  WriteColumn(state, table, level);
+  WriteColumn(state, table, level, enable_page_index);
 }
 
-template <SizeStatisticsLevel level, typename ArrowType>
+template <SizeStatisticsLevel level, typename ArrowType, bool 
enable_page_index>
 void BM_WriteListColumn(::benchmark::State& state) {
   ::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
   auto element_type = std::make_shared<ArrowType>();
@@ -133,33 +137,43 @@ void BM_WriteListColumn(::benchmark::State& state) {
   auto table = ::arrow::Table::Make(
       ::arrow::schema({::arrow::field("column", list_type, kNullProbability > 
0)}),
       {list_array});
-  WriteColumn(state, table, level);
+  WriteColumn(state, table, level, enable_page_index);
 }
 
-BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
-                   ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, 
::arrow::Int64Type,
+                   /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, 
::arrow::Int64Type,
+                   /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
-                   ::arrow::Int64Type);
+                   ::arrow::Int64Type, /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, 
SizeStatisticsLevel::PageAndColumnChunk,
-                   ::arrow::Int64Type);
+                   ::arrow::Int64Type, /*enable_page_index=*/true);
 
 BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
-                   ::arrow::StringType);
+                   ::arrow::StringType, /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
+                   ::arrow::StringType, /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
-                   ::arrow::StringType);
+                   ::arrow::StringType, /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, 
SizeStatisticsLevel::PageAndColumnChunk,
-                   ::arrow::StringType);
+                   ::arrow::StringType, /*enable_page_index=*/true);
 
-BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::Int64Type,
+                   /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::Int64Type,
+                   /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
-                   ::arrow::Int64Type);
+                   ::arrow::Int64Type, /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
-                   ::arrow::Int64Type);
+                   ::arrow::Int64Type, /*enable_page_index=*/true);
 
-BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::StringType,
+                   /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, 
::arrow::StringType,
+                   /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
-                   ::arrow::StringType);
+                   ::arrow::StringType, /*enable_page_index=*/true);
 BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
-                   ::arrow::StringType);
+                   ::arrow::StringType, /*enable_page_index=*/true);
 
 }  // namespace parquet::benchmark
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index edaf28cd92..8ae3660014 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -164,7 +164,9 @@ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
 static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = 
Compression::UNCOMPRESSED;
-static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
+static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
+static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
+    SizeStatisticsLevel::PageAndColumnChunk;
 
 class PARQUET_EXPORT ColumnProperties {
  public:
@@ -258,7 +260,7 @@ class PARQUET_EXPORT WriterProperties {
           created_by_(DEFAULT_CREATED_BY),
           store_decimal_as_integer_(false),
           page_checksum_enabled_(false),
-          size_statistics_level_(SizeStatisticsLevel::None) {}
+          size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {}
 
     explicit Builder(const WriterProperties& properties)
         : pool_(properties.memory_pool()),

Reply via email to