This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 5e1a4fd8a4 GH-40767: [C++][Parquet] Simplify PageWriter and 
ColumnWriter creation (#40768)
5e1a4fd8a4 is described below

commit 5e1a4fd8a4ed3630c9549c611222d2d6c32357ca
Author: mwish <[email protected]>
AuthorDate: Tue Mar 26 20:38:58 2024 +0800

    GH-40767: [C++][Parquet] Simplify PageWriter and ColumnWriter creation 
(#40768)
    
    
    
    ### Rationale for this change
    
    Enhance style for creating page writer.
    
    ### What changes are included in this PR?
    
    * create PageWriter with one calling
    * extract `column_properties`
    
    ### Are these changes tested?
    
    no need
    
    ### Are there any user-facing changes?
    
    no
    
    * GitHub Issue: #40767
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/file_writer.cc | 101 +++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 65 deletions(-)

diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 6f5610b934..baa9e00da2 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -139,39 +139,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents 
{
     }
 
     const int32_t column_ordinal = next_column_index_++;
-    const auto& path = col_meta->descr()->path();
-    auto meta_encryptor =
-        file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
-                        : nullptr;
-    auto data_encryptor =
-        file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
-                        : nullptr;
-    auto ci_builder = page_index_builder_ && 
properties_->page_index_enabled(path) &&
-                              properties_->statistics_enabled(path)
-                          ? 
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
-                          : nullptr;
-    auto oi_builder = page_index_builder_ && 
properties_->page_index_enabled(path)
-                          ? 
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
-                          : nullptr;
-    auto codec_options = properties_->codec_options(path)
-                             ? properties_->codec_options(path).get()
-                             : nullptr;
-
-    std::unique_ptr<PageWriter> pager;
-    if (!codec_options) {
-      pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
-                               row_group_ordinal_, 
static_cast<int16_t>(column_ordinal),
-                               properties_->memory_pool(), false, 
meta_encryptor,
-                               data_encryptor, 
properties_->page_checksum_enabled(),
-                               ci_builder, oi_builder, CodecOptions());
-    } else {
-      pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
-                               row_group_ordinal_, 
static_cast<int16_t>(column_ordinal),
-                               properties_->memory_pool(), false, 
meta_encryptor,
-                               data_encryptor, 
properties_->page_checksum_enabled(),
-                               ci_builder, oi_builder, *codec_options);
-    }
-    column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), 
properties_);
+    column_writers_[0] = CreateColumnWriterForColumn(col_meta, column_ordinal);
     return column_writers_[0].get();
   }
 
@@ -288,45 +256,48 @@ class RowGroupSerializer : public 
RowGroupWriter::Contents {
   }
 
   void InitColumns() {
-    for (int i = 0; i < num_columns(); i++) {
+    for (int i = 0; i < RowGroupSerializer::num_columns(); i++) {
       auto col_meta = metadata_->NextColumnChunk();
-      const auto& path = col_meta->descr()->path();
       const int32_t column_ordinal = next_column_index_++;
-      auto meta_encryptor =
-          file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+      column_writers_.push_back(CreateColumnWriterForColumn(col_meta, 
column_ordinal));
+    }
+  }
+
+  std::shared_ptr<ColumnWriter> CreateColumnWriterForColumn(
+      ColumnChunkMetaDataBuilder* col_meta, int32_t column_ordinal) const {
+    const auto& path = col_meta->descr()->path();
+    const ColumnProperties& column_properties = 
properties_->column_properties(path);
+    auto meta_encryptor =
+        file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+                        : nullptr;
+    auto data_encryptor =
+        file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+                        : nullptr;
+    auto ci_builder = page_index_builder_ && 
column_properties.page_index_enabled()
+                          ? 
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
                           : nullptr;
-      auto data_encryptor =
-          file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+    auto oi_builder = page_index_builder_ && 
column_properties.page_index_enabled()
+                          ? 
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
                           : nullptr;
-      auto ci_builder = page_index_builder_ && 
properties_->page_index_enabled(path)
-                            ? 
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
-                            : nullptr;
-      auto oi_builder = page_index_builder_ && 
properties_->page_index_enabled(path)
-                            ? 
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
-                            : nullptr;
-      auto codec_options = properties_->codec_options(path)
-                               ? (properties_->codec_options(path)).get()
-                               : nullptr;
-
-      std::unique_ptr<PageWriter> pager;
-      if (!codec_options) {
-        pager = PageWriter::Open(
-            sink_, properties_->compression(path), col_meta, 
row_group_ordinal_,
-            static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
-            buffered_row_group_, meta_encryptor, data_encryptor,
-            properties_->page_checksum_enabled(), ci_builder, oi_builder, 
CodecOptions());
-      } else {
-        pager = PageWriter::Open(
-            sink_, properties_->compression(path), col_meta, 
row_group_ordinal_,
-            static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
-            buffered_row_group_, meta_encryptor, data_encryptor,
-            properties_->page_checksum_enabled(), ci_builder, oi_builder, 
*codec_options);
-      }
-      column_writers_.push_back(
-          ColumnWriter::Make(col_meta, std::move(pager), properties_));
+
+    const CodecOptions* codec_options = column_properties.codec_options()
+                                            ? 
column_properties.codec_options().get()
+                                            : nullptr;
+    CodecOptions default_codec_options;
+    if (!codec_options) {
+      codec_options = &default_codec_options;
     }
+    DCHECK_NE(nullptr, codec_options);
+    std::unique_ptr<PageWriter> pager = PageWriter::Open(
+        sink_, column_properties.compression(), col_meta, row_group_ordinal_,
+        static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
+        buffered_row_group_, meta_encryptor, data_encryptor,
+        properties_->page_checksum_enabled(), ci_builder, oi_builder, 
*codec_options);
+    return ColumnWriter::Make(col_meta, std::move(pager), properties_);
   }
 
+  // If buffered_row_group_ is false, only column_writers_[0] is used as 
current writer.
+  // If buffered_row_group_ is true, multiple column writers are used.
   std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
 };
 

Reply via email to