This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 5e1a4fd8a4 GH-40767: [C++][Parquet] Simplify PageWriter and
ColumnWriter creation (#40768)
5e1a4fd8a4 is described below
commit 5e1a4fd8a4ed3630c9549c611222d2d6c32357ca
Author: mwish <[email protected]>
AuthorDate: Tue Mar 26 20:38:58 2024 +0800
GH-40767: [C++][Parquet] Simplify PageWriter and ColumnWriter creation
(#40768)
### Rationale for this change
Enhance style for creating page writer.
### What changes are included in this PR?
* create PageWriter with one calling
* extract `column_properties`
### Are these changes tested?
no need
### Are there any user-facing changes?
no
* GitHub Issue: #40767
Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/file_writer.cc | 101 +++++++++++++++--------------------------
1 file changed, 36 insertions(+), 65 deletions(-)
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 6f5610b934..baa9e00da2 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -139,39 +139,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents
{
}
const int32_t column_ordinal = next_column_index_++;
- const auto& path = col_meta->descr()->path();
- auto meta_encryptor =
- file_encryptor_ ?
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
- : nullptr;
- auto data_encryptor =
- file_encryptor_ ?
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
- : nullptr;
- auto ci_builder = page_index_builder_ &&
properties_->page_index_enabled(path) &&
- properties_->statistics_enabled(path)
- ?
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
- : nullptr;
- auto oi_builder = page_index_builder_ &&
properties_->page_index_enabled(path)
- ?
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
- : nullptr;
- auto codec_options = properties_->codec_options(path)
- ? properties_->codec_options(path).get()
- : nullptr;
-
- std::unique_ptr<PageWriter> pager;
- if (!codec_options) {
- pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
- row_group_ordinal_,
static_cast<int16_t>(column_ordinal),
- properties_->memory_pool(), false,
meta_encryptor,
- data_encryptor,
properties_->page_checksum_enabled(),
- ci_builder, oi_builder, CodecOptions());
- } else {
- pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
- row_group_ordinal_,
static_cast<int16_t>(column_ordinal),
- properties_->memory_pool(), false,
meta_encryptor,
- data_encryptor,
properties_->page_checksum_enabled(),
- ci_builder, oi_builder, *codec_options);
- }
- column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager),
properties_);
+ column_writers_[0] = CreateColumnWriterForColumn(col_meta, column_ordinal);
return column_writers_[0].get();
}
@@ -288,45 +256,48 @@ class RowGroupSerializer : public
RowGroupWriter::Contents {
}
void InitColumns() {
- for (int i = 0; i < num_columns(); i++) {
+ for (int i = 0; i < RowGroupSerializer::num_columns(); i++) {
auto col_meta = metadata_->NextColumnChunk();
- const auto& path = col_meta->descr()->path();
const int32_t column_ordinal = next_column_index_++;
- auto meta_encryptor =
- file_encryptor_ ?
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ column_writers_.push_back(CreateColumnWriterForColumn(col_meta,
column_ordinal));
+ }
+ }
+
+ std::shared_ptr<ColumnWriter> CreateColumnWriterForColumn(
+ ColumnChunkMetaDataBuilder* col_meta, int32_t column_ordinal) const {
+ const auto& path = col_meta->descr()->path();
+ const ColumnProperties& column_properties =
properties_->column_properties(path);
+ auto meta_encryptor =
+ file_encryptor_ ?
file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ?
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ auto ci_builder = page_index_builder_ &&
column_properties.page_index_enabled()
+ ?
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
: nullptr;
- auto data_encryptor =
- file_encryptor_ ?
file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ auto oi_builder = page_index_builder_ &&
column_properties.page_index_enabled()
+ ?
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
: nullptr;
- auto ci_builder = page_index_builder_ &&
properties_->page_index_enabled(path)
- ?
page_index_builder_->GetColumnIndexBuilder(column_ordinal)
- : nullptr;
- auto oi_builder = page_index_builder_ &&
properties_->page_index_enabled(path)
- ?
page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
- : nullptr;
- auto codec_options = properties_->codec_options(path)
- ? (properties_->codec_options(path)).get()
- : nullptr;
-
- std::unique_ptr<PageWriter> pager;
- if (!codec_options) {
- pager = PageWriter::Open(
- sink_, properties_->compression(path), col_meta,
row_group_ordinal_,
- static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
- buffered_row_group_, meta_encryptor, data_encryptor,
- properties_->page_checksum_enabled(), ci_builder, oi_builder,
CodecOptions());
- } else {
- pager = PageWriter::Open(
- sink_, properties_->compression(path), col_meta,
row_group_ordinal_,
- static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
- buffered_row_group_, meta_encryptor, data_encryptor,
- properties_->page_checksum_enabled(), ci_builder, oi_builder,
*codec_options);
- }
- column_writers_.push_back(
- ColumnWriter::Make(col_meta, std::move(pager), properties_));
+
+ const CodecOptions* codec_options = column_properties.codec_options()
+ ?
column_properties.codec_options().get()
+ : nullptr;
+ CodecOptions default_codec_options;
+ if (!codec_options) {
+ codec_options = &default_codec_options;
}
+ DCHECK_NE(nullptr, codec_options);
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, column_properties.compression(), col_meta, row_group_ordinal_,
+ static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
+ buffered_row_group_, meta_encryptor, data_encryptor,
+ properties_->page_checksum_enabled(), ci_builder, oi_builder,
*codec_options);
+ return ColumnWriter::Make(col_meta, std::move(pager), properties_);
}
+ // If buffered_row_group_ is false, only column_writers_[0] is used as
current writer.
+ // If buffered_row_group_ is true, multiple column writers are used.
std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
};