kszucs commented on code in PR #45360:
URL: https://github.com/apache/arrow/pull/45360#discussion_r2083105816
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1337,13 +1368,47 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
bits_buffer_->ZeroPadding();
}
- if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
- return WriteArrowDictionary(def_levels, rep_levels, num_levels,
leaf_array, ctx,
- maybe_parent_nulls);
+ if (properties_->content_defined_chunking_enabled()) {
+ DCHECK(content_defined_chunker_.has_value());
+ auto chunks = content_defined_chunker_->GetChunks(def_levels, rep_levels,
+ num_levels,
leaf_array);
+ for (size_t i = 0; i < chunks.size(); i++) {
+ auto chunk = chunks[i];
+ auto chunk_array = leaf_array.Slice(chunk.value_offset);
+ auto chunk_def_levels = AddIfNotNull(def_levels, chunk.level_offset);
+ auto chunk_rep_levels = AddIfNotNull(rep_levels, chunk.level_offset);
+ if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
+ ARROW_CHECK_OK(WriteArrowDictionary(chunk_def_levels,
chunk_rep_levels,
+ chunk.levels_to_write,
*chunk_array, ctx,
+ maybe_parent_nulls));
+ } else {
+ ARROW_CHECK_OK(WriteArrowDense(chunk_def_levels, chunk_rep_levels,
+ chunk.levels_to_write, *chunk_array,
ctx,
+ maybe_parent_nulls));
+ }
+ bool is_last_chunk = i == (chunks.size() - 1);
+ if (num_buffered_values_ > 0 && !is_last_chunk) {
+ // Explicitly add a new data page according to the content-defined
chunk
+ // boundaries. This way the same chunks will have the same
byte-sequence
+ // in the resulting file, which can be identified by content
addressible
+ // storage.
+ // Note that the last chunk doesn't trigger a new data page in order
to
+ // allow subsequent WriteArrow() calls to continue writing to the
same
+ // data page, the chunker's state is not being reset after the last
chunk.
+ AddDataPage();
Review Comment:
Yes, you did raise it here:
https://github.com/apache/arrow/pull/45360#discussion_r1978497698
CDC always cuts at record boundaries; see the
[relevant condition here](https://github.com/apache/arrow/pull/45360/files#diff-2a6e72625f8a6cc84b1b6b0c84954dabc47d95eba8bfa5726e6bc5ba4fee1cccR276).
I have also added
[DataPageV2 test cases](https://github.com/apache/arrow/pull/45360/files#diff-9c78490b8a40a2051a1341c5fd65d2669b5ea4991f4e17282a7bbc220632afbeR1480).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]