pitrou commented on code in PR #40594:
URL: https://github.com/apache/arrow/pull/40594#discussion_r1888191801
##########
cpp/src/parquet/encoder.cc:
##########
@@ -79,6 +79,15 @@ class EncoderImpl : virtual public Encoder {
MemoryPool* memory_pool() const override { return pool_; }
+ int64_t ReportUnencodedDataBytes() override {
+ if (descr_->physical_type() != Type::BYTE_ARRAY) {
+ throw ParquetException("ReportUnencodedDataBytes is only supported for
BYTE_ARRAY");
+ }
+ int64_t byte = unencoded_byte_array_data_bytes_;
+ unencoded_byte_array_data_bytes_ = 0;
+ return byte;
Review Comment:
Nit: let's make the name less confusing?
```suggestion
int64_t bytes = unencoded_byte_array_data_bytes_;
unencoded_byte_array_data_bytes_ = 0;
return bytes;
```
##########
cpp/src/parquet/page_index.cc:
##########
@@ -533,6 +569,28 @@ class ColumnIndexBuilderImpl final : public
ColumnIndexBuilder {
/// Decide the boundary order from decoded min/max values.
auto boundary_order = DetermineBoundaryOrder(min_values, max_values);
column_index_.__set_boundary_order(ToThrift(boundary_order));
+
+ // Finalize level histogram.
+ const int64_t num_pages = column_index_.null_pages.size();
+ const int64_t def_level_hist_size =
column_index_.definition_level_histograms.size();
+ const int64_t rep_level_hist_size =
column_index_.repetition_level_histograms.size();
+ if (def_level_hist_size != 0 &&
+ def_level_hist_size != (descr_->max_definition_level() + 1) *
num_pages) {
+ std::stringstream ss;
+ ss << "Invalid definition level histogram size: " << def_level_hist_size
+ << ", expected: " << (descr_->max_definition_level() + 1) * num_pages;
Review Comment:
We're missing the actual throw here:
```suggestion
<< ", expected: " << (descr_->max_definition_level() + 1) *
num_pages;
throw ParquetException(ss.str());
```
##########
cpp/src/parquet/properties.h:
##########
@@ -639,6 +650,16 @@ class PARQUET_EXPORT WriterProperties {
return this->disable_write_page_index(path->ToDotString());
}
+ /// \brief Set the level to write size statistics for all columns. Default
is NONE.
+ ///
+ /// \param level The level to write size statistics. Note that if page
index is not
+ /// enabled, page level size statistics will not be written even if the
level
+ /// is set to PAGE.
Review Comment:
```suggestion
/// \brief Set the level to write size statistics for all columns.
Default is None.
///
/// \param level The level to write size statistics. Note that if page
index is not
/// enabled, page level size statistics will not be written even if the
level
/// is set to PageAndColumnChunk.
```
##########
cpp/src/parquet/size_statistics.cc:
##########
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliancec
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/size_statistics.h"
+
+#include <algorithm>
+
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+void SizeStatistics::Merge(const SizeStatistics& other) {
+ if (repetition_level_histogram.size() !=
other.repetition_level_histogram.size()) {
+ throw ParquetException("Repetition level histogram size mismatch");
+ }
+ if (definition_level_histogram.size() !=
other.definition_level_histogram.size()) {
+ throw ParquetException("Definition level histogram size mismatch");
+ }
+ if (unencoded_byte_array_data_bytes.has_value() !=
+ other.unencoded_byte_array_data_bytes.has_value()) {
+ throw ParquetException("Unencoded byte array data bytes are not
consistent");
+ }
+ std::transform(repetition_level_histogram.begin(),
repetition_level_histogram.end(),
+ other.repetition_level_histogram.begin(),
+ repetition_level_histogram.begin(), std::plus<>());
+ std::transform(definition_level_histogram.begin(),
definition_level_histogram.end(),
+ other.definition_level_histogram.begin(),
+ definition_level_histogram.begin(), std::plus<>());
+ if (unencoded_byte_array_data_bytes.has_value()) {
+ unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes.value() +
+
other.unencoded_byte_array_data_bytes.value();
+ }
+}
+
+void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) {
+ ARROW_CHECK(unencoded_byte_array_data_bytes.has_value());
+ unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes.value() +
value;
+}
+
+void SizeStatistics::Validate(const ColumnDescriptor* descr) const {
+ if (repetition_level_histogram.size() !=
+ static_cast<size_t>(descr->max_repetition_level() + 1)) {
+ throw ParquetException("Repetition level histogram size mismatch");
+ }
+ if (definition_level_histogram.size() !=
+ static_cast<size_t>(descr->max_definition_level() + 1)) {
+ throw ParquetException("Definition level histogram size mismatch");
+ }
+ if (unencoded_byte_array_data_bytes.has_value() &&
+ descr->physical_type() != Type::BYTE_ARRAY) {
+ throw ParquetException("Unencoded byte array data bytes does not support "
+
+ TypeToString(descr->physical_type()));
+ }
Review Comment:
Should we conversely also throw an exception if the type is `BYTE_ARRAY` and
`unencoded_byte_array_data_bytes` is not set? After all, we also mandate that
`definition_level_histogram` and `repetition_level_histogram` are set above.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]