wgtmac commented on code in PR #17877:
URL: https://github.com/apache/arrow/pull/17877#discussion_r1066769889
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1762,7 +1763,7 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
}
// Return number of logical records read
- int64_t ReadRecordData(int64_t num_records) {
+ int64_t ReadRecordData(int64_t num_records, bool read_dense_for_nullable) {
Review Comment:
If `read_dense_for_nullable` is set to true but the column actually has null
values, it seems that we do not check for this, and the reader may fail with
a random error or read wrong values.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1787,21 +1788,34 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
int64_t null_count = 0;
if (leaf_info_.HasNullableValues()) {
- ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = levels_position_ -
start_levels_position;
- validity_io.valid_bits = valid_bits_->mutable_data();
- validity_io.valid_bits_offset = values_written_;
-
- DefLevelsToBitmap(def_levels() + start_levels_position,
- levels_position_ - start_levels_position, leaf_info_,
- &validity_io);
- values_to_read = validity_io.values_read - validity_io.null_count;
- null_count = validity_io.null_count;
- DCHECK_GE(values_to_read, 0);
- ReadValuesSpaced(validity_io.values_read, null_count);
+ if (read_dense_for_nullable) {
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesDense(values_to_read);
+ // Total values, if reading dense, we don't have the nulls in the
values
+ // vector.
+ values_written_ += values_to_read;
+ // Any values with def_level < max_def_level is considered null.
+ null_count = levels_position_ - start_levels_position - values_to_read;
+ } else {
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = levels_position_ -
start_levels_position;
+ validity_io.valid_bits = valid_bits_->mutable_data();
+ validity_io.valid_bits_offset = values_written_;
+
+ DefLevelsToBitmap(def_levels() + start_levels_position,
+ levels_position_ - start_levels_position, leaf_info_,
+ &validity_io);
+ values_to_read = validity_io.values_read - validity_io.null_count;
+ null_count = validity_io.null_count;
+ ARROW_DCHECK_GE(values_to_read, 0);
+ ReadValuesSpaced(validity_io.values_read, null_count);
Review Comment:
What about checking if `null_count` is zero and calling `ReadValuesDense` if
that is true? Actually `ReadValuesSpaced()` internally calls
`decoder->DecodeSpaced()` which goes to the fast path `decoder->Decode()` if
`null_count` is ZERO.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -278,9 +278,13 @@ class PARQUET_EXPORT RecordReader {
/// \brief Attempt to read indicated number of records from column chunk
/// Note that for repeated fields, a record may have more than one value
- /// and all of them are read.
+ /// and all of them are read. If read_dense_for_nullable is true, it will
+ /// not leave any space for null values. Otherwise, it will read spaced.
+ /// Readers must call Reset() before switching between reading dense and
+ /// spaced since reading dense will not update the valid_bits_.
/// \return number of records read
- virtual int64_t ReadRecords(int64_t num_records) = 0;
+ virtual int64_t ReadRecords(int64_t num_records,
+ bool read_dense_for_nullable = false) = 0;
Review Comment:
Can we derive the null_count info from ColumnChunkMetaData or PageHeader and
automatically enable it? To be conservative, we cannot enable it if the
null_count stats are missing. This would be much safer and more user-friendly.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1787,21 +1788,34 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
int64_t null_count = 0;
if (leaf_info_.HasNullableValues()) {
- ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = levels_position_ -
start_levels_position;
- validity_io.valid_bits = valid_bits_->mutable_data();
- validity_io.valid_bits_offset = values_written_;
-
- DefLevelsToBitmap(def_levels() + start_levels_position,
- levels_position_ - start_levels_position, leaf_info_,
- &validity_io);
- values_to_read = validity_io.values_read - validity_io.null_count;
- null_count = validity_io.null_count;
- DCHECK_GE(values_to_read, 0);
- ReadValuesSpaced(validity_io.values_read, null_count);
+ if (read_dense_for_nullable) {
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesDense(values_to_read);
+ // Total values, if reading dense, we don't have the nulls in the
values
+ // vector.
+ values_written_ += values_to_read;
+ // Any values with def_level < max_def_level is considered null.
+ null_count = levels_position_ - start_levels_position - values_to_read;
+ } else {
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = levels_position_ -
start_levels_position;
+ validity_io.valid_bits = valid_bits_->mutable_data();
+ validity_io.valid_bits_offset = values_written_;
+
+ DefLevelsToBitmap(def_levels() + start_levels_position,
Review Comment:
IIUC, your major saving comes from skipping decoding the def/rep levels if one
is pretty sure the nullable column does not contain any nulls.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]