emkornfield commented on code in PR #14142: URL: https://github.com/apache/arrow/pull/14142#discussion_r984252292
########## cpp/src/parquet/column_reader.cc: ########## @@ -1329,6 +1332,183 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>, return records_read; } + // Throw away levels from start_levels_position to levels_position_. + // Will update levels_position_ and levels_written_ accordingly and move + // the levels to left to fill in the gap. It will not shrink the size + // of the buffer or overwrite the positions after levels_written_. + // This is inefficient, though necessary to consume levels that we have + // already read into the buffer and we want to Skip. + void ThrowAwayLevels(int64_t start_levels_position) { + ARROW_DCHECK_LE(levels_position_, levels_written_); + ARROW_DCHECK_LE(start_levels_position, levels_position_); + int64_t gap = levels_position_ - start_levels_position; + + for (int64_t i = levels_position_; i < levels_written_; ++i) { + *(def_levels() + i - gap) = *(def_levels() + i); + *(rep_levels() + i - gap) = *(rep_levels() + i); + } + + levels_written_ -= gap; + levels_position_ -= gap; + } + + // Skip records that we have in our buffer. This function is only for + // non-repeated fields. + int64_t SkipRecordsInBufferNonRepeated(int64_t num_records) { + ARROW_DCHECK_EQ(this->max_rep_level_, 0); + if (!this->has_values_to_process()) return 0; + + int64_t remaining_records = levels_written_ - levels_position_; + int64_t skipped_records = std::min(num_records, remaining_records); + int64_t start_levels_position = levels_position_; + // Since there is no repetition, number of levels equals number of records. + levels_position_ += skipped_records; + + // We skipped the levels by incrementing 'levels_position_'. For values + // we do not have a buffer, so we need to read them and throw them away. + // First we need to figure out how many present/not-null values there are. 
+ std::shared_ptr<::arrow::ResizableBuffer> valid_bits; + valid_bits = AllocateBuffer(this->pool_); + PARQUET_THROW_NOT_OK( + valid_bits->Resize(bit_util::BytesForBits(skipped_records), true)); + ValidityBitmapInputOutput validity_io; + validity_io.values_read_upper_bound = skipped_records; + validity_io.valid_bits = valid_bits->mutable_data(); + validity_io.valid_bits_offset = 0; + DefLevelsToBitmap(def_levels() + start_levels_position, + skipped_records, + this->leaf_info_, &validity_io); + int64_t values_to_read = validity_io.values_read - validity_io.null_count; + + // Now that we have figured out number of values to read, we do not need + // these levels anymore. Updates levels_position_ and levels_written. + ThrowAwayLevels(start_levels_position); + ReadAndThrowAwayValues(values_to_read); + + // Mark the levels as read in the underlying column reader. + this->ConsumeBufferedValues(skipped_records); Review Comment: it might be worth adding comments to distinguish the difference in this operation from ThrowAwayLevels above. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org