emkornfield commented on code in PR #14964: URL: https://github.com/apache/arrow/pull/14964#discussion_r1083087767
########## cpp/src/parquet/page_index.cc: ########## @@ -184,8 +185,219 @@ class OffsetIndexImpl : public OffsetIndex { std::vector<PageLocation> page_locations_; }; +class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { + public: + RowGroupPageIndexReaderImpl(::arrow::io::RandomAccessFile* input, + std::shared_ptr<RowGroupMetaData> row_group_metadata, + const ReaderProperties& properties, + int32_t row_group_ordinal, + std::shared_ptr<InternalFileDecryptor> file_decryptor) + : input_(input), + row_group_metadata_(std::move(row_group_metadata)), + properties_(properties), + file_decryptor_(std::move(file_decryptor)), + index_read_range_( + PageIndexReader::DeterminePageIndexRangesInRowGroup(*row_group_metadata_)) {} + + /// Read column index of a column chunk. + std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) override { + if (i < 0 || i >= row_group_metadata_->num_columns()) { + throw ParquetException("Invalid column {} to get column index", i); + } + + auto col_chunk = row_group_metadata_->ColumnChunk(i); + + std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col_chunk->crypto_metadata(); + if (crypto_metadata != nullptr && file_decryptor_ == nullptr) { + ParquetException::NYI("Cannot read encrypted column index yet"); + } + + auto column_index_location = col_chunk->GetColumnIndexLocation(); + if (!column_index_location.has_value()) { + return nullptr; + } + + if (!index_read_range_.column_index.has_value()) { + throw ParquetException("Missing column index read range"); + } + + if (column_index_buffer_ == nullptr) { + PARQUET_ASSIGN_OR_THROW(column_index_buffer_, + input_->ReadAt(index_read_range_.column_index->offset, + index_read_range_.column_index->length)); + } + + auto buffer = column_index_buffer_.get(); + int64_t buffer_offset = + column_index_location->offset - index_read_range_.column_index->offset; + uint32_t length = static_cast<uint32_t>(column_index_location->length); + DCHECK_GE(buffer_offset, 0); Review Comment: if 
location->length is derived from data in the Parquet file, we should eagerly check that the length is within the allowed range. Could we also add a comment explaining why the down-cast is needed? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org