emkornfield commented on code in PR #14964:
URL: https://github.com/apache/arrow/pull/14964#discussion_r1083087767


##########
cpp/src/parquet/page_index.cc:
##########
@@ -184,8 +185,219 @@ class OffsetIndexImpl : public OffsetIndex {
   std::vector<PageLocation> page_locations_;
 };
 
+class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
+ public:
+  RowGroupPageIndexReaderImpl(::arrow::io::RandomAccessFile* input,
+                              std::shared_ptr<RowGroupMetaData> 
row_group_metadata,
+                              const ReaderProperties& properties,
+                              int32_t row_group_ordinal,
+                              std::shared_ptr<InternalFileDecryptor> 
file_decryptor)
+      : input_(input),
+        row_group_metadata_(std::move(row_group_metadata)),
+        properties_(properties),
+        file_decryptor_(std::move(file_decryptor)),
+        index_read_range_(
+            
PageIndexReader::DeterminePageIndexRangesInRowGroup(*row_group_metadata_)) {}
+
+  /// Read column index of a column chunk.
+  std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) override {
+    if (i < 0 || i >= row_group_metadata_->num_columns()) {
+      throw ParquetException("Invalid column {} to get column index", i);
+    }
+
+    auto col_chunk = row_group_metadata_->ColumnChunk(i);
+
+    std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = 
col_chunk->crypto_metadata();
+    if (crypto_metadata != nullptr && file_decryptor_ == nullptr) {
+      ParquetException::NYI("Cannot read encrypted column index yet");
+    }
+
+    auto column_index_location = col_chunk->GetColumnIndexLocation();
+    if (!column_index_location.has_value()) {
+      return nullptr;
+    }
+
+    if (!index_read_range_.column_index.has_value()) {
+      throw ParquetException("Missing column index read range");
+    }
+
+    if (column_index_buffer_ == nullptr) {
+      PARQUET_ASSIGN_OR_THROW(column_index_buffer_,
+                              
input_->ReadAt(index_read_range_.column_index->offset,
+                                             
index_read_range_.column_index->length));
+    }
+
+    auto buffer = column_index_buffer_.get();
+    int64_t buffer_offset =
+        column_index_location->offset - index_read_range_.column_index->offset;
+    uint32_t length = static_cast<uint32_t>(column_index_location->length);
+    DCHECK_GE(buffer_offset, 0);

Review Comment:
   If location->length is derived from data in the Parquet file, we should 
eagerly check that the length is within the allowed range.
   
   Could we also add a comment explaining why the down-cast is needed?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to