emkornfield commented on code in PR #14964:
URL: https://github.com/apache/arrow/pull/14964#discussion_r1083090721
##########
cpp/src/parquet/page_index.h:
##########
@@ -126,4 +132,94 @@ class PARQUET_EXPORT OffsetIndex {
virtual const std::vector<PageLocation>& page_locations() const = 0;
};
+/// \brief Interface for reading the page index for a Parquet row group.
+class PARQUET_EXPORT RowGroupPageIndexReader {
+ public:
+ virtual ~RowGroupPageIndexReader() = default;
+
+ /// \brief Read column index of a column chunk.
+ ///
+ /// \param[in] i column ordinal of the column chunk.
+ /// \returns column index of the column or nullptr if it does not exist.
+ /// \throws ParquetException if the index is out of bound.
+ virtual std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) = 0;
+
+ /// \brief Read offset index of a column chunk.
+ ///
+ /// \param[in] i column ordinal of the column chunk.
+ /// \returns offset index of the column or nullptr if it does not exist.
+ /// \throws ParquetException if the index is out of bound.
+ virtual std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t i) = 0;
+};
+
+struct IndexSelection {
+ /// Specifies whether to read the column index.
+ bool column_index = false;
+ /// Specifies whether to read the offset index.
+ bool offset_index = false;
+};
+
+struct RowGroupIndexReadRange {
+ /// Base start and total size of column index of all column chunks in a row
group.
+ /// If none of the column chunks have column index, it is set to
std::nullopt.
+ std::optional<::arrow::io::ReadRange> column_index = std::nullopt;
+ /// Base start and total size of offset index of all column chunks in a row
group.
+ /// If none of the column chunks have offset index, it is set to
std::nullopt.
+ std::optional<::arrow::io::ReadRange> offset_index = std::nullopt;
+};
+
+/// \brief Interface for reading the page index for a Parquet file.
+class PARQUET_EXPORT PageIndexReader {
+ public:
+ virtual ~PageIndexReader() = default;
+
+ /// \brief Create a PageIndexReader instance.
+ /// \returns a PageIndexReader instance.
+ /// WARNING: The returned PageIndexReader references to all the input
parameters, so
+ /// it must not outlive all of the input parameters. Usually these input
parameters
+ /// come from the same ParquetFileReader object, so it must not outlive the
reader
+ /// that creates this PageIndexReader.
+ static std::shared_ptr<PageIndexReader> Make(
+ ::arrow::io::RandomAccessFile* input, std::shared_ptr<FileMetaData>
file_metadata,
+ const ReaderProperties& properties,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ /// \brief Get the page index reader of a specific row group.
+ /// \param[in] i row group ordinal to get page index reader.
+ /// \returns RowGroupPageIndexReader of the specified row group. A nullptr
may or may
+ /// not be returned if the page index for the row group is
unavailable. It is
+ /// the caller's responsibility to check the return value of
follow-up calls
+ /// to the RowGroupPageIndexReader.
+ /// \throws ParquetException if the index is out of bound.
+ virtual std::shared_ptr<RowGroupPageIndexReader> RowGroup(int i) = 0;
+
+ /// \brief Advise the reader which part of page index will be read later.
+ ///
+ /// The PageIndexReader implementation can optionally prefetch and cache
page index
+ /// that may be read later. Follow-up read should not fail even if
WillNeed() is not
+ /// called, or the requested page index is out of range from WillNeed() call.
+ ///
+ /// \param[in] row_group_indices list of row group ordinal to read page
index later.
+ /// \param[in] index_selection tell if any of the page index is required
later.
+ virtual void WillNeed(const std::vector<int32_t>& row_group_indices,
+ IndexSelection index_selection) = 0;
Review Comment:
nit: even though this might be more efficient, it probably pays to use const
IndexSelection& as the formal parameter.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]