alamb commented on code in PR #8160:
URL: https://github.com/apache/arrow-rs/pull/8160#discussion_r2283054774
##########
parquet/src/file/page_index/index.rs:
##########
@@ -306,6 +308,75 @@ impl<T: ParquetValueType> NativeIndex<T> {
definition_level_histograms,
)
}
+
+ /// Creates a new [`NativeIndex`]
+ pub(crate) fn try_new_local(index: ColumnIndex) -> Result<Self, ParquetError> {
+ let len = index.min_values.len();
+
+ // turn Option<Vec<i64>> into Vec<Option<i64>>
+ let null_counts = index
+ .null_counts
+ .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
+ .unwrap_or_else(|| vec![None; len]);
+
+ // histograms are a 1D array encoding a 2D num_pages X num_levels matrix.
+ let to_page_histograms = |opt_hist: Option<Vec<i64>>| {
+ if let Some(hist) = opt_hist {
+ // TODO: should we assert (hist.len() % len) == 0?
+ let num_levels = hist.len() / len;
+ let mut res = Vec::with_capacity(len);
+ for i in 0..len {
+ let page_idx = i * num_levels;
+ let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
+ res.push(Some(LevelHistogram::from(page_hist)));
+ }
+ res
+ } else {
+ vec![None; len]
+ }
+ };
+
+ // turn Option<Vec<i64>> into Vec<Option<i64>>
Review Comment:
😢
##########
parquet/src/file/page_index/offset_index.rs:
##########
@@ -55,24 +59,26 @@ impl From<&PageLocation> for crate::format::PageLocation {
}
}
+thrift_struct!(
/// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page
/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns.
///
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
-#[derive(Debug, Clone, PartialEq)]
pub struct OffsetIndexMetaData {
- /// Vector of [`PageLocation`] objects, one per page in the chunk.
- pub page_locations: Vec<PageLocation>,
- /// Optional vector of unencoded page sizes, one per page in the chunk.
- /// Only defined for BYTE_ARRAY columns.
- pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
+ /// Vector of [`PageLocation`] objects, one per page in the chunk.
+ 1: required list<PageLocation> page_locations
Review Comment:
I will admit these macros are quite neat and are growing on me
##########
parquet/src/file/page_index/index_reader.rs:
##########
@@ -129,25 +130,37 @@ pub fn read_offset_indexes<R: ChunkReader>(
}
pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData, ParquetError> {
- let mut prot = TCompactSliceInputProtocol::new(data);
- let offset = crate::format::OffsetIndex::read_from_in_protocol(&mut prot)?;
- OffsetIndexMetaData::try_new(offset)
+ let mut prot = ThriftCompactInputProtocol::new(data);
+ OffsetIndexMetaData::try_from(&mut prot)
}
-pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result<Index, ParquetError> {
- let mut prot = TCompactSliceInputProtocol::new(data);
+thrift_struct!(
+pub(crate) struct ColumnIndex<'a> {
+ 1: required list<bool> null_pages
+ 2: required list<'a><binary> min_values
Review Comment:
This is so sad -- the structure actually starts out column-oriented and then we pivot it to rows (just to have to pivot it back to columns to use in DataFusion)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]