This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 80ed712851 Use `LevelHistogram` in `PageIndex` (#6135)
80ed712851 is described below
commit 80ed7128510bac114c6feec08c34ef3beed3a44a
Author: Ed Seidl <[email protected]>
AuthorDate: Mon Jul 29 03:32:41 2024 -0700
Use `LevelHistogram` in `PageIndex` (#6135)
* use LevelHistogram in PageIndex and ColumnIndexBuilder
* revert changes to OffsetIndexBuilder
---
parquet/src/file/metadata/mod.rs | 2 +-
parquet/src/file/page_index/index.rs | 28 ++++++++++++++++------------
parquet/src/file/writer.rs | 6 +++---
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index cd3555de82..d99cd95103 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -569,7 +569,7 @@ pub struct ColumnChunkMetaData {
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
inner: Vec<i64>,
}
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 68412572b5..cebb602b31 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -21,6 +21,7 @@ use crate::basic::Type;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
use crate::errors::ParquetError;
+use crate::file::metadata::LevelHistogram;
use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;
@@ -40,13 +41,13 @@ pub struct PageIndex<T> {
///
/// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
/// For example, `repetition_level_histogram[0]` indicates how many rows the page contains.
- pub repetition_level_histogram: Option<Vec<i64>>,
+ pub repetition_level_histogram: Option<LevelHistogram>,
/// Definition level histogram for the page
///
/// `definition_level_histogram[i]` is a count of how many values are at definition level `i`.
/// For example, `definition_level_histogram[max_definition_level]` indicates how many
/// non-null values are present in the page.
- pub definition_level_histogram: Option<Vec<i64>>,
+ pub definition_level_histogram: Option<LevelHistogram>,
}
impl<T> PageIndex<T> {
@@ -59,10 +60,10 @@ impl<T> PageIndex<T> {
pub fn null_count(&self) -> Option<i64> {
self.null_count
}
- pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> {
+ pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
self.repetition_level_histogram.as_ref()
}
- pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> {
+ pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
self.definition_level_histogram.as_ref()
}
}
@@ -175,7 +176,7 @@ impl<T: ParquetValueType> NativeIndex<T> {
for i in 0..len {
let page_idx = i * num_levels;
let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
- res.push(Some(page_hist));
+ res.push(Some(LevelHistogram::from(page_hist)));
}
res
} else {
@@ -183,9 +184,9 @@ impl<T: ParquetValueType> NativeIndex<T> {
}
};
- let rep_hists: Vec<Option<Vec<i64>>> =
+ let rep_hists: Vec<Option<LevelHistogram>> =
to_page_histograms(index.repetition_level_histograms);
- let def_hists: Vec<Option<Vec<i64>>> =
+ let def_hists: Vec<Option<LevelHistogram>> =
to_page_histograms(index.definition_level_histograms);
let indexes = index
@@ -236,8 +237,8 @@ mod tests {
min: Some(-123),
max: Some(234),
null_count: Some(0),
- repetition_level_histogram: Some(vec![1, 2]),
- definition_level_histogram: Some(vec![1, 2, 3]),
+ repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])),
+ definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])),
};
assert_eq!(page_index.min().unwrap(), &-123);
@@ -245,10 +246,13 @@ mod tests {
assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes());
assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes());
assert_eq!(page_index.null_count().unwrap(), 0);
- assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2]));
assert_eq!(
- page_index.definition_level_histogram(),
- Some(&vec![1, 2, 3])
+ page_index.repetition_level_histogram().unwrap().values(),
+ &vec![1, 2]
+ );
+ assert_eq!(
+ page_index.definition_level_histogram().unwrap().values(),
+ &vec![1, 2, 3]
);
}
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index f2e8f74a37..89aaf028d1 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1951,7 +1951,7 @@ mod tests {
assert!(col_idx.repetition_level_histogram().is_none());
assert!(col_idx.definition_level_histogram().is_some());
- check_def_hist(col_idx.definition_level_histogram().unwrap());
+ check_def_hist(col_idx.definition_level_histogram().unwrap().values());
assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();
@@ -2066,8 +2066,8 @@ mod tests {
unreachable!()
};
- check_def_hist(col_idx.definition_level_histogram().unwrap());
- check_rep_hist(col_idx.repetition_level_histogram().unwrap());
+ check_def_hist(col_idx.definition_level_histogram().unwrap().values());
+ check_rep_hist(col_idx.repetition_level_histogram().unwrap().values());
assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();