This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c3aac9345 [feat] Add pub api for checking column index is sorted.
(#2849)
c3aac9345 is described below
commit c3aac93454c67b7b1b2ee38cd33aa93c1a8e568e
Author: Yang Jiang <[email protected]>
AuthorDate: Mon Oct 10 16:24:16 2022 +0800
[feat] Add pub api for checking column index is sorted. (#2849)
* [feat]Add pub api for checking column index is sorted.
* export boundary_order
* simplify the code
---
parquet/src/file/page_index/index.rs | 27 +++++++++++++++++++++++++++
parquet/src/file/serialized_reader.rs | 16 ++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index 062dc3966..7adf2c08a 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -63,6 +63,33 @@ pub enum Index {
FIXED_LEN_BYTE_ARRAY(ByteArrayIndex),
}
+impl Index {
+ /// Returns `true` when this index reports a boundary order above `UNORDERED`; returns `false` otherwise, including for `Index::NONE`.
+ pub fn is_sorted(&self) -> bool {
+ // 0: UNORDERED, 1: ASCENDING, 2: DESCENDING, so any raw value above UNORDERED counts as sorted.
+ if let Some(order) = self.get_boundary_order() {
+ order.0 > (BoundaryOrder::UNORDERED.0)
+ } else {
+ false
+ }
+ }
+
+ /// Returns the `boundary_order` stored in the wrapped index, or `None` for `Index::NONE`.
+ pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
+ match self {
+ Index::NONE => None,
+ Index::BOOLEAN(index) => Some(index.boundary_order),
+ Index::INT32(index) => Some(index.boundary_order),
+ Index::INT64(index) => Some(index.boundary_order),
+ Index::INT96(index) => Some(index.boundary_order),
+ Index::FLOAT(index) => Some(index.boundary_order),
+ Index::DOUBLE(index) => Some(index.boundary_order),
+ Index::BYTE_ARRAY(index) => Some(index.boundary_order),
+ Index::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
+ }
+ }
+}
+
/// An index of a column of [`Type`] physical representation
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NativeIndex<T: ParquetValueType> {
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index cd90b0d0b..6b416e34d 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1325,6 +1325,10 @@ mod tests {
let row_group_metadata = metadata.row_group(0);
//col0->id: INT32 UNCOMPRESSED DO:0 FPO:4 SZ:37325/37325/1.00 VC:7300
ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 7299, num_nulls: 0]
+ assert!(!&page_indexes[0][0].is_sorted());
+ let boundary_order = &page_indexes[0][0].get_boundary_order();
+ assert!(boundary_order.is_some());
+ matches!(boundary_order.unwrap(), BoundaryOrder::UNORDERED);
if let Index::INT32(index) = &page_indexes[0][0] {
check_native_page_index(
index,
@@ -1337,6 +1341,7 @@ mod tests {
unreachable!()
};
//col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0]
+ assert!(&page_indexes[0][1].is_sorted());
if let Index::BOOLEAN(index) = &page_indexes[0][1] {
assert_eq!(index.indexes.len(), 82);
assert_eq!(row_group_offset_indexes[1].len(), 82);
@@ -1344,6 +1349,7 @@ mod tests {
unreachable!()
};
//col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
+ assert!(&page_indexes[0][2].is_sorted());
if let Index::INT32(index) = &page_indexes[0][2] {
check_native_page_index(
index,
@@ -1356,6 +1362,7 @@ mod tests {
unreachable!()
};
//col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
+ assert!(&page_indexes[0][3].is_sorted());
if let Index::INT32(index) = &page_indexes[0][3] {
check_native_page_index(
index,
@@ -1368,6 +1375,7 @@ mod tests {
unreachable!()
};
//col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
+ assert!(&page_indexes[0][4].is_sorted());
if let Index::INT32(index) = &page_indexes[0][4] {
check_native_page_index(
index,
@@ -1380,6 +1388,7 @@ mod tests {
unreachable!()
};
//col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326
SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90,
num_nulls: 0]
+ assert!(!&page_indexes[0][5].is_sorted());
if let Index::INT64(index) = &page_indexes[0][5] {
check_native_page_index(
index,
@@ -1392,6 +1401,7 @@ mod tests {
unreachable!()
};
//col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9,
num_nulls: 0]
+ assert!(&page_indexes[0][6].is_sorted());
if let Index::FLOAT(index) = &page_indexes[0][6] {
check_native_page_index(
index,
@@ -1404,6 +1414,7 @@ mod tests {
unreachable!()
};
//col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249
SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max:
90.89999999999999, num_nulls: 0]
+ assert!(!&page_indexes[0][7].is_sorted());
if let Index::DOUBLE(index) = &page_indexes[0][7] {
check_native_page_index(
index,
@@ -1416,6 +1427,7 @@ mod tests {
unreachable!()
};
//col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847
SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max:
12/31/10, num_nulls: 0]
+ assert!(!&page_indexes[0][8].is_sorted());
if let Index::BYTE_ARRAY(index) = &page_indexes[0][8] {
check_bytes_page_index(
index,
@@ -1428,6 +1440,7 @@ mod tests {
unreachable!()
};
//col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795
SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
+ assert!(&page_indexes[0][9].is_sorted());
if let Index::BYTE_ARRAY(index) = &page_indexes[0][9] {
check_bytes_page_index(
index,
@@ -1441,12 +1454,14 @@ mod tests {
};
//col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093
SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0,
min/max not defined]
//Notice: min/max values for each page of this column do not exist.
+ assert!(!&page_indexes[0][10].is_sorted());
if let Index::NONE = &page_indexes[0][10] {
assert_eq!(row_group_offset_indexes[10].len(), 974);
} else {
unreachable!()
};
//col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0]
+ assert!(&page_indexes[0][11].is_sorted());
if let Index::INT32(index) = &page_indexes[0][11] {
check_native_page_index(
index,
@@ -1459,6 +1474,7 @@ mod tests {
unreachable!()
};
//col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0]
+ assert!(!&page_indexes[0][12].is_sorted());
if let Index::INT32(index) = &page_indexes[0][12] {
check_native_page_index(
index,