This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 4ba9a2509c [geo] add geospatial statistics and bbox types for parquet
(#8225)
4ba9a2509c is described below
commit 4ba9a2509c45ea20944c383efb958a3139b63e74
Author: Kaushik Srinivasan <[email protected]>
AuthorDate: Sun Sep 28 09:22:33 2025 -0400
[geo] add geospatial statistics and bbox types for parquet (#8225)
# Which issue does this PR close?
- Builds on #7799 by supporting the new struct `GeospatialStatistics`.
# Rationale for this change
* This is part of a draft to support geospatial types (geometry and
geography) in Parquet.
# What changes are included in this PR?
- Structs for supporting geospatial statistics information (bbox and
geospatial types) derived from thrift classes.
- Would appreciate feedback on structure and where certain parts should
go.
# Are these changes tested?
Yes
# Are there any user-facing changes?
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
---
parquet/src/arrow/schema/primitive.rs | 2 +
parquet/src/basic.rs | 2 +
parquet/src/file/metadata/mod.rs | 30 ++-
parquet/src/geospatial/bounding_box.rs | 413 +++++++++++++++++++++++++++++++++
parquet/src/geospatial/mod.rs | 50 ++++
parquet/src/geospatial/statistics.rs | 164 +++++++++++++
parquet/src/lib.rs | 2 +-
parquet/src/schema/types.rs | 2 +
8 files changed, 659 insertions(+), 6 deletions(-)
diff --git a/parquet/src/arrow/schema/primitive.rs
b/parquet/src/arrow/schema/primitive.rs
index 1b3ab7d45c..24b18b39bc 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -276,6 +276,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32,
scale: i32) -> Result<D
(Some(LogicalType::Json), _) => Ok(DataType::Utf8),
(Some(LogicalType::Bson), _) => Ok(DataType::Binary),
(Some(LogicalType::Enum), _) => Ok(DataType::Binary),
+ (Some(LogicalType::Geometry), _) => Ok(DataType::Binary),
+ (Some(LogicalType::Geography), _) => Ok(DataType::Binary),
(None, ConvertedType::NONE) => Ok(DataType::Binary),
(None, ConvertedType::JSON) => Ok(DataType::Utf8),
(None, ConvertedType::BSON) => Ok(DataType::Binary),
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 2cf5e46fea..6353b5f4ee 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -1201,6 +1201,8 @@ impl str::FromStr for LogicalType {
"Interval parquet logical type not yet supported"
)),
"FLOAT16" => Ok(LogicalType::Float16),
+ "GEOMETRY" => Ok(LogicalType::Geometry),
+ "GEOGRAPHY" => Ok(LogicalType::Geography),
other => Err(general_err!("Invalid parquet logical type {}",
other)),
}
}
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index e04b8c9c8e..4aa0388fd2 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -116,6 +116,7 @@ use crate::format::{
BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex,
PageLocation, RowGroup,
SizeStatistics, SortingColumn,
};
+use crate::geospatial::statistics as geo_statistics;
use crate::schema::types::{
ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr,
SchemaDescriptor,
Type as SchemaType,
@@ -839,6 +840,7 @@ pub struct ColumnChunkMetaData {
index_page_offset: Option<i64>,
dictionary_page_offset: Option<i64>,
statistics: Option<Statistics>,
+ geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
encoding_stats: Option<Vec<PageEncodingStats>>,
bloom_filter_offset: Option<i64>,
bloom_filter_length: Option<i32>,
@@ -1064,6 +1066,12 @@ impl ColumnChunkMetaData {
self.statistics.as_ref()
}
+ /// Returns geospatial statistics that are set for this column chunk,
+ /// or `None` if no geospatial statistics are available.
+ pub fn geo_statistics(&self) ->
Option<&geo_statistics::GeospatialStatistics> {
+ self.geo_statistics.as_deref()
+ }
+
/// Returns the offset for the page encoding stats,
/// or `None` if no page encoding stats are available.
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
@@ -1168,6 +1176,8 @@ impl ColumnChunkMetaData {
let index_page_offset = col_metadata.index_page_offset;
let dictionary_page_offset = col_metadata.dictionary_page_offset;
let statistics = statistics::from_thrift(column_type,
col_metadata.statistics)?;
+ let geo_statistics =
+
geo_statistics::from_thrift(col_metadata.geospatial_statistics).map(Box::new);
let encoding_stats = col_metadata
.encoding_stats
.as_ref()
@@ -1230,6 +1240,7 @@ impl ColumnChunkMetaData {
unencoded_byte_array_data_bytes,
repetition_level_histogram,
definition_level_histogram,
+ geo_statistics,
#[cfg(feature = "encryption")]
column_crypto_metadata,
};
@@ -1298,7 +1309,9 @@ impl ColumnChunkMetaData {
bloom_filter_offset: self.bloom_filter_offset,
bloom_filter_length: self.bloom_filter_length,
size_statistics,
- geospatial_statistics: None,
+ geospatial_statistics: geo_statistics::to_thrift(
+ self.geo_statistics.as_ref().map(|boxed| boxed.as_ref()),
+ ),
}
}
@@ -1358,6 +1371,7 @@ impl ColumnChunkMetaDataBuilder {
index_page_offset: None,
dictionary_page_offset: None,
statistics: None,
+ geo_statistics: None,
encoding_stats: None,
bloom_filter_offset: None,
bloom_filter_length: None,
@@ -1433,6 +1447,12 @@ impl ColumnChunkMetaDataBuilder {
self
}
+ /// Sets geospatial statistics for this column chunk.
+ pub fn set_geo_statistics(mut self, value:
Box<geo_statistics::GeospatialStatistics>) -> Self {
+ self.0.geo_statistics = Some(value);
+ self
+ }
+
/// Clears the statistics for this column chunk.
pub fn clear_statistics(mut self) -> Self {
self.0.statistics = None;
@@ -1972,9 +1992,9 @@ mod tests {
.build();
#[cfg(not(feature = "encryption"))]
- let base_expected_size = 2312;
+ let base_expected_size = 2344;
#[cfg(feature = "encryption")]
- let base_expected_size = 2648;
+ let base_expected_size = 2680;
assert_eq!(parquet_meta.memory_size(), base_expected_size);
@@ -2002,9 +2022,9 @@ mod tests {
.build();
#[cfg(not(feature = "encryption"))]
- let bigger_expected_size = 2816;
+ let bigger_expected_size = 2848;
#[cfg(feature = "encryption")]
- let bigger_expected_size = 3152;
+ let bigger_expected_size = 3184;
// more set fields means more memory usage
assert!(bigger_expected_size > base_expected_size);
diff --git a/parquet/src/geospatial/bounding_box.rs
b/parquet/src/geospatial/bounding_box.rs
new file mode 100644
index 0000000000..7d9eb58d00
--- /dev/null
+++ b/parquet/src/geospatial/bounding_box.rs
@@ -0,0 +1,413 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bounding box for GEOMETRY or GEOGRAPHY type in the representation of
min/max
+//! value pair of coordinates from each axis.
+//!
+//! Derived from the parquet format spec:
<https://github.com/apache/parquet-format/blob/master/Geospatial.md>
+//!
+//!
+use crate::format as parquet;
+
+/// A geospatial instance has at least two coordinate dimensions: X and Y for
2D coordinates of each point.
+/// X represents longitude/easting and Y represents latitude/northing. A
geospatial instance can optionally
+/// have Z and/or M values associated with each point.
+///
+/// The Z values introduce the third dimension coordinate, typically used to
indicate height or elevation.
+///
+/// M values allow tracking a value in a fourth dimension. These can represent:
+/// - Linear reference values (e.g., highway milepost)
+/// - Timestamps
+/// - Other values defined by the CRS
+///
+/// The bounding box is defined as min/max value pairs of coordinates from
each axis. X and Y values are
+/// always present, while Z and M are omitted for 2D geospatial instances.
+///
+/// When calculating a bounding box:
+/// - Null or NaN values in a coordinate dimension are skipped
+/// - If a dimension has only null/NaN values, that dimension is omitted
+/// - If either X or Y dimension is missing, no bounding box is produced
+/// - Example: POINT (1 NaN) contributes to X but not to Y, Z, or M dimensions
+///
+/// Special cases:
+/// - For X values only, xmin may exceed xmax. In this case, a point matches
if x >= xmin OR x <= xmax
+/// - This wraparound can occur when the bounding box crosses the antimeridian
line.
+/// - In geographic terms: xmin=westernmost, xmax=easternmost,
ymin=southernmost, ymax=northernmost
+///
+/// For GEOGRAPHY types:
+/// - X values must be within [-180, 180] (longitude)
+/// - Y values must be within [-90, 90] (latitude)
+///
+/// Derived from the parquet format [spec][bounding-box-spec]
+///
+/// # Examples
+///
+/// ```
+/// use parquet::geospatial::bounding_box::BoundingBox;
+///
+/// // 2D bounding box
+/// let bbox_2d = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+///
+/// // 3D bounding box with elevation
+/// let bbox_3d = BoundingBox::new(0.0, 0.0, 100.0, 100.0)
+/// .with_zrange(0.0, 1000.0);
+///
+/// // 3D bounding box with elevation and measured value
+/// let bbox_3d_m = BoundingBox::new(0.0, 0.0, 100.0, 100.0)
+/// .with_zrange(0.0, 1000.0)
+/// .with_mrange(0.0, 1000.0);
+/// ```
+///
+/// [bounding-box-spec]:
https://github.com/apache/parquet-format/blob/master/Geospatial.md#bounding-box
+#[derive(Clone, Debug, PartialEq)]
+pub struct BoundingBox {
+ /// X coordinates (longitude or easting): (min, max)
+ x_range: (f64, f64),
+ /// Y coordinates (latitude or northing): (min, max)
+ y_range: (f64, f64),
+ /// Z coordinates (elevation/height): (min, max), if present
+ z_range: Option<(f64, f64)>,
+ /// M coordinates (measured value): (min, max), if present
+ m_range: Option<(f64, f64)>,
+}
+
+impl BoundingBox {
+ /// Creates a new bounding box with the specified coordinates.
+ pub fn new(xmin: f64, xmax: f64, ymin: f64, ymax: f64) -> Self {
+ Self {
+ x_range: (xmin, xmax),
+ y_range: (ymin, ymax),
+ z_range: None,
+ m_range: None,
+ }
+ }
+
+ /// Updates the bounding box with specified X-coordinate range.
+ pub fn with_xrange(mut self, xmin: f64, xmax: f64) -> Self {
+ self.x_range = (xmin, xmax);
+ self
+ }
+
+ /// Updates the bounding box with specified Y-coordinate range.
+ pub fn with_yrange(mut self, ymin: f64, ymax: f64) -> Self {
+ self.y_range = (ymin, ymax);
+ self
+ }
+
+ /// Updates the bounding box with the specified Z-coordinate range.
+ pub fn with_zrange(mut self, zmin: f64, zmax: f64) -> Self {
+ self.z_range = Some((zmin, zmax));
+ self
+ }
+
+ /// Updates the bounding box with the specified M-coordinate range.
+ pub fn with_mrange(mut self, mmin: f64, mmax: f64) -> Self {
+ self.m_range = Some((mmin, mmax));
+ self
+ }
+
+ /// Returns the minimum x-coordinate.
+ pub fn get_xmin(&self) -> f64 {
+ self.x_range.0
+ }
+
+ /// Returns the maximum x-coordinate.
+ pub fn get_xmax(&self) -> f64 {
+ self.x_range.1
+ }
+
+ /// Returns the minimum y-coordinate.
+ pub fn get_ymin(&self) -> f64 {
+ self.y_range.0
+ }
+
+ /// Returns the maximum y-coordinate.
+ pub fn get_ymax(&self) -> f64 {
+ self.y_range.1
+ }
+
+ /// Returns the minimum z-coordinate, if present.
+ pub fn get_zmin(&self) -> Option<f64> {
+ self.z_range.map(|z| z.0)
+ }
+
+ /// Returns the maximum z-coordinate, if present.
+ pub fn get_zmax(&self) -> Option<f64> {
+ self.z_range.map(|z| z.1)
+ }
+
+ /// Returns the minimum m-value (measure), if present.
+ pub fn get_mmin(&self) -> Option<f64> {
+ self.m_range.map(|m| m.0)
+ }
+
+ /// Returns the maximum m-value (measure), if present.
+ pub fn get_mmax(&self) -> Option<f64> {
+ self.m_range.map(|m| m.1)
+ }
+
+ /// Returns `true` if both zmin and zmax are present.
+ pub fn is_z_valid(&self) -> bool {
+ self.z_range.is_some()
+ }
+
+ /// Returns `true` if both mmin and mmax are present.
+ pub fn is_m_valid(&self) -> bool {
+ self.m_range.is_some()
+ }
+}
+
+impl From<BoundingBox> for parquet::BoundingBox {
+ /// Converts our internal `BoundingBox` to the Thrift-generated format.
+ fn from(b: BoundingBox) -> parquet::BoundingBox {
+ parquet::BoundingBox {
+ xmin: b.x_range.0.into(),
+ xmax: b.x_range.1.into(),
+ ymin: b.y_range.0.into(),
+ ymax: b.y_range.1.into(),
+ zmin: b.z_range.map(|z| z.0.into()),
+ zmax: b.z_range.map(|z| z.1.into()),
+ mmin: b.m_range.map(|m| m.0.into()),
+ mmax: b.m_range.map(|m| m.1.into()),
+ }
+ }
+}
+
+impl From<parquet::BoundingBox> for BoundingBox {
+ fn from(bbox: parquet::BoundingBox) -> Self {
+ let mut new_bbox = Self::new(
+ bbox.xmin.into(),
+ bbox.xmax.into(),
+ bbox.ymin.into(),
+ bbox.ymax.into(),
+ );
+
+ new_bbox = match (bbox.zmin, bbox.zmax) {
+ (Some(zmin), Some(zmax)) => new_bbox.with_zrange(zmin.into(),
zmax.into()),
+ // If either None or mismatch, leave it as None and don't error
+ _ => new_bbox,
+ };
+
+ new_bbox = match (bbox.mmin, bbox.mmax) {
+ (Some(mmin), Some(mmax)) => new_bbox.with_mrange(mmin.into(),
mmax.into()),
+ // If either None or mismatch, leave it as None and don't error
+ _ => new_bbox,
+ };
+
+ new_bbox
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_bounding_box() {
+ let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
+ assert_eq!(bbox.get_xmin(), 0.0);
+ assert_eq!(bbox.get_xmax(), 0.0);
+ assert_eq!(bbox.get_ymin(), 10.0);
+ assert_eq!(bbox.get_ymax(), 10.0);
+ assert_eq!(bbox.get_zmin(), None);
+ assert_eq!(bbox.get_zmax(), None);
+ assert_eq!(bbox.get_mmin(), None);
+ assert_eq!(bbox.get_mmax(), None);
+ assert!(!bbox.is_z_valid());
+ assert!(!bbox.is_m_valid());
+
+ // test with zrange
+ let bbox_z = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_zrange(5.0,
15.0);
+ assert_eq!(bbox_z.get_zmin(), Some(5.0));
+ assert_eq!(bbox_z.get_zmax(), Some(15.0));
+ assert!(bbox_z.is_z_valid());
+ assert!(!bbox_z.is_m_valid());
+
+ // test with mrange
+ let bbox_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_mrange(10.0,
20.0);
+ assert_eq!(bbox_m.get_mmin(), Some(10.0));
+ assert_eq!(bbox_m.get_mmax(), Some(20.0));
+ assert!(!bbox_m.is_z_valid());
+ assert!(bbox_m.is_m_valid());
+
+ // test with zrange and mrange
+ let bbox_zm = BoundingBox::new(0.0, 0.0, 10.0, 10.0)
+ .with_zrange(5.0, 15.0)
+ .with_mrange(10.0, 20.0);
+ assert_eq!(bbox_zm.get_zmin(), Some(5.0));
+ assert_eq!(bbox_zm.get_zmax(), Some(15.0));
+ assert_eq!(bbox_zm.get_mmin(), Some(10.0));
+ assert_eq!(bbox_zm.get_mmax(), Some(20.0));
+ assert!(bbox_zm.is_z_valid());
+ assert!(bbox_zm.is_m_valid());
+ }
+
+ #[test]
+ fn test_bounding_box_to_thrift() {
+ use thrift::OrderedFloat;
+
+ let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
+ let thrift_bbox: parquet::BoundingBox = bbox.into();
+ assert_eq!(thrift_bbox.xmin, 0.0);
+ assert_eq!(thrift_bbox.xmax, 0.0);
+ assert_eq!(thrift_bbox.ymin, 10.0);
+ assert_eq!(thrift_bbox.ymax, 10.0);
+ assert_eq!(thrift_bbox.zmin, None);
+ assert_eq!(thrift_bbox.zmax, None);
+ assert_eq!(thrift_bbox.mmin, None);
+ assert_eq!(thrift_bbox.mmax, None);
+
+ let bbox_z = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_zrange(5.0,
15.0);
+ let thrift_bbox_z: parquet::BoundingBox = bbox_z.into();
+ assert_eq!(thrift_bbox_z.zmin, Some(OrderedFloat(5.0)));
+ assert_eq!(thrift_bbox_z.zmax, Some(OrderedFloat(15.0)));
+ assert_eq!(thrift_bbox_z.mmin, None);
+ assert_eq!(thrift_bbox_z.mmax, None);
+
+ let bbox_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_mrange(10.0,
20.0);
+ let thrift_bbox_m: parquet::BoundingBox = bbox_m.into();
+ assert_eq!(thrift_bbox_m.zmin, None);
+ assert_eq!(thrift_bbox_m.zmax, None);
+ assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0)));
+ assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0)));
+
+ let bbox_z_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0)
+ .with_zrange(5.0, 15.0)
+ .with_mrange(10.0, 20.0);
+ let thrift_bbox_zm: parquet::BoundingBox = bbox_z_m.into();
+ assert_eq!(thrift_bbox_zm.zmin, Some(OrderedFloat(5.0)));
+ assert_eq!(thrift_bbox_zm.zmax, Some(OrderedFloat(15.0)));
+ assert_eq!(thrift_bbox_zm.mmin, Some(OrderedFloat(10.0)));
+ assert_eq!(thrift_bbox_zm.mmax, Some(OrderedFloat(20.0)));
+ }
+
+ #[test]
+ fn test_bounding_box_from_thrift() {
+ use thrift::OrderedFloat;
+
+ let thrift_bbox = parquet::BoundingBox {
+ xmin: OrderedFloat(0.0),
+ xmax: OrderedFloat(0.0),
+ ymin: OrderedFloat(10.0),
+ ymax: OrderedFloat(10.0),
+ zmin: None,
+ zmax: None,
+ mmin: None,
+ mmax: None,
+ };
+ let bbox: BoundingBox = thrift_bbox.into();
+ assert_eq!(bbox.get_xmin(), 0.0);
+ assert_eq!(bbox.get_xmax(), 0.0);
+ assert_eq!(bbox.get_ymin(), 10.0);
+ assert_eq!(bbox.get_ymax(), 10.0);
+ assert_eq!(bbox.get_zmin(), None);
+ assert_eq!(bbox.get_zmax(), None);
+ assert_eq!(bbox.get_mmin(), None);
+ assert_eq!(bbox.get_mmax(), None);
+
+ let thrift_bbox_z = parquet::BoundingBox {
+ xmin: OrderedFloat(0.0),
+ xmax: OrderedFloat(0.0),
+ ymin: OrderedFloat(10.0),
+ ymax: OrderedFloat(10.0),
+ zmin: Some(OrderedFloat(130.0)),
+ zmax: Some(OrderedFloat(130.0)),
+ mmin: None,
+ mmax: None,
+ };
+ let bbox_z: BoundingBox = thrift_bbox_z.into();
+ assert_eq!(bbox_z.get_xmin(), 0.0);
+ assert_eq!(bbox_z.get_xmax(), 0.0);
+ assert_eq!(bbox_z.get_ymin(), 10.0);
+ assert_eq!(bbox_z.get_ymax(), 10.0);
+ assert_eq!(bbox_z.get_zmin(), Some(130.0));
+ assert_eq!(bbox_z.get_zmax(), Some(130.0));
+ assert_eq!(bbox_z.get_mmin(), None);
+ assert_eq!(bbox_z.get_mmax(), None);
+
+ let thrift_bbox_m = parquet::BoundingBox {
+ xmin: OrderedFloat(0.0),
+ xmax: OrderedFloat(0.0),
+ ymin: OrderedFloat(10.0),
+ ymax: OrderedFloat(10.0),
+ zmin: None,
+ zmax: None,
+ mmin: Some(OrderedFloat(120.0)),
+ mmax: Some(OrderedFloat(120.0)),
+ };
+ let bbox_m: BoundingBox = thrift_bbox_m.into();
+ assert_eq!(bbox_m.get_xmin(), 0.0);
+ assert_eq!(bbox_m.get_xmax(), 0.0);
+ assert_eq!(bbox_m.get_ymin(), 10.0);
+ assert_eq!(bbox_m.get_ymax(), 10.0);
+ assert_eq!(bbox_m.get_zmin(), None);
+ assert_eq!(bbox_m.get_zmax(), None);
+ assert_eq!(bbox_m.get_mmin(), Some(120.0));
+ assert_eq!(bbox_m.get_mmax(), Some(120.0));
+
+ let thrift_bbox_zm = parquet::BoundingBox {
+ xmin: OrderedFloat(0.0),
+ xmax: OrderedFloat(0.0),
+ ymin: OrderedFloat(10.0),
+ ymax: OrderedFloat(10.0),
+ zmin: Some(OrderedFloat(130.0)),
+ zmax: Some(OrderedFloat(130.0)),
+ mmin: Some(OrderedFloat(120.0)),
+ mmax: Some(OrderedFloat(120.0)),
+ };
+
+ let bbox_zm: BoundingBox = thrift_bbox_zm.into();
+ assert_eq!(bbox_zm.get_xmin(), 0.0);
+ assert_eq!(bbox_zm.get_xmax(), 0.0);
+ assert_eq!(bbox_zm.get_ymin(), 10.0);
+ assert_eq!(bbox_zm.get_ymax(), 10.0);
+ assert_eq!(bbox_zm.get_zmin(), Some(130.0));
+ assert_eq!(bbox_zm.get_zmax(), Some(130.0));
+ assert_eq!(bbox_zm.get_mmin(), Some(120.0));
+ assert_eq!(bbox_zm.get_mmax(), Some(120.0));
+ }
+
+ #[test]
+ fn test_bounding_box_thrift_roundtrip() {
+ use thrift::OrderedFloat;
+
+ let thrift_bbox = parquet::BoundingBox {
+ xmin: OrderedFloat(0.0),
+ xmax: OrderedFloat(0.0),
+ ymin: OrderedFloat(10.0),
+ ymax: OrderedFloat(10.0),
+ zmin: Some(OrderedFloat(130.0)),
+ zmax: Some(OrderedFloat(130.0)),
+ mmin: Some(OrderedFloat(120.0)),
+ mmax: Some(OrderedFloat(120.0)),
+ };
+
+ // cloning to make sure it's not moved
+ let bbox: BoundingBox = thrift_bbox.clone().into();
+ assert_eq!(bbox.get_xmin(), 0.0);
+ assert_eq!(bbox.get_xmax(), 0.0);
+ assert_eq!(bbox.get_ymin(), 10.0);
+ assert_eq!(bbox.get_ymax(), 10.0);
+ assert_eq!(bbox.get_zmin(), Some(130.0));
+ assert_eq!(bbox.get_zmax(), Some(130.0));
+ assert_eq!(bbox.get_mmin(), Some(120.0));
+ assert_eq!(bbox.get_mmax(), Some(120.0));
+
+ let thrift_bbox_2: parquet::BoundingBox = bbox.into();
+ assert_eq!(thrift_bbox_2, thrift_bbox);
+ }
+}
diff --git a/parquet/src/geospatial/mod.rs b/parquet/src/geospatial/mod.rs
new file mode 100644
index 0000000000..b6864cf8bd
--- /dev/null
+++ b/parquet/src/geospatial/mod.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This module provides functionality for working with geospatial data in
Parquet files as defined in the [spec][parquet-geo-spec].
+//!
+//! * [`GeospatialStatistics`]: describes the geospatial statistics for a
Parquet column.
+//! * [`BoundingBox`]: describes the bounding box values for a geospatial
column.
+//!
+//! [`GeospatialStatistics`] describes the geospatial statistics for a Parquet
column.
+//! * bbox: the [`BoundingBox`] for the geospatial data
+//! * geospatial_types: the geospatial types for the geospatial data as
specified in [specification][geo-types].
+//!
+//! Geospatial bounding box describes the spatial extent of the geospatial
data within a Parquet row group.
+//! * xmin, xmax: the minimum and maximum longitude values
+//! * ymin, ymax: the minimum and maximum latitude values
+//! * zmin, zmax: (optional) the minimum and maximum elevation values
+//! * mmin, mmax: (optional) the minimum and maximum linear reference values
+//!
+//! In 2D representation, where x are points:
+//! ```text
+//! ymax +-----------------------+
+//! | x |
+//! | x |
+//! | x |
+//! | x |
+//! ymin +-----------------------+
+//! xmin xmax
+//! ```
+//!
+//! [`GeospatialStatistics`]:
crate::geospatial::statistics::GeospatialStatistics
+//! [`BoundingBox`]: crate::geospatial::bounding_box::BoundingBox
+//! [parquet-geo-spec]:
https://github.com/apache/parquet-format/blob/master/Geospatial.md
+//! [geo-types]:
https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types
+
+pub mod bounding_box;
+pub mod statistics;
diff --git a/parquet/src/geospatial/statistics.rs
b/parquet/src/geospatial/statistics.rs
new file mode 100644
index 0000000000..2a39c494bd
--- /dev/null
+++ b/parquet/src/geospatial/statistics.rs
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Geospatial statistics for Parquet files.
+//!
+//! This module provides functionality for working with geospatial statistics
in Parquet files.
+//! It includes support for bounding boxes and geospatial statistics in column
chunk metadata.
+
+use crate::format::GeospatialStatistics as TGeospatialStatistics;
+use crate::geospatial::bounding_box::BoundingBox;
+
+// ----------------------------------------------------------------------
+// Geospatial Statistics
+
+/// Represents geospatial statistics for a Parquet column or dataset.
+///
+/// This struct contains metadata about the spatial characteristics of
geospatial data,
+/// including bounding box information and the types of geospatial geometries
present.
+/// It's used to optimize spatial queries and provide spatial context for data
analysis.
+///
+/// # Examples
+///
+/// ```
+/// use parquet::geospatial::statistics::GeospatialStatistics;
+/// use parquet::geospatial::bounding_box::BoundingBox;
+///
+/// // Statistics with bounding box
+/// let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+/// let stats = GeospatialStatistics::new(Some(bbox), Some(vec![1, 2, 3]));
+/// ```
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct GeospatialStatistics {
+ /// Optional bounding box defining the spatial extent, where None represents a
lack of information.
+ bbox: Option<BoundingBox>,
+ /// Optional list of geometry type identifiers, where None represents lack
of information
+ geospatial_types: Option<Vec<i32>>,
+}
+
+impl GeospatialStatistics {
+ /// Creates a new geospatial statistics instance with the specified data.
+ pub fn new(bbox: Option<BoundingBox>, geospatial_types: Option<Vec<i32>>)
-> Self {
+ Self {
+ bbox,
+ geospatial_types,
+ }
+ }
+}
+
+/// Converts a Thrift-generated geospatial statistics object to the internal
representation.
+pub fn from_thrift(geo_statistics: Option<TGeospatialStatistics>) ->
Option<GeospatialStatistics> {
+ let geo_stats = geo_statistics?;
+ let bbox = geo_stats.bbox.map(|bbox| bbox.into());
+ // If vector is empty, then set it to None
+ let geospatial_types: Option<Vec<i32>> =
geo_stats.geospatial_types.filter(|v| !v.is_empty());
+ Some(GeospatialStatistics::new(bbox, geospatial_types))
+}
+
+/// Converts our internal geospatial statistics to the Thrift-generated format.
+pub fn to_thrift(geo_statistics: Option<&GeospatialStatistics>) ->
Option<TGeospatialStatistics> {
+ let geo_stats = geo_statistics?;
+ let bbox = geo_stats.bbox.clone().map(|bbox| bbox.into());
+ let geospatial_types = geo_stats.geospatial_types.clone();
+ Some(TGeospatialStatistics::new(bbox, geospatial_types))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ /// Tests the conversion from Thrift format when no statistics are
provided.
+ #[test]
+ fn test_from_thrift() {
+ assert_eq!(from_thrift(None), None);
+ assert_eq!(
+ from_thrift(Some(TGeospatialStatistics::new(None, None))),
+ Some(GeospatialStatistics::default())
+ );
+ }
+
+ /// Tests the conversion from Thrift format with actual geospatial data.
+ #[test]
+ fn test_geo_statistics_from_thrift() {
+ let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+ let geospatial_types = vec![1, 2, 3];
+ let stats = GeospatialStatistics::new(Some(bbox),
Some(geospatial_types));
+ let thrift_stats = to_thrift(Some(&stats));
+ assert_eq!(from_thrift(thrift_stats), Some(stats));
+ }
+
+ #[test]
+ fn test_bbox_to_thrift() {
+ use crate::format as parquet;
+ use thrift::OrderedFloat;
+
+ let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+ let thrift_bbox: parquet::BoundingBox = bbox.into();
+ assert_eq!(thrift_bbox.xmin, 0.0);
+ assert_eq!(thrift_bbox.xmax, 0.0);
+ assert_eq!(thrift_bbox.ymin, 100.0);
+ assert_eq!(thrift_bbox.ymax, 100.0);
+ assert_eq!(thrift_bbox.zmin, None);
+ assert_eq!(thrift_bbox.zmax, None);
+ assert_eq!(thrift_bbox.mmin, None);
+ assert_eq!(thrift_bbox.mmax, None);
+
+ let bbox_z = BoundingBox::new(0.0, 0.0, 100.0, 100.0).with_zrange(5.0,
15.0);
+ let thrift_bbox_z: parquet::BoundingBox = bbox_z.into();
+ assert_eq!(thrift_bbox_z.zmin, Some(OrderedFloat(5.0)));
+ assert_eq!(thrift_bbox_z.zmax, Some(OrderedFloat(15.0)));
+
+ let bbox_m = BoundingBox::new(0.0, 0.0, 100.0,
100.0).with_mrange(10.0, 20.0);
+ let thrift_bbox_m: parquet::BoundingBox = bbox_m.into();
+ assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0)));
+ assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0)));
+ }
+
+ #[test]
+ fn test_read_geospatial_statistics_from_file() {
+ use crate::file::reader::{FileReader, SerializedFileReader};
+ use std::fs::File;
+
+ let path = format!(
+ "{}/geospatial/geospatial.parquet",
+ arrow::util::test_util::parquet_test_data(),
+ );
+ let file = File::open(path).unwrap();
+ let reader = SerializedFileReader::try_from(file).unwrap();
+ let metadata = reader.metadata();
+
+ // geospatial.parquet schema:
+ // optional binary field_id=-1 group (String);
+ // optional binary field_id=-1 wkt (String);
+ // optional binary field_id=-1 geometry (Geometry(crs=));
+ let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
+ assert!(geo_statistics.is_some());
+
+ let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
+ .with_zrange(30.0, 80.0)
+ .with_mrange(200.0, 1600.0);
+ let expected_geospatial_types = vec![
+ 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
+ ];
+ assert_eq!(
+ geo_statistics.unwrap().geospatial_types,
+ Some(expected_geospatial_types)
+ );
+ assert_eq!(geo_statistics.unwrap().bbox, Some(expected_bbox));
+ }
+}
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index b1100c4bc4..446a500aaf 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -189,7 +189,6 @@ pub mod record;
pub mod schema;
pub mod thrift;
-
/// What data is needed to read the next item from a decoder.
///
/// This is used to communicate between the decoder and the caller
@@ -207,3 +206,4 @@ pub enum DecodeResult<T: Debug> {
#[cfg(feature = "variant_experimental")]
pub mod variant;
+experimental!(pub mod geospatial);
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 05df9536bf..2f6131571e 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -401,6 +401,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
(LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
(LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
(LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
+ (LogicalType::Geometry, PhysicalType::BYTE_ARRAY) => {}
+ (LogicalType::Geography, PhysicalType::BYTE_ARRAY) => {}
(LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if
self.length == 16 => {}
(LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
return Err(general_err!(