This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ba9a2509c [geo] add geospatial statistics and bbox types for parquet 
(#8225)
4ba9a2509c is described below

commit 4ba9a2509c45ea20944c383efb958a3139b63e74
Author: Kaushik Srinivasan <[email protected]>
AuthorDate: Sun Sep 28 09:22:33 2025 -0400

    [geo] add geospatial statistics and bbox types for parquet (#8225)
    
    # Which issue does this PR close?
    
    - Builds on #7799 by supporting the new struct `GeospatialStatistics`.
    
    # Rationale for this change
    
    * This is part of a draft to support geospatial types (geometry and
    geography) in Parquet.
    
    # What changes are included in this PR?
    
    - Structs for supporting geospatial statistics information (bbox and
    geospatial types) derived from thrift classes.
    - Would appreciate feedback on structure and where certain parts should
    go.
    
    # Are these changes tested?
    
    Yes
    
    # Are there any user-facing changes?
    
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
---
 parquet/src/arrow/schema/primitive.rs  |   2 +
 parquet/src/basic.rs                   |   2 +
 parquet/src/file/metadata/mod.rs       |  30 ++-
 parquet/src/geospatial/bounding_box.rs | 413 +++++++++++++++++++++++++++++++++
 parquet/src/geospatial/mod.rs          |  50 ++++
 parquet/src/geospatial/statistics.rs   | 164 +++++++++++++
 parquet/src/lib.rs                     |   2 +-
 parquet/src/schema/types.rs            |   2 +
 8 files changed, 659 insertions(+), 6 deletions(-)

diff --git a/parquet/src/arrow/schema/primitive.rs 
b/parquet/src/arrow/schema/primitive.rs
index 1b3ab7d45c..24b18b39bc 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -276,6 +276,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, 
scale: i32) -> Result<D
         (Some(LogicalType::Json), _) => Ok(DataType::Utf8),
         (Some(LogicalType::Bson), _) => Ok(DataType::Binary),
         (Some(LogicalType::Enum), _) => Ok(DataType::Binary),
+        (Some(LogicalType::Geometry), _) => Ok(DataType::Binary),
+        (Some(LogicalType::Geography), _) => Ok(DataType::Binary),
         (None, ConvertedType::NONE) => Ok(DataType::Binary),
         (None, ConvertedType::JSON) => Ok(DataType::Utf8),
         (None, ConvertedType::BSON) => Ok(DataType::Binary),
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 2cf5e46fea..6353b5f4ee 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -1201,6 +1201,8 @@ impl str::FromStr for LogicalType {
                 "Interval parquet logical type not yet supported"
             )),
             "FLOAT16" => Ok(LogicalType::Float16),
+            "GEOMETRY" => Ok(LogicalType::Geometry),
+            "GEOGRAPHY" => Ok(LogicalType::Geography),
             other => Err(general_err!("Invalid parquet logical type {}", 
other)),
         }
     }
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index e04b8c9c8e..4aa0388fd2 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -116,6 +116,7 @@ use crate::format::{
     BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, 
PageLocation, RowGroup,
     SizeStatistics, SortingColumn,
 };
+use crate::geospatial::statistics as geo_statistics;
 use crate::schema::types::{
     ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, 
SchemaDescriptor,
     Type as SchemaType,
@@ -839,6 +840,7 @@ pub struct ColumnChunkMetaData {
     index_page_offset: Option<i64>,
     dictionary_page_offset: Option<i64>,
     statistics: Option<Statistics>,
+    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
     encoding_stats: Option<Vec<PageEncodingStats>>,
     bloom_filter_offset: Option<i64>,
     bloom_filter_length: Option<i32>,
@@ -1064,6 +1066,12 @@ impl ColumnChunkMetaData {
         self.statistics.as_ref()
     }
 
+    /// Returns geospatial statistics that are set for this column chunk,
+    /// or `None` if no geospatial statistics are available.
+    pub fn geo_statistics(&self) -> 
Option<&geo_statistics::GeospatialStatistics> {
+        self.geo_statistics.as_deref()
+    }
+
     /// Returns the offset for the page encoding stats,
     /// or `None` if no page encoding stats are available.
     pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
@@ -1168,6 +1176,8 @@ impl ColumnChunkMetaData {
         let index_page_offset = col_metadata.index_page_offset;
         let dictionary_page_offset = col_metadata.dictionary_page_offset;
         let statistics = statistics::from_thrift(column_type, 
col_metadata.statistics)?;
+        let geo_statistics =
+            
geo_statistics::from_thrift(col_metadata.geospatial_statistics).map(Box::new);
         let encoding_stats = col_metadata
             .encoding_stats
             .as_ref()
@@ -1230,6 +1240,7 @@ impl ColumnChunkMetaData {
             unencoded_byte_array_data_bytes,
             repetition_level_histogram,
             definition_level_histogram,
+            geo_statistics,
             #[cfg(feature = "encryption")]
             column_crypto_metadata,
         };
@@ -1298,7 +1309,9 @@ impl ColumnChunkMetaData {
             bloom_filter_offset: self.bloom_filter_offset,
             bloom_filter_length: self.bloom_filter_length,
             size_statistics,
-            geospatial_statistics: None,
+            geospatial_statistics: geo_statistics::to_thrift(
+                self.geo_statistics.as_ref().map(|boxed| boxed.as_ref()),
+            ),
         }
     }
 
@@ -1358,6 +1371,7 @@ impl ColumnChunkMetaDataBuilder {
             index_page_offset: None,
             dictionary_page_offset: None,
             statistics: None,
+            geo_statistics: None,
             encoding_stats: None,
             bloom_filter_offset: None,
             bloom_filter_length: None,
@@ -1433,6 +1447,12 @@ impl ColumnChunkMetaDataBuilder {
         self
     }
 
+    /// Sets geospatial statistics for this column chunk.
+    pub fn set_geo_statistics(mut self, value: 
Box<geo_statistics::GeospatialStatistics>) -> Self {
+        self.0.geo_statistics = Some(value);
+        self
+    }
+
     /// Clears the statistics for this column chunk.
     pub fn clear_statistics(mut self) -> Self {
         self.0.statistics = None;
@@ -1972,9 +1992,9 @@ mod tests {
             .build();
 
         #[cfg(not(feature = "encryption"))]
-        let base_expected_size = 2312;
+        let base_expected_size = 2344;
         #[cfg(feature = "encryption")]
-        let base_expected_size = 2648;
+        let base_expected_size = 2680;
 
         assert_eq!(parquet_meta.memory_size(), base_expected_size);
 
@@ -2002,9 +2022,9 @@ mod tests {
             .build();
 
         #[cfg(not(feature = "encryption"))]
-        let bigger_expected_size = 2816;
+        let bigger_expected_size = 2848;
         #[cfg(feature = "encryption")]
-        let bigger_expected_size = 3152;
+        let bigger_expected_size = 3184;
 
         // more set fields means more memory usage
         assert!(bigger_expected_size > base_expected_size);
diff --git a/parquet/src/geospatial/bounding_box.rs 
b/parquet/src/geospatial/bounding_box.rs
new file mode 100644
index 0000000000..7d9eb58d00
--- /dev/null
+++ b/parquet/src/geospatial/bounding_box.rs
@@ -0,0 +1,413 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bounding box for GEOMETRY or GEOGRAPHY type in the representation of 
min/max
+//! value pair of coordinates from each axis.
+//!
+//! Derived from the parquet format spec: 
<https://github.com/apache/parquet-format/blob/master/Geospatial.md>
+//!
+//!
+use crate::format as parquet;
+
+/// A geospatial instance has at least two coordinate dimensions: X and Y for 
2D coordinates of each point.
+/// X represents longitude/easting and Y represents latitude/northing. A 
geospatial instance can optionally
+/// have Z and/or M values associated with each point.
+///
+/// The Z values introduce the third dimension coordinate, typically used to 
indicate height or elevation.
+///
+/// M values allow tracking a value in a fourth dimension. These can represent:
+/// - Linear reference values (e.g., highway milepost)
+/// - Timestamps
+/// - Other values defined by the CRS
+///
+/// The bounding box is defined as min/max value pairs of coordinates from 
each axis. X and Y values are
+/// always present, while Z and M are omitted for 2D geospatial instances.
+///
+/// When calculating a bounding box:
+/// - Null or NaN values in a coordinate dimension are skipped
+/// - If a dimension has only null/NaN values, that dimension is omitted
+/// - If either X or Y dimension is missing, no bounding box is produced
+/// - Example: POINT (1 NaN) contributes to X but not to Y, Z, or M dimensions
+///
+/// Special cases:
+/// - For X values only, xmin may exceed xmax. In this case, a point matches 
if x >= xmin OR x <= xmax
+/// - This wraparound can occur when the bounding box crosses the antimeridian 
line.
+/// - In geographic terms: xmin=westernmost, xmax=easternmost, 
ymin=southernmost, ymax=northernmost
+///
+/// For GEOGRAPHY types:
+/// - X values must be within [-180, 180] (longitude)
+/// - Y values must be within [-90, 90] (latitude)
+///
+/// Derived from the parquet format [spec][bounding-box-spec]
+///
+/// # Examples
+///
+/// ```
+/// use parquet::geospatial::bounding_box::BoundingBox;
+///
+/// // 2D bounding box; arguments are (xmin, xmax, ymin, ymax)
+/// let bbox_2d = BoundingBox::new(0.0, 100.0, 0.0, 100.0);
+///
+/// // 3D bounding box with elevation
+/// let bbox_3d = BoundingBox::new(0.0, 100.0, 0.0, 100.0)
+///     .with_zrange(0.0, 1000.0);
+///
+/// // Bounding box with elevation (Z) and measured value (M) ranges
+/// let bbox_3d_m = BoundingBox::new(0.0, 100.0, 0.0, 100.0)
+///     .with_zrange(0.0, 1000.0)
+///     .with_mrange(0.0, 1000.0);
+/// ```
+///
+/// [bounding-box-spec]: 
https://github.com/apache/parquet-format/blob/master/Geospatial.md#bounding-box
+#[derive(Clone, Debug, PartialEq)]
+pub struct BoundingBox {
+    /// X coordinates (longitude or easting): (min, max)
+    x_range: (f64, f64),
+    /// Y coordinates (latitude or northing): (min, max)
+    y_range: (f64, f64),
+    /// Z coordinates (elevation/height): (min, max), if present
+    z_range: Option<(f64, f64)>,
+    /// M coordinates (measured value): (min, max), if present
+    m_range: Option<(f64, f64)>,
+}
+
+impl BoundingBox {
+    /// Creates a new bounding box with the specified coordinates.
+    pub fn new(xmin: f64, xmax: f64, ymin: f64, ymax: f64) -> Self {
+        Self {
+            x_range: (xmin, xmax),
+            y_range: (ymin, ymax),
+            z_range: None,
+            m_range: None,
+        }
+    }
+
+    /// Updates the bounding box with specified X-coordinate range.
+    pub fn with_xrange(mut self, xmin: f64, xmax: f64) -> Self {
+        self.x_range = (xmin, xmax);
+        self
+    }
+
+    /// Updates the bounding box with specified Y-coordinate range.
+    pub fn with_yrange(mut self, ymin: f64, ymax: f64) -> Self {
+        self.y_range = (ymin, ymax);
+        self
+    }
+
+    /// Updates the bounding box with the specified Z-coordinate range.
+    pub fn with_zrange(mut self, zmin: f64, zmax: f64) -> Self {
+        self.z_range = Some((zmin, zmax));
+        self
+    }
+
+    /// Updates the bounding box with the specified M-coordinate range.
+    pub fn with_mrange(mut self, mmin: f64, mmax: f64) -> Self {
+        self.m_range = Some((mmin, mmax));
+        self
+    }
+
+    /// Returns the minimum x-coordinate.
+    pub fn get_xmin(&self) -> f64 {
+        self.x_range.0
+    }
+
+    /// Returns the maximum x-coordinate.
+    pub fn get_xmax(&self) -> f64 {
+        self.x_range.1
+    }
+
+    /// Returns the minimum y-coordinate.
+    pub fn get_ymin(&self) -> f64 {
+        self.y_range.0
+    }
+
+    /// Returns the maximum y-coordinate.
+    pub fn get_ymax(&self) -> f64 {
+        self.y_range.1
+    }
+
+    /// Returns the minimum z-coordinate, if present.
+    pub fn get_zmin(&self) -> Option<f64> {
+        self.z_range.map(|z| z.0)
+    }
+
+    /// Returns the maximum z-coordinate, if present.
+    pub fn get_zmax(&self) -> Option<f64> {
+        self.z_range.map(|z| z.1)
+    }
+
+    /// Returns the minimum m-value (measure), if present.
+    pub fn get_mmin(&self) -> Option<f64> {
+        self.m_range.map(|m| m.0)
+    }
+
+    /// Returns the maximum m-value (measure), if present.
+    pub fn get_mmax(&self) -> Option<f64> {
+        self.m_range.map(|m| m.1)
+    }
+
+    /// Returns `true` if both zmin and zmax are present.
+    pub fn is_z_valid(&self) -> bool {
+        self.z_range.is_some()
+    }
+
+    /// Returns `true` if both mmin and mmax are present.
+    pub fn is_m_valid(&self) -> bool {
+        self.m_range.is_some()
+    }
+}
+
+impl From<BoundingBox> for parquet::BoundingBox {
+    /// Converts our internal `BoundingBox` to the Thrift-generated format.
+    fn from(b: BoundingBox) -> parquet::BoundingBox {
+        parquet::BoundingBox {
+            xmin: b.x_range.0.into(),
+            xmax: b.x_range.1.into(),
+            ymin: b.y_range.0.into(),
+            ymax: b.y_range.1.into(),
+            zmin: b.z_range.map(|z| z.0.into()),
+            zmax: b.z_range.map(|z| z.1.into()),
+            mmin: b.m_range.map(|m| m.0.into()),
+            mmax: b.m_range.map(|m| m.1.into()),
+        }
+    }
+}
+
+impl From<parquet::BoundingBox> for BoundingBox {
+    fn from(bbox: parquet::BoundingBox) -> Self {
+        let mut new_bbox = Self::new(
+            bbox.xmin.into(),
+            bbox.xmax.into(),
+            bbox.ymin.into(),
+            bbox.ymax.into(),
+        );
+
+        new_bbox = match (bbox.zmin, bbox.zmax) {
+            (Some(zmin), Some(zmax)) => new_bbox.with_zrange(zmin.into(), 
zmax.into()),
+            // If either None or mismatch, leave it as None and don't error
+            _ => new_bbox,
+        };
+
+        new_bbox = match (bbox.mmin, bbox.mmax) {
+            (Some(mmin), Some(mmax)) => new_bbox.with_mrange(mmin.into(), 
mmax.into()),
+            // If either None or mismatch, leave it as None and don't error
+            _ => new_bbox,
+        };
+
+        new_bbox
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bounding_box() {
+        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
+        assert_eq!(bbox.get_xmin(), 0.0);
+        assert_eq!(bbox.get_xmax(), 0.0);
+        assert_eq!(bbox.get_ymin(), 10.0);
+        assert_eq!(bbox.get_ymax(), 10.0);
+        assert_eq!(bbox.get_zmin(), None);
+        assert_eq!(bbox.get_zmax(), None);
+        assert_eq!(bbox.get_mmin(), None);
+        assert_eq!(bbox.get_mmax(), None);
+        assert!(!bbox.is_z_valid());
+        assert!(!bbox.is_m_valid());
+
+        // test with zrange
+        let bbox_z = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_zrange(5.0, 
15.0);
+        assert_eq!(bbox_z.get_zmin(), Some(5.0));
+        assert_eq!(bbox_z.get_zmax(), Some(15.0));
+        assert!(bbox_z.is_z_valid());
+        assert!(!bbox_z.is_m_valid());
+
+        // test with mrange
+        let bbox_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_mrange(10.0, 
20.0);
+        assert_eq!(bbox_m.get_mmin(), Some(10.0));
+        assert_eq!(bbox_m.get_mmax(), Some(20.0));
+        assert!(!bbox_m.is_z_valid());
+        assert!(bbox_m.is_m_valid());
+
+        // test with zrange and mrange
+        let bbox_zm = BoundingBox::new(0.0, 0.0, 10.0, 10.0)
+            .with_zrange(5.0, 15.0)
+            .with_mrange(10.0, 20.0);
+        assert_eq!(bbox_zm.get_zmin(), Some(5.0));
+        assert_eq!(bbox_zm.get_zmax(), Some(15.0));
+        assert_eq!(bbox_zm.get_mmin(), Some(10.0));
+        assert_eq!(bbox_zm.get_mmax(), Some(20.0));
+        assert!(bbox_zm.is_z_valid());
+        assert!(bbox_zm.is_m_valid());
+    }
+
+    #[test]
+    fn test_bounding_box_to_thrift() {
+        use thrift::OrderedFloat;
+
+        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
+        let thrift_bbox: parquet::BoundingBox = bbox.into();
+        assert_eq!(thrift_bbox.xmin, 0.0);
+        assert_eq!(thrift_bbox.xmax, 0.0);
+        assert_eq!(thrift_bbox.ymin, 10.0);
+        assert_eq!(thrift_bbox.ymax, 10.0);
+        assert_eq!(thrift_bbox.zmin, None);
+        assert_eq!(thrift_bbox.zmax, None);
+        assert_eq!(thrift_bbox.mmin, None);
+        assert_eq!(thrift_bbox.mmax, None);
+
+        let bbox_z = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_zrange(5.0, 
15.0);
+        let thrift_bbox_z: parquet::BoundingBox = bbox_z.into();
+        assert_eq!(thrift_bbox_z.zmin, Some(OrderedFloat(5.0)));
+        assert_eq!(thrift_bbox_z.zmax, Some(OrderedFloat(15.0)));
+        assert_eq!(thrift_bbox_z.mmin, None);
+        assert_eq!(thrift_bbox_z.mmax, None);
+
+        let bbox_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0).with_mrange(10.0, 
20.0);
+        let thrift_bbox_m: parquet::BoundingBox = bbox_m.into();
+        assert_eq!(thrift_bbox_m.zmin, None);
+        assert_eq!(thrift_bbox_m.zmax, None);
+        assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0)));
+        assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0)));
+
+        let bbox_z_m = BoundingBox::new(0.0, 0.0, 10.0, 10.0)
+            .with_zrange(5.0, 15.0)
+            .with_mrange(10.0, 20.0);
+        let thrift_bbox_zm: parquet::BoundingBox = bbox_z_m.into();
+        assert_eq!(thrift_bbox_zm.zmin, Some(OrderedFloat(5.0)));
+        assert_eq!(thrift_bbox_zm.zmax, Some(OrderedFloat(15.0)));
+        assert_eq!(thrift_bbox_zm.mmin, Some(OrderedFloat(10.0)));
+        assert_eq!(thrift_bbox_zm.mmax, Some(OrderedFloat(20.0)));
+    }
+
+    #[test]
+    fn test_bounding_box_from_thrift() {
+        use thrift::OrderedFloat;
+
+        let thrift_bbox = parquet::BoundingBox {
+            xmin: OrderedFloat(0.0),
+            xmax: OrderedFloat(0.0),
+            ymin: OrderedFloat(10.0),
+            ymax: OrderedFloat(10.0),
+            zmin: None,
+            zmax: None,
+            mmin: None,
+            mmax: None,
+        };
+        let bbox: BoundingBox = thrift_bbox.into();
+        assert_eq!(bbox.get_xmin(), 0.0);
+        assert_eq!(bbox.get_xmax(), 0.0);
+        assert_eq!(bbox.get_ymin(), 10.0);
+        assert_eq!(bbox.get_ymax(), 10.0);
+        assert_eq!(bbox.get_zmin(), None);
+        assert_eq!(bbox.get_zmax(), None);
+        assert_eq!(bbox.get_mmin(), None);
+        assert_eq!(bbox.get_mmax(), None);
+
+        let thrift_bbox_z = parquet::BoundingBox {
+            xmin: OrderedFloat(0.0),
+            xmax: OrderedFloat(0.0),
+            ymin: OrderedFloat(10.0),
+            ymax: OrderedFloat(10.0),
+            zmin: Some(OrderedFloat(130.0)),
+            zmax: Some(OrderedFloat(130.0)),
+            mmin: None,
+            mmax: None,
+        };
+        let bbox_z: BoundingBox = thrift_bbox_z.into();
+        assert_eq!(bbox_z.get_xmin(), 0.0);
+        assert_eq!(bbox_z.get_xmax(), 0.0);
+        assert_eq!(bbox_z.get_ymin(), 10.0);
+        assert_eq!(bbox_z.get_ymax(), 10.0);
+        assert_eq!(bbox_z.get_zmin(), Some(130.0));
+        assert_eq!(bbox_z.get_zmax(), Some(130.0));
+        assert_eq!(bbox_z.get_mmin(), None);
+        assert_eq!(bbox_z.get_mmax(), None);
+
+        let thrift_bbox_m = parquet::BoundingBox {
+            xmin: OrderedFloat(0.0),
+            xmax: OrderedFloat(0.0),
+            ymin: OrderedFloat(10.0),
+            ymax: OrderedFloat(10.0),
+            zmin: None,
+            zmax: None,
+            mmin: Some(OrderedFloat(120.0)),
+            mmax: Some(OrderedFloat(120.0)),
+        };
+        let bbox_m: BoundingBox = thrift_bbox_m.into();
+        assert_eq!(bbox_m.get_xmin(), 0.0);
+        assert_eq!(bbox_m.get_xmax(), 0.0);
+        assert_eq!(bbox_m.get_ymin(), 10.0);
+        assert_eq!(bbox_m.get_ymax(), 10.0);
+        assert_eq!(bbox_m.get_zmin(), None);
+        assert_eq!(bbox_m.get_zmax(), None);
+        assert_eq!(bbox_m.get_mmin(), Some(120.0));
+        assert_eq!(bbox_m.get_mmax(), Some(120.0));
+
+        let thrift_bbox_zm = parquet::BoundingBox {
+            xmin: OrderedFloat(0.0),
+            xmax: OrderedFloat(0.0),
+            ymin: OrderedFloat(10.0),
+            ymax: OrderedFloat(10.0),
+            zmin: Some(OrderedFloat(130.0)),
+            zmax: Some(OrderedFloat(130.0)),
+            mmin: Some(OrderedFloat(120.0)),
+            mmax: Some(OrderedFloat(120.0)),
+        };
+
+        let bbox_zm: BoundingBox = thrift_bbox_zm.into();
+        assert_eq!(bbox_zm.get_xmin(), 0.0);
+        assert_eq!(bbox_zm.get_xmax(), 0.0);
+        assert_eq!(bbox_zm.get_ymin(), 10.0);
+        assert_eq!(bbox_zm.get_ymax(), 10.0);
+        assert_eq!(bbox_zm.get_zmin(), Some(130.0));
+        assert_eq!(bbox_zm.get_zmax(), Some(130.0));
+        assert_eq!(bbox_zm.get_mmin(), Some(120.0));
+        assert_eq!(bbox_zm.get_mmax(), Some(120.0));
+    }
+
+    #[test]
+    fn test_bounding_box_thrift_roundtrip() {
+        use thrift::OrderedFloat;
+
+        let thrift_bbox = parquet::BoundingBox {
+            xmin: OrderedFloat(0.0),
+            xmax: OrderedFloat(0.0),
+            ymin: OrderedFloat(10.0),
+            ymax: OrderedFloat(10.0),
+            zmin: Some(OrderedFloat(130.0)),
+            zmax: Some(OrderedFloat(130.0)),
+            mmin: Some(OrderedFloat(120.0)),
+            mmax: Some(OrderedFloat(120.0)),
+        };
+
+        // cloning to make sure it's not moved
+        let bbox: BoundingBox = thrift_bbox.clone().into();
+        assert_eq!(bbox.get_xmin(), 0.0);
+        assert_eq!(bbox.get_xmax(), 0.0);
+        assert_eq!(bbox.get_ymin(), 10.0);
+        assert_eq!(bbox.get_ymax(), 10.0);
+        assert_eq!(bbox.get_zmin(), Some(130.0));
+        assert_eq!(bbox.get_zmax(), Some(130.0));
+        assert_eq!(bbox.get_mmin(), Some(120.0));
+        assert_eq!(bbox.get_mmax(), Some(120.0));
+
+        let thrift_bbox_2: parquet::BoundingBox = bbox.into();
+        assert_eq!(thrift_bbox_2, thrift_bbox);
+    }
+}
diff --git a/parquet/src/geospatial/mod.rs b/parquet/src/geospatial/mod.rs
new file mode 100644
index 0000000000..b6864cf8bd
--- /dev/null
+++ b/parquet/src/geospatial/mod.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This module provides functionality for working with geospatial data in 
Parquet files as defined in the [spec][parquet-geo-spec].
+//!
+//! * [`GeospatialStatistics`]: describes the geospatial statistics for a 
Parquet column.
+//! * [`BoundingBox`]: describes the bounding box values for a geospatial 
column.
+//!
+//! [`GeospatialStatistics`] describes the geospatial statistics for a Parquet 
column.
+//! * bbox: the [`BoundingBox`] for the geospatial data
+//! * geospatial_types: the geospatial types for the geospatial data as 
specified in [specification][geo-types].
+//!
+//! Geospatial bounding box describes the spatial extent of the geospatial 
data within a Parquet row group.
+//! * xmin, xmax: the minimum and maximum X (longitude/easting) values
+//! * ymin, ymax: the minimum and maximum Y (latitude/northing) values
+//! * zmin, zmax: (optional) the minimum and maximum elevation values
+//! * mmin, mmax: (optional) the minimum and maximum M (measure, e.g. linear reference) values
+//!
+//! In 2D representation, where x are points:
+//! ```text      
+//!  ymax +-----------------------+
+//!       |               x       |
+//!       |      x                |
+//!       |              x        |
+//!       |      x                |
+//!  ymin +-----------------------+
+//!       xmin                    xmax
+//! ```
+//!
+//! [`GeospatialStatistics`]: 
crate::geospatial::statistics::GeospatialStatistics
+//! [`BoundingBox`]: crate::geospatial::bounding_box::BoundingBox
+//! [parquet-geo-spec]: 
https://github.com/apache/parquet-format/blob/master/Geospatial.md
+//! [geo-types]: 
https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types
+
+pub mod bounding_box;
+pub mod statistics;
diff --git a/parquet/src/geospatial/statistics.rs 
b/parquet/src/geospatial/statistics.rs
new file mode 100644
index 0000000000..2a39c494bd
--- /dev/null
+++ b/parquet/src/geospatial/statistics.rs
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Geospatial statistics for Parquet files.
+//!
+//! This module provides functionality for working with geospatial statistics 
in Parquet files.
+//! It includes support for bounding boxes and geospatial statistics in column 
chunk metadata.
+
+use crate::format::GeospatialStatistics as TGeospatialStatistics;
+use crate::geospatial::bounding_box::BoundingBox;
+
+// ----------------------------------------------------------------------
+// Geospatial Statistics
+
+/// Represents geospatial statistics for a Parquet column or dataset.
+///
+/// This struct contains metadata about the spatial characteristics of 
geospatial data,
+/// including bounding box information and the types of geospatial geometries 
present.
+/// It's used to optimize spatial queries and provide spatial context for data 
analysis.
+///
+/// # Examples
+///
+/// ```
+/// use parquet::geospatial::statistics::GeospatialStatistics;
+/// use parquet::geospatial::bounding_box::BoundingBox;
+///
+/// // Statistics with a bounding box given as (xmin, xmax, ymin, ymax)
+/// let bbox = BoundingBox::new(0.0, 100.0, 0.0, 100.0);
+/// let stats = GeospatialStatistics::new(Some(bbox), Some(vec![1, 2, 3]));
+/// ```
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct GeospatialStatistics {
+    /// Optional bounding box defining the spatial extent, where None represents a 
lack of information.
+    bbox: Option<BoundingBox>,
+    /// Optional list of geometry type identifiers, where None represents lack 
of information
+    geospatial_types: Option<Vec<i32>>,
+}
+
+impl GeospatialStatistics {
+    /// Creates a new geospatial statistics instance with the specified data.
+    pub fn new(bbox: Option<BoundingBox>, geospatial_types: Option<Vec<i32>>) 
-> Self {
+        Self {
+            bbox,
+            geospatial_types,
+        }
+    }
+}
+
+/// Converts a Thrift-generated geospatial statistics object to the internal 
representation.
+pub fn from_thrift(geo_statistics: Option<TGeospatialStatistics>) -> 
Option<GeospatialStatistics> {
+    let geo_stats = geo_statistics?;
+    let bbox = geo_stats.bbox.map(|bbox| bbox.into());
+    // If vector is empty, then set it to None
+    let geospatial_types: Option<Vec<i32>> = 
geo_stats.geospatial_types.filter(|v| !v.is_empty());
+    Some(GeospatialStatistics::new(bbox, geospatial_types))
+}
+
+/// Converts our internal geospatial statistics to the Thrift-generated format.
+pub fn to_thrift(geo_statistics: Option<&GeospatialStatistics>) -> 
Option<TGeospatialStatistics> {
+    let geo_stats = geo_statistics?;
+    let bbox = geo_stats.bbox.clone().map(|bbox| bbox.into());
+    let geospatial_types = geo_stats.geospatial_types.clone();
+    Some(TGeospatialStatistics::new(bbox, geospatial_types))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Tests the conversion from Thrift format when no statistics are 
provided.
+    #[test]
+    fn test_from_thrift() {
+        assert_eq!(from_thrift(None), None);
+        assert_eq!(
+            from_thrift(Some(TGeospatialStatistics::new(None, None))),
+            Some(GeospatialStatistics::default())
+        );
+    }
+
+    /// Tests a round trip through the Thrift format (`to_thrift` then `from_thrift`) with actual geospatial data.
+    #[test]
+    fn test_geo_statistics_from_thrift() {
+        let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+        let geospatial_types = vec![1, 2, 3];
+        let stats = GeospatialStatistics::new(Some(bbox), 
Some(geospatial_types));
+        let thrift_stats = to_thrift(Some(&stats));
+        assert_eq!(from_thrift(thrift_stats), Some(stats));
+    }
+
+    #[test]
+    fn test_bbox_to_thrift() {
+        use crate::format as parquet;
+        use thrift::OrderedFloat;
+
+        let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0);
+        let thrift_bbox: parquet::BoundingBox = bbox.into();
+        assert_eq!(thrift_bbox.xmin, 0.0);
+        assert_eq!(thrift_bbox.xmax, 0.0);
+        assert_eq!(thrift_bbox.ymin, 100.0);
+        assert_eq!(thrift_bbox.ymax, 100.0);
+        assert_eq!(thrift_bbox.zmin, None);
+        assert_eq!(thrift_bbox.zmax, None);
+        assert_eq!(thrift_bbox.mmin, None);
+        assert_eq!(thrift_bbox.mmax, None);
+
+        let bbox_z = BoundingBox::new(0.0, 0.0, 100.0, 100.0).with_zrange(5.0, 
15.0);
+        let thrift_bbox_z: parquet::BoundingBox = bbox_z.into();
+        assert_eq!(thrift_bbox_z.zmin, Some(OrderedFloat(5.0)));
+        assert_eq!(thrift_bbox_z.zmax, Some(OrderedFloat(15.0)));
+
+        let bbox_m = BoundingBox::new(0.0, 0.0, 100.0, 
100.0).with_mrange(10.0, 20.0);
+        let thrift_bbox_m: parquet::BoundingBox = bbox_m.into();
+        assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0)));
+        assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0)));
+    }
+
+    #[test]
+    fn test_read_geospatial_statistics_from_file() {
+        use crate::file::reader::{FileReader, SerializedFileReader};
+        use std::fs::File;
+
+        let path = format!(
+            "{}/geospatial/geospatial.parquet",
+            arrow::util::test_util::parquet_test_data(),
+        );
+        let file = File::open(path).unwrap();
+        let reader = SerializedFileReader::try_from(file).unwrap();
+        let metadata = reader.metadata();
+
+        // geospatial.parquet schema:
+        //    optional binary field_id=-1 group (String);
+        //    optional binary field_id=-1 wkt (String);
+        //    optional binary field_id=-1 geometry (Geometry(crs=));
+        let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
+        assert!(geo_statistics.is_some());
+
+        let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
+            .with_zrange(30.0, 80.0)
+            .with_mrange(200.0, 1600.0);
+        let expected_geospatial_types = vec![
+            1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 
2001, 2002, 2003, 2004,
+            2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
+        ];
+        assert_eq!(
+            geo_statistics.unwrap().geospatial_types,
+            Some(expected_geospatial_types)
+        );
+        assert_eq!(geo_statistics.unwrap().bbox, Some(expected_bbox));
+    }
+}
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index b1100c4bc4..446a500aaf 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -189,7 +189,6 @@ pub mod record;
 pub mod schema;
 
 pub mod thrift;
-
 /// What data is needed to read the next item from a decoder.
 ///
 /// This is used to communicate between the decoder and the caller
@@ -207,3 +206,4 @@ pub enum DecodeResult<T: Debug> {
 
 #[cfg(feature = "variant_experimental")]
 pub mod variant;
+experimental!(pub mod geospatial);
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 05df9536bf..2f6131571e 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -401,6 +401,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
                 (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
                 (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
                 (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
+                (LogicalType::Geometry, PhysicalType::BYTE_ARRAY) => {}
+                (LogicalType::Geography, PhysicalType::BYTE_ARRAY) => {}
                 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if 
self.length == 16 => {}
                 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
                     return Err(general_err!(

Reply via email to