This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d4d79be389 Add tests and fix corner cases for Parquet/GeoArrow 
extension type conversion (#10065)
d4d79be389 is described below

commit d4d79be3896085cc082881b54f92175798e5ab53
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Jun 17 07:36:35 2026 -0500

    Add tests and fix corner cases for Parquet/GeoArrow extension type 
conversion (#10065)
    
    # Which issue does this PR close?
    
    
    - Closes #9929.
    
    # Rationale for this change
    
    There were several issues with conversion identified when I tried to
    integrate this in SedonaDB and that came to light when the spec was
    recently clarified.
    
    I am sorry for missing these changes when I reviewed the initial
    implementation.
    
    # What changes are included in this PR?
    
    - A Parquet crs of `None` for geometry or geography is now converted to
    a GeoArrow CRS of `"OGC:CRS84"` (the named value for the default CRS in
    the Parquet spec)
    - A Parquet crs of `"srid:0"` is now converted to a GeoArrow "omitted"
    CRS. This was recently clarified in the Parquet spec (srid:0 is a named
    example in the list of allowed values).
    - A GeoArrow missing CRS is now encoded as `"srid:0"`
    - A GeoArrow CRS that is "lonlat-like" is now encoded as a Parquet crs
    of `None`. This logic was included in the previous implementation but
    was reversed (Parquet CRSes that looked like lonlat were omitted when
    written to GeoArrow, which is not correct).
    - The GeoArrow metadata struct uses the name `"algorithm"` and was
    serializing it to JSON. The GeoArrow spec uses the `"edges"` key. This
    led to invalid metadata being generated which was either rejected or
    incorrectly interpreted by consumers.
    
    # Are these changes tested?
    
    Yes. I added high-level end-to-end LogicalType <-> extension metadata
    tests, since that is what matters (there were a few lower level tests
    that I updated as well).
---
 parquet-geospatial/src/types.rs       |  74 ++++-------
 parquet/src/arrow/schema/extension.rs |  55 +++++++--
 parquet/tests/geospatial.rs           | 223 +++++++++++++++++++++++++++++++++-
 3 files changed, 291 insertions(+), 61 deletions(-)

diff --git a/parquet-geospatial/src/types.rs b/parquet-geospatial/src/types.rs
index f19911ad05..b19e9103fc 100644
--- a/parquet-geospatial/src/types.rs
+++ b/parquet-geospatial/src/types.rs
@@ -57,12 +57,10 @@ pub struct Metadata {
     /// The Coordinate Reference System (CRS) of the [`WkbType`], if present.
     ///
     /// This may be a raw string value (e.g., "EPSG:3857") or a JSON object 
(e.g., PROJJSON).
-    /// Note: Common lon/lat CRS representations (EPSG:4326, OGC:CRS84) are 
canonicalized
-    /// to `None` during serialization to match Parquet conventions.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub crs: Option<serde_json::Value>,
     /// The edge interpolation algorithm of the [`WkbType`], if present.
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(rename = "edges", skip_serializing_if = "Option::is_none")]
     pub algorithm: Option<Edges>,
 }
 
@@ -88,8 +86,10 @@ impl Metadata {
         }
     }
 
-    /// Detect if the CRS is a common representation of lon/lat on the 
standard WGS84 ellipsoid
-    fn crs_is_lon_lat(&self) -> bool {
+    /// Detect if the CRS is a common representation of lon/lat on the 
standard WGS84 ellipsoid.
+    ///
+    /// Returns `true` for OGC:CRS84, EPSG:4326, and PROJJSON representations 
thereof.
+    pub fn crs_is_lon_lat(&self) -> bool {
         use serde_json::Value;
 
         let Some(crs) = &self.crs else {
@@ -144,16 +144,7 @@ impl ExtensionType for WkbType {
     }
 
     fn serialize_metadata(&self) -> Option<String> {
-        let md = if self.0.crs_is_lon_lat() {
-            &Metadata {
-                crs: None, // lon/lat CRS is canonicalized as omitted (None) 
for Parquet
-                algorithm: self.0.algorithm,
-            }
-        } else {
-            &self.0
-        };
-
-        serde_json::to_string(md).ok()
+        serde_json::to_string(&self.0).ok()
     }
 
     fn deserialize_metadata(metadata: Option<&str>) -> 
ArrowResult<Self::Metadata> {
@@ -251,7 +242,7 @@ mod tests {
         let wkb = WkbType::new(Some(metadata));
 
         let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+        assert_eq!(serialized, r#"{"edges":"spherical"}"#);
 
         let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
         assert!(deserialized.crs.is_none());
@@ -267,7 +258,7 @@ mod tests {
         let wkb = WkbType::new(Some(metadata));
 
         let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, 
r#"{"crs":"srid:1234","algorithm":"spherical"}"#);
+        assert_eq!(serialized, r#"{"crs":"srid:1234","edges":"spherical"}"#);
 
         let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
         assert_eq!(
@@ -353,54 +344,43 @@ mod tests {
         Ok(())
     }
 
-    /// Test CRS canonicalization logic for common lon/lat representations
+    /// Test crs_is_lon_lat() detection for common lon/lat representations
     #[test]
-    fn test_crs_canonicalization() -> ArrowResult<()> {
-        // EPSG:4326 as string should be omitted
+    fn test_crs_is_lon_lat() -> ArrowResult<()> {
+        // EPSG:4326 as string should be detected
         let metadata = Metadata::new(Some("EPSG:4326"), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, "{}");
+        assert!(metadata.crs_is_lon_lat());
 
-        // OGC:CRS84 as string should be omitted
+        // OGC:CRS84 as string should be detected
         let metadata = Metadata::new(Some("OGC:CRS84"), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, "{}");
+        assert!(metadata.crs_is_lon_lat());
 
-        // A JSON object that reasonably looks like PROJJSON for EPSG:4326 
should be omitted
+        // A JSON object that reasonably looks like PROJJSON for EPSG:4326 
should be detected
         // detect "4326" as a string
         let crs_json = r#"{"id":{"authority":"EPSG","code":"4326"}}"#;
         let metadata = Metadata::new(Some(crs_json), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, "{}");
+        assert!(metadata.crs_is_lon_lat());
 
         // detect 4326 as a number
         let crs_json = r#"{"id":{"authority":"EPSG","code":4326}}"#;
         let metadata = Metadata::new(Some(crs_json), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, "{}");
+        assert!(metadata.crs_is_lon_lat());
 
-        // A JSON object that reasonably looks like PROJJSON for OGC:CRS84 
should be omitted
+        // A JSON object that reasonably looks like PROJJSON for OGC:CRS84 
should be detected
         let crs_json = r#"{"id":{"authority":"OGC","code":"CRS84"}}"#;
         let metadata = Metadata::new(Some(crs_json), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, "{}");
+        assert!(metadata.crs_is_lon_lat());
 
-        // Other input types should be preserved
+        // Other CRS values should NOT be detected as lon/lat
         let metadata = Metadata::new(Some("srid:1234"), None);
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, r#"{"crs":"srid:1234"}"#);
+        assert!(!metadata.crs_is_lon_lat());
 
-        // Canonicalization should work with algorithm field
-        let metadata = Metadata::new(Some("EPSG:4326"), 
Some(Edges::Spherical));
-        let wkb = WkbType::new(Some(metadata));
-        let serialized = wkb.serialize_metadata().unwrap();
-        assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+        let metadata = Metadata::new(Some("EPSG:3857"), None);
+        assert!(!metadata.crs_is_lon_lat());
+
+        // None CRS should NOT be detected as lon/lat
+        let metadata = Metadata::new(None, None);
+        assert!(!metadata.crs_is_lon_lat());
 
         Ok(())
     }
diff --git a/parquet/src/arrow/schema/extension.rs 
b/parquet/src/arrow/schema/extension.rs
index 353770ddbd..4e1ec75084 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -67,7 +67,13 @@ pub(crate) fn try_add_extension_type(
         }
         #[cfg(feature = "geospatial")]
         LogicalType::Geometry(geometry) => {
-            let md = 
parquet_geospatial::WkbMetadata::new(geometry.crs.as_deref(), None);
+            // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0 
means unset CRS
+            let crs = match geometry.crs.as_deref() {
+                None => Some("OGC:CRS84"),
+                Some("srid:0") => None,
+                Some(crs) => Some(crs),
+            };
+            let md = parquet_geospatial::WkbMetadata::new(crs, None);
             let mut arrow_field = arrow_field;
             
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
             arrow_field
@@ -78,7 +84,13 @@ pub(crate) fn try_add_extension_type(
                 .algorithm()
                 .map(|a| a.try_as_edges())
                 .transpose()?;
-            let md = 
parquet_geospatial::WkbMetadata::new(geography.crs.as_deref(), algorithm);
+            // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0 
means unset CRS
+            let crs = match geography.crs.as_deref() {
+                None => Some("OGC:CRS84"),
+                Some("srid:0") => None,
+                Some(crs) => Some(crs),
+            };
+            let md = parquet_geospatial::WkbMetadata::new(crs, algorithm);
             let mut arrow_field = arrow_field;
             
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
             arrow_field
@@ -167,15 +179,36 @@ pub(crate) fn logical_type_for_binary(field: &Field) -> 
Option<LogicalType> {
 
     match field.extension_type_name() {
         Some(n) if n == WkbType::NAME => match 
field.try_extension_type::<WkbType>() {
-            Ok(wkb_type) => match wkb_type.metadata().type_hint() {
-                WkbTypeHint::Geometry => Some(LogicalType::geometry(
-                    wkb_type.metadata().crs.as_ref().map(|c| c.to_string()),
-                )),
-                WkbTypeHint::Geography => Some(LogicalType::geography(
-                    wkb_type.metadata().crs.as_ref().map(|c| c.to_string()),
-                    wkb_type.metadata().algorithm.map(|a| a.into()),
-                )),
-            },
+            Ok(wkb_type) => {
+                // Convert Arrow CRS to Parquet CRS:
+                // - None → "srid:0" (unset CRS in Parquet)
+                // - lon/lat CRS (OGC:CRS84, EPSG:4326) → None (default in 
Parquet)
+                // - Other CRS → JSON string
+                let crs = match &wkb_type.metadata().crs {
+                    None => Some("srid:0".to_string()),
+                    Some(_) if wkb_type.metadata().crs_is_lon_lat() => None,
+                    // For string values, use the raw string; for objects, use 
JSON representation
+                    Some(c) => Some(
+                        c.as_str()
+                            .map(|s| s.to_string())
+                            .unwrap_or_else(|| c.to_string()),
+                    ),
+                };
+                // Convert Arrow edges to Parquet algorithm:
+                // - Spherical → None (default for Geography)
+                // - Other algorithms → Some(algorithm)
+                let algorithm = wkb_type.metadata().algorithm.and_then(|a| {
+                    use parquet_geospatial::WkbEdges;
+                    match a {
+                        WkbEdges::Spherical => None, // spherical is the 
default
+                        _ => Some(a.into()),
+                    }
+                });
+                match wkb_type.metadata().type_hint() {
+                    WkbTypeHint::Geometry => Some(LogicalType::geometry(crs)),
+                    WkbTypeHint::Geography => Some(LogicalType::geography(crs, 
algorithm)),
+                }
+            }
             Err(_e) => None,
         },
         _ => None,
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
index bf34528d03..388b5c003d 100644
--- a/parquet/tests/geospatial.rs
+++ b/parquet/tests/geospatial.rs
@@ -30,7 +30,7 @@ mod test {
             ArrowSchemaConverter, ArrowWriter, 
arrow_reader::ParquetRecordBatchReaderBuilder,
             arrow_writer::ArrowWriterOptions,
         },
-        basic::LogicalType,
+        basic::{EdgeInterpolationAlgorithm, LogicalType},
         column::reader::ColumnReader,
         data_type::{ByteArray, ByteArrayType},
         file::{
@@ -63,7 +63,7 @@ mod test {
             (
                 "crs-default.parquet",
                 LogicalType::geometry(None),
-                WkbMetadata::new(None, None),
+                WkbMetadata::new(Some("OGC:CRS84"), None), // omitted CRS 
defaults to OGC:CRS84
             ),
             (
                 "crs-srid.parquet",
@@ -78,7 +78,7 @@ mod test {
             (
                 "crs-geography.parquet",
                 LogicalType::geography(None, None),
-                WkbMetadata::new(None, Some(WkbEdges::Spherical)),
+                WkbMetadata::new(Some("OGC:CRS84"), 
Some(WkbEdges::Spherical)), // omitted CRS defaults to OGC:CRS84
             ),
         ];
 
@@ -425,4 +425,221 @@ mod test {
         );
         Arc::new(array)
     }
+
+    #[test]
+    fn test_logical_type_to_field_conversion() {
+        use parquet::arrow::parquet_to_arrow_schema;
+        use parquet::basic::Type as PhysicalType;
+        use parquet::schema::types::{SchemaDescriptor, Type};
+
+        // Test cases: (LogicalType, expected metadata JSON)
+        let test_cases = [
+            // Geometry with default CRS (defaults to OGC:CRS84 per Parquet 
spec)
+            (LogicalType::geometry(None), r#"{"crs":"OGC:CRS84"}"#),
+            // Geometry with srid:0 should result in an unset (omitted) CRS
+            (LogicalType::geometry(Some("srid:0".to_string())), r#"{}"#),
+            // Geometry with custom CRSes (authority:code and partial projjson)
+            (
+                LogicalType::geometry(Some("EPSG:4267".to_string())),
+                r#"{"crs":"EPSG:4267"}"#,
+            ),
+            (
+                LogicalType::geometry(Some(
+                    r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string(),
+                )),
+                r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#,
+            ),
+            // Geography with default CRS (default OGC:CRS84, spherical edges)
+            (
+                LogicalType::geography(None, None),
+                r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+            ),
+            // Geography with explicit edges
+            (
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::SPHERICAL)),
+                r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+            ),
+            (
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::KARNEY)),
+                r#"{"crs":"OGC:CRS84","edges":"karney"}"#,
+            ),
+            (
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::VINCENTY)),
+                r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#,
+            ),
+            (
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::ANDOYER)),
+                r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#,
+            ),
+            (
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::THOMAS)),
+                r#"{"crs":"OGC:CRS84","edges":"thomas"}"#,
+            ),
+            // Geometry with srid:0 should result in an unset (omitted) CRS
+            // and spherical edges
+            (
+                LogicalType::geography(Some("srid:0".to_string()), None),
+                r#"{"edges":"spherical"}"#,
+            ),
+            // Geography with custom CRSes (authority:code and partial 
projjson)
+            (
+                LogicalType::geography(Some("EPSG:4267".to_string()), None),
+                r#"{"crs":"EPSG:4267","edges":"spherical"}"#,
+            ),
+            (
+                LogicalType::geography(
+                    
Some(r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string()),
+                    None,
+                ),
+                
r#"{"crs":{"id":{"authority":"EPSG","code":4326}},"edges":"spherical"}"#,
+            ),
+        ];
+
+        for (logical_type, expected_metadata) in test_cases {
+            // Build a Parquet schema with the given LogicalType
+            let parquet_schema = SchemaDescriptor::new(Arc::new(
+                Type::group_type_builder("schema")
+                    .with_fields(vec![Arc::new(
+                        Type::primitive_type_builder("geom", 
PhysicalType::BYTE_ARRAY)
+                            .with_logical_type(Some(logical_type.clone()))
+                            .build()
+                            .unwrap(),
+                    )])
+                    .build()
+                    .unwrap(),
+            ));
+
+            // Convert to Arrow schema
+            let arrow_schema = parquet_to_arrow_schema(&parquet_schema, 
None).unwrap();
+            let field = arrow_schema.field(0);
+
+            // Check extension type name
+            let ext_name = field.metadata().get("ARROW:extension:name");
+            assert_eq!(
+                ext_name,
+                Some(&"geoarrow.wkb".to_string()),
+                "Extension name mismatch for {logical_type:?}"
+            );
+
+            // Check extension metadata
+            let ext_metadata = 
field.metadata().get("ARROW:extension:metadata");
+            assert_eq!(
+                ext_metadata,
+                Some(&expected_metadata.to_string()),
+                "Extension metadata mismatch for {logical_type:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_field_to_logical_type_conversion() {
+        use std::collections::HashMap;
+
+        // Test cases: (extension metadata JSON, expected LogicalType)
+        let test_cases = [
+            // Geometry with no CRS should be GEOMETRY(srid:0)
+            (r#"{}"#, LogicalType::geometry(Some("srid:0".to_string()))),
+            // Geometry with string CRS
+            (
+                r#"{"crs":"EPSG:4267"}"#,
+                LogicalType::geometry(Some("EPSG:4267".to_string())),
+            ),
+            // Geometry with PROJJSON CRS
+            (
+                r#"{"crs":{"id":{"authority":"EPSG","code":3857}}}"#,
+                LogicalType::geometry(Some(
+                    r#"{"id":{"authority":"EPSG","code":3857}}"#.to_string(),
+                )),
+            ),
+            // Geometry with lon/lat CRSes (canonically removed because 
lon/lat is the
+            // default Parquet CRS)
+            (r#"{"crs":"OGC:CRS84"}"#, LogicalType::geometry(None)),
+            (r#"{"crs":"EPSG:4326"}"#, LogicalType::geometry(None)),
+            (
+                r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#,
+                LogicalType::geometry(None),
+            ),
+            (
+                r#"{"crs":{"id":{"authority":"EPSG","code":"4326"}}}"#,
+                LogicalType::geometry(None),
+            ),
+            (
+                r#"{"crs":{"id":{"authority":"OGC","code":"CRS84"}}}"#,
+                LogicalType::geometry(None),
+            ),
+            // Geography with no CRS, spherical edges
+            (
+                r#"{"edges":"spherical"}"#,
+                LogicalType::geography(Some("srid:0".to_string()), None),
+            ),
+            // Geography with OGC:CRS84 and spherical edges
+            (
+                r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+                LogicalType::geography(None, None),
+            ),
+            // Geography with different edge algorithms
+            (
+                r#"{"crs":"OGC:CRS84","edges":"karney"}"#,
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::KARNEY)),
+            ),
+            (
+                r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#,
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::VINCENTY)),
+            ),
+            (
+                r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#,
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::ANDOYER)),
+            ),
+            (
+                r#"{"crs":"OGC:CRS84","edges":"thomas"}"#,
+                LogicalType::geography(None, 
Some(EdgeInterpolationAlgorithm::THOMAS)),
+            ),
+            // Geography with custom CRS and edges
+            (
+                r#"{"crs":"EPSG:4267","edges":"karney"}"#,
+                LogicalType::geography(
+                    Some("EPSG:4267".to_string()),
+                    Some(EdgeInterpolationAlgorithm::KARNEY),
+                ),
+            ),
+            // Geography with PROJJSON CRS
+            (
+                
r#"{"crs":{"id":{"authority":"EPSG","code":4267}},"edges":"spherical"}"#,
+                LogicalType::geography(
+                    
Some(r#"{"id":{"authority":"EPSG","code":4267}}"#.to_string()),
+                    None,
+                ),
+            ),
+        ];
+
+        for (ext_metadata, expected_logical_type) in test_cases {
+            // Create an Arrow Field with raw extension metadata
+            let metadata = HashMap::from([
+                (
+                    "ARROW:extension:name".to_string(),
+                    "geoarrow.wkb".to_string(),
+                ),
+                (
+                    "ARROW:extension:metadata".to_string(),
+                    ext_metadata.to_string(),
+                ),
+            ]);
+            let field =
+                Arc::new(Field::new("geom", DataType::Binary, 
true).with_metadata(metadata));
+            let schema = Schema::new(vec![field]);
+
+            // Convert to Parquet schema
+            let parquet_schema = 
ArrowSchemaConverter::new().convert(&schema).unwrap();
+
+            // Check the logical type
+            let column_descr = parquet_schema.column(0);
+            let logical_type = column_descr.logical_type_ref();
+
+            assert_eq!(
+                logical_type,
+                Some(&expected_logical_type),
+                "LogicalType mismatch for extension metadata: {ext_metadata}"
+            );
+        }
+    }
 }

Reply via email to