This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d4d79be389 Add tests and fix corner cases for Parquet/GeoArrow
extension type conversion (#10065)
d4d79be389 is described below
commit d4d79be3896085cc082881b54f92175798e5ab53
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Jun 17 07:36:35 2026 -0500
Add tests and fix corner cases for Parquet/GeoArrow extension type
conversion (#10065)
# Which issue does this PR close?
- Closes #9929.
# Rationale for this change
There were several issues with conversion identified when I tried to
integrate this in SedonaDB and that came to light when the spec was
recently clarified.
I am sorry for missing these changes when I reviewed the initial
implementation.
# What changes are included in this PR?
- A Parquet crs of `None` for geometry or geography is now converted to
a GeoArrow CRS of `"OGC:CRS84"` (the named value for the default CRS in
the Parquet spec)
- A Parquet crs of `"srid:0"` is now converted to a GeoArrow "omitted"
CRS. This was recently clarified in the Parquet spec (srid:0 is a named
example in the list of allowed values).
- A GeoArrow missing CRS is now encoded as `"srid:0"`
- A GeoArrow CRS that is "lonlat-like" is now encoded as a Parquet crs
of `None`. This logic was included in the previous implementation but
was reversed (Parquet CRSes that looked like lonlat were omitted when
written to GeoArrow, which is not correct).
- The GeoArrow metadata struct uses the name `"algorithm"` and was
serializing it to JSON. The GeoArrow spec uses the `"edges"` key. This
led to invalid metadata being generated which was either rejected or
incorrectly interpreted by consumers.
# Are these changes tested?
Yes. I added high-level end-to-end LogicalType <-> extension metadata
tests, since that is what matters (there were a few lower level tests
that I updated as well).
---
parquet-geospatial/src/types.rs | 74 ++++-------
parquet/src/arrow/schema/extension.rs | 55 +++++++--
parquet/tests/geospatial.rs | 223 +++++++++++++++++++++++++++++++++-
3 files changed, 291 insertions(+), 61 deletions(-)
diff --git a/parquet-geospatial/src/types.rs b/parquet-geospatial/src/types.rs
index f19911ad05..b19e9103fc 100644
--- a/parquet-geospatial/src/types.rs
+++ b/parquet-geospatial/src/types.rs
@@ -57,12 +57,10 @@ pub struct Metadata {
/// The Coordinate Reference System (CRS) of the [`WkbType`], if present.
///
/// This may be a raw string value (e.g., "EPSG:3857") or a JSON object
(e.g., PROJJSON).
- /// Note: Common lon/lat CRS representations (EPSG:4326, OGC:CRS84) are
canonicalized
- /// to `None` during serialization to match Parquet conventions.
#[serde(skip_serializing_if = "Option::is_none")]
pub crs: Option<serde_json::Value>,
/// The edge interpolation algorithm of the [`WkbType`], if present.
- #[serde(skip_serializing_if = "Option::is_none")]
+ #[serde(rename = "edges", skip_serializing_if = "Option::is_none")]
pub algorithm: Option<Edges>,
}
@@ -88,8 +86,10 @@ impl Metadata {
}
}
- /// Detect if the CRS is a common representation of lon/lat on the
standard WGS84 ellipsoid
- fn crs_is_lon_lat(&self) -> bool {
+ /// Detect if the CRS is a common representation of lon/lat on the
standard WGS84 ellipsoid.
+ ///
+ /// Returns `true` for OGC:CRS84, EPSG:4326, and PROJJSON representations
thereof.
+ pub fn crs_is_lon_lat(&self) -> bool {
use serde_json::Value;
let Some(crs) = &self.crs else {
@@ -144,16 +144,7 @@ impl ExtensionType for WkbType {
}
fn serialize_metadata(&self) -> Option<String> {
- let md = if self.0.crs_is_lon_lat() {
- &Metadata {
- crs: None, // lon/lat CRS is canonicalized as omitted (None)
for Parquet
- algorithm: self.0.algorithm,
- }
- } else {
- &self.0
- };
-
- serde_json::to_string(md).ok()
+ serde_json::to_string(&self.0).ok()
}
fn deserialize_metadata(metadata: Option<&str>) ->
ArrowResult<Self::Metadata> {
@@ -251,7 +242,7 @@ mod tests {
let wkb = WkbType::new(Some(metadata));
let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+ assert_eq!(serialized, r#"{"edges":"spherical"}"#);
let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
assert!(deserialized.crs.is_none());
@@ -267,7 +258,7 @@ mod tests {
let wkb = WkbType::new(Some(metadata));
let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized,
r#"{"crs":"srid:1234","algorithm":"spherical"}"#);
+ assert_eq!(serialized, r#"{"crs":"srid:1234","edges":"spherical"}"#);
let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
assert_eq!(
@@ -353,54 +344,43 @@ mod tests {
Ok(())
}
- /// Test CRS canonicalization logic for common lon/lat representations
+ /// Test crs_is_lon_lat() detection for common lon/lat representations
#[test]
- fn test_crs_canonicalization() -> ArrowResult<()> {
- // EPSG:4326 as string should be omitted
+ fn test_crs_is_lon_lat() -> ArrowResult<()> {
+ // EPSG:4326 as string should be detected
let metadata = Metadata::new(Some("EPSG:4326"), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, "{}");
+ assert!(metadata.crs_is_lon_lat());
- // OGC:CRS84 as string should be omitted
+ // OGC:CRS84 as string should be detected
let metadata = Metadata::new(Some("OGC:CRS84"), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, "{}");
+ assert!(metadata.crs_is_lon_lat());
- // A JSON object that reasonably looks like PROJJSON for EPSG:4326
should be omitted
+ // A JSON object that reasonably looks like PROJJSON for EPSG:4326
should be detected
// detect "4326" as a string
let crs_json = r#"{"id":{"authority":"EPSG","code":"4326"}}"#;
let metadata = Metadata::new(Some(crs_json), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, "{}");
+ assert!(metadata.crs_is_lon_lat());
// detect 4326 as a number
let crs_json = r#"{"id":{"authority":"EPSG","code":4326}}"#;
let metadata = Metadata::new(Some(crs_json), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, "{}");
+ assert!(metadata.crs_is_lon_lat());
- // A JSON object that reasonably looks like PROJJSON for OGC:CRS84
should be omitted
+ // A JSON object that reasonably looks like PROJJSON for OGC:CRS84
should be detected
let crs_json = r#"{"id":{"authority":"OGC","code":"CRS84"}}"#;
let metadata = Metadata::new(Some(crs_json), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, "{}");
+ assert!(metadata.crs_is_lon_lat());
- // Other input types should be preserved
+ // Other CRS values should NOT be detected as lon/lat
let metadata = Metadata::new(Some("srid:1234"), None);
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, r#"{"crs":"srid:1234"}"#);
+ assert!(!metadata.crs_is_lon_lat());
- // Canonicalization should work with algorithm field
- let metadata = Metadata::new(Some("EPSG:4326"),
Some(Edges::Spherical));
- let wkb = WkbType::new(Some(metadata));
- let serialized = wkb.serialize_metadata().unwrap();
- assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+ let metadata = Metadata::new(Some("EPSG:3857"), None);
+ assert!(!metadata.crs_is_lon_lat());
+
+ // None CRS should NOT be detected as lon/lat
+ let metadata = Metadata::new(None, None);
+ assert!(!metadata.crs_is_lon_lat());
Ok(())
}
diff --git a/parquet/src/arrow/schema/extension.rs
b/parquet/src/arrow/schema/extension.rs
index 353770ddbd..4e1ec75084 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -67,7 +67,13 @@ pub(crate) fn try_add_extension_type(
}
#[cfg(feature = "geospatial")]
LogicalType::Geometry(geometry) => {
- let md =
parquet_geospatial::WkbMetadata::new(geometry.crs.as_deref(), None);
+ // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0
means unset CRS
+ let crs = match geometry.crs.as_deref() {
+ None => Some("OGC:CRS84"),
+ Some("srid:0") => None,
+ Some(crs) => Some(crs),
+ };
+ let md = parquet_geospatial::WkbMetadata::new(crs, None);
let mut arrow_field = arrow_field;
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
arrow_field
@@ -78,7 +84,13 @@ pub(crate) fn try_add_extension_type(
.algorithm()
.map(|a| a.try_as_edges())
.transpose()?;
- let md =
parquet_geospatial::WkbMetadata::new(geography.crs.as_deref(), algorithm);
+ // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0
means unset CRS
+ let crs = match geography.crs.as_deref() {
+ None => Some("OGC:CRS84"),
+ Some("srid:0") => None,
+ Some(crs) => Some(crs),
+ };
+ let md = parquet_geospatial::WkbMetadata::new(crs, algorithm);
let mut arrow_field = arrow_field;
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
arrow_field
@@ -167,15 +179,36 @@ pub(crate) fn logical_type_for_binary(field: &Field) ->
Option<LogicalType> {
match field.extension_type_name() {
Some(n) if n == WkbType::NAME => match
field.try_extension_type::<WkbType>() {
- Ok(wkb_type) => match wkb_type.metadata().type_hint() {
- WkbTypeHint::Geometry => Some(LogicalType::geometry(
- wkb_type.metadata().crs.as_ref().map(|c| c.to_string()),
- )),
- WkbTypeHint::Geography => Some(LogicalType::geography(
- wkb_type.metadata().crs.as_ref().map(|c| c.to_string()),
- wkb_type.metadata().algorithm.map(|a| a.into()),
- )),
- },
+ Ok(wkb_type) => {
+ // Convert Arrow CRS to Parquet CRS:
+ // - None → "srid:0" (unset CRS in Parquet)
+ // - lon/lat CRS (OGC:CRS84, EPSG:4326) → None (default in
Parquet)
+ // - Other CRS → JSON string
+ let crs = match &wkb_type.metadata().crs {
+ None => Some("srid:0".to_string()),
+ Some(_) if wkb_type.metadata().crs_is_lon_lat() => None,
+ // For string values, use the raw string; for objects, use
JSON representation
+ Some(c) => Some(
+ c.as_str()
+ .map(|s| s.to_string())
+ .unwrap_or_else(|| c.to_string()),
+ ),
+ };
+ // Convert Arrow edges to Parquet algorithm:
+ // - Spherical → None (default for Geography)
+ // - Other algorithms → Some(algorithm)
+ let algorithm = wkb_type.metadata().algorithm.and_then(|a| {
+ use parquet_geospatial::WkbEdges;
+ match a {
+ WkbEdges::Spherical => None, // spherical is the
default
+ _ => Some(a.into()),
+ }
+ });
+ match wkb_type.metadata().type_hint() {
+ WkbTypeHint::Geometry => Some(LogicalType::geometry(crs)),
+ WkbTypeHint::Geography => Some(LogicalType::geography(crs,
algorithm)),
+ }
+ }
Err(_e) => None,
},
_ => None,
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
index bf34528d03..388b5c003d 100644
--- a/parquet/tests/geospatial.rs
+++ b/parquet/tests/geospatial.rs
@@ -30,7 +30,7 @@ mod test {
ArrowSchemaConverter, ArrowWriter,
arrow_reader::ParquetRecordBatchReaderBuilder,
arrow_writer::ArrowWriterOptions,
},
- basic::LogicalType,
+ basic::{EdgeInterpolationAlgorithm, LogicalType},
column::reader::ColumnReader,
data_type::{ByteArray, ByteArrayType},
file::{
@@ -63,7 +63,7 @@ mod test {
(
"crs-default.parquet",
LogicalType::geometry(None),
- WkbMetadata::new(None, None),
+ WkbMetadata::new(Some("OGC:CRS84"), None), // omitted CRS
defaults to OGC:CRS84
),
(
"crs-srid.parquet",
@@ -78,7 +78,7 @@ mod test {
(
"crs-geography.parquet",
LogicalType::geography(None, None),
- WkbMetadata::new(None, Some(WkbEdges::Spherical)),
+ WkbMetadata::new(Some("OGC:CRS84"),
Some(WkbEdges::Spherical)), // omitted CRS defaults to OGC:CRS84
),
];
@@ -425,4 +425,221 @@ mod test {
);
Arc::new(array)
}
+
+ #[test]
+ fn test_logical_type_to_field_conversion() {
+ use parquet::arrow::parquet_to_arrow_schema;
+ use parquet::basic::Type as PhysicalType;
+ use parquet::schema::types::{SchemaDescriptor, Type};
+
+ // Test cases: (LogicalType, expected metadata JSON)
+ let test_cases = [
+ // Geometry with default CRS (defaults to OGC:CRS84 per Parquet
spec)
+ (LogicalType::geometry(None), r#"{"crs":"OGC:CRS84"}"#),
+ // Geometry with srid:0 should result in an unset (omitted) CRS
+ (LogicalType::geometry(Some("srid:0".to_string())), r#"{}"#),
+ // Geometry with custom CRSes (authority:code and partial projjson)
+ (
+ LogicalType::geometry(Some("EPSG:4267".to_string())),
+ r#"{"crs":"EPSG:4267"}"#,
+ ),
+ (
+ LogicalType::geometry(Some(
+ r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string(),
+ )),
+ r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#,
+ ),
+ // Geography with default CRS (default OGC:CRS84, spherical edges)
+ (
+ LogicalType::geography(None, None),
+ r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+ ),
+ // Geography with explicit edges
+ (
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::SPHERICAL)),
+ r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+ ),
+ (
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::KARNEY)),
+ r#"{"crs":"OGC:CRS84","edges":"karney"}"#,
+ ),
+ (
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::VINCENTY)),
+ r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#,
+ ),
+ (
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::ANDOYER)),
+ r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#,
+ ),
+ (
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::THOMAS)),
+ r#"{"crs":"OGC:CRS84","edges":"thomas"}"#,
+ ),
+ // Geometry with srid:0 should result in an unset (omitted) CRS
+ // and spherical edges
+ (
+ LogicalType::geography(Some("srid:0".to_string()), None),
+ r#"{"edges":"spherical"}"#,
+ ),
+ // Geography with custom CRSes (authority:code and partial
projjson)
+ (
+ LogicalType::geography(Some("EPSG:4267".to_string()), None),
+ r#"{"crs":"EPSG:4267","edges":"spherical"}"#,
+ ),
+ (
+ LogicalType::geography(
+
Some(r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string()),
+ None,
+ ),
+
r#"{"crs":{"id":{"authority":"EPSG","code":4326}},"edges":"spherical"}"#,
+ ),
+ ];
+
+ for (logical_type, expected_metadata) in test_cases {
+ // Build a Parquet schema with the given LogicalType
+ let parquet_schema = SchemaDescriptor::new(Arc::new(
+ Type::group_type_builder("schema")
+ .with_fields(vec![Arc::new(
+ Type::primitive_type_builder("geom",
PhysicalType::BYTE_ARRAY)
+ .with_logical_type(Some(logical_type.clone()))
+ .build()
+ .unwrap(),
+ )])
+ .build()
+ .unwrap(),
+ ));
+
+ // Convert to Arrow schema
+ let arrow_schema = parquet_to_arrow_schema(&parquet_schema,
None).unwrap();
+ let field = arrow_schema.field(0);
+
+ // Check extension type name
+ let ext_name = field.metadata().get("ARROW:extension:name");
+ assert_eq!(
+ ext_name,
+ Some(&"geoarrow.wkb".to_string()),
+ "Extension name mismatch for {logical_type:?}"
+ );
+
+ // Check extension metadata
+ let ext_metadata =
field.metadata().get("ARROW:extension:metadata");
+ assert_eq!(
+ ext_metadata,
+ Some(&expected_metadata.to_string()),
+ "Extension metadata mismatch for {logical_type:?}"
+ );
+ }
+ }
+
+ #[test]
+ fn test_field_to_logical_type_conversion() {
+ use std::collections::HashMap;
+
+ // Test cases: (extension metadata JSON, expected LogicalType)
+ let test_cases = [
+ // Geometry with no CRS should be GEOMETRY(srid:0)
+ (r#"{}"#, LogicalType::geometry(Some("srid:0".to_string()))),
+ // Geometry with string CRS
+ (
+ r#"{"crs":"EPSG:4267"}"#,
+ LogicalType::geometry(Some("EPSG:4267".to_string())),
+ ),
+ // Geometry with PROJJSON CRS
+ (
+ r#"{"crs":{"id":{"authority":"EPSG","code":3857}}}"#,
+ LogicalType::geometry(Some(
+ r#"{"id":{"authority":"EPSG","code":3857}}"#.to_string(),
+ )),
+ ),
+ // Geometry with lon/lat CRSes (canonically removed because
lon/lat is the
+ // default Parquet CRS)
+ (r#"{"crs":"OGC:CRS84"}"#, LogicalType::geometry(None)),
+ (r#"{"crs":"EPSG:4326"}"#, LogicalType::geometry(None)),
+ (
+ r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#,
+ LogicalType::geometry(None),
+ ),
+ (
+ r#"{"crs":{"id":{"authority":"EPSG","code":"4326"}}}"#,
+ LogicalType::geometry(None),
+ ),
+ (
+ r#"{"crs":{"id":{"authority":"OGC","code":"CRS84"}}}"#,
+ LogicalType::geometry(None),
+ ),
+ // Geography with no CRS, spherical edges
+ (
+ r#"{"edges":"spherical"}"#,
+ LogicalType::geography(Some("srid:0".to_string()), None),
+ ),
+ // Geography with OGC:CRS84 and spherical edges
+ (
+ r#"{"crs":"OGC:CRS84","edges":"spherical"}"#,
+ LogicalType::geography(None, None),
+ ),
+ // Geography with different edge algorithms
+ (
+ r#"{"crs":"OGC:CRS84","edges":"karney"}"#,
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::KARNEY)),
+ ),
+ (
+ r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#,
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::VINCENTY)),
+ ),
+ (
+ r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#,
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::ANDOYER)),
+ ),
+ (
+ r#"{"crs":"OGC:CRS84","edges":"thomas"}"#,
+ LogicalType::geography(None,
Some(EdgeInterpolationAlgorithm::THOMAS)),
+ ),
+ // Geography with custom CRS and edges
+ (
+ r#"{"crs":"EPSG:4267","edges":"karney"}"#,
+ LogicalType::geography(
+ Some("EPSG:4267".to_string()),
+ Some(EdgeInterpolationAlgorithm::KARNEY),
+ ),
+ ),
+ // Geography with PROJJSON CRS
+ (
+
r#"{"crs":{"id":{"authority":"EPSG","code":4267}},"edges":"spherical"}"#,
+ LogicalType::geography(
+
Some(r#"{"id":{"authority":"EPSG","code":4267}}"#.to_string()),
+ None,
+ ),
+ ),
+ ];
+
+ for (ext_metadata, expected_logical_type) in test_cases {
+ // Create an Arrow Field with raw extension metadata
+ let metadata = HashMap::from([
+ (
+ "ARROW:extension:name".to_string(),
+ "geoarrow.wkb".to_string(),
+ ),
+ (
+ "ARROW:extension:metadata".to_string(),
+ ext_metadata.to_string(),
+ ),
+ ]);
+ let field =
+ Arc::new(Field::new("geom", DataType::Binary,
true).with_metadata(metadata));
+ let schema = Schema::new(vec![field]);
+
+ // Convert to Parquet schema
+ let parquet_schema =
ArrowSchemaConverter::new().convert(&schema).unwrap();
+
+ // Check the logical type
+ let column_descr = parquet_schema.column(0);
+ let logical_type = column_descr.logical_type_ref();
+
+ assert_eq!(
+ logical_type,
+ Some(&expected_logical_type),
+ "LogicalType mismatch for extension metadata: {ext_metadata}"
+ );
+ }
+ }
}