liurenjie1024 commented on code in PR #2008:
URL: https://github.com/apache/iceberg-rust/pull/2008#discussion_r2675469824
##########
crates/integrations/datafusion/src/physical_plan/project.rs:
##########
@@ -20,18 +20,82 @@
use std::sync::Arc;
use datafusion::arrow::array::RecordBatch;
-use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema};
-use datafusion::common::Result as DFResult;
+use datafusion::arrow::datatypes::{DataType, Field, Fields, Schema as
ArrowSchema};
+use datafusion::common::{DataFusionError, Result as DFResult};
use datafusion::physical_expr::PhysicalExpr;
use datafusion::physical_expr::expressions::Column;
use datafusion::physical_plan::projection::ProjectionExec;
use datafusion::physical_plan::{ColumnarValue, ExecutionPlan};
-use iceberg::arrow::{PROJECTED_PARTITION_VALUE_COLUMN,
PartitionValueCalculator};
+use iceberg::arrow::{
+ PROJECTED_PARTITION_VALUE_COLUMN, PartitionValueCalculator,
schema_to_arrow_schema,
+};
use iceberg::spec::PartitionSpec;
use iceberg::table::Table;
use crate::to_datafusion_error;
+/// Recursively strips metadata from an Arrow schema and all its nested fields.
+///
+/// This function creates a new schema with all metadata removed from fields
at every level,
+/// including nested struct fields. This is useful for schema comparison where
metadata
+/// differences should be ignored.
+///
+/// # Arguments
+/// * `schema` - The Arrow schema to strip metadata from
+///
+/// # Returns
+/// A new Arrow schema with all metadata removed
+fn strip_metadata_from_schema(schema: &ArrowSchema) -> ArrowSchema {
+ let fields: Fields = schema
+ .fields()
+ .iter()
+ .map(|field| strip_metadata_from_field(field))
+ .collect();
+ ArrowSchema::new(fields)
+}
+
+/// Recursively strips metadata from an Arrow field and its nested fields.
+///
+/// # Arguments
+/// * `field` - The Arrow field to strip metadata from
+///
+/// # Returns
+/// A new Arrow field with all metadata removed
+fn strip_metadata_from_field(field: &Field) -> Field {
+ let data_type = strip_metadata_from_datatype(field.data_type());
+ Field::new(field.name(), data_type, field.is_nullable())
+}
+
+/// Recursively strips metadata from an Arrow data type.
+///
+/// For struct types, this function recursively processes all nested fields.
+/// For other types, it returns a clone of the type.
+///
+/// # Arguments
+/// * `data_type` - The Arrow data type to strip metadata from
+///
+/// # Returns
+/// A new Arrow data type with all metadata removed from nested structures
+fn strip_metadata_from_datatype(data_type: &DataType) -> DataType {
Review Comment:
Two suggestions:
1. Move this part to the arrow module. We plan to move arrow out of the
core library, so it would be better to keep all arrow-related code in the same
module.
2. Use ArrowSchemaVisitor to implement it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]