adriangb commented on code in PR #16583:
URL: https://github.com/apache/datafusion/pull/16583#discussion_r2184186495
##########
datafusion/core/src/datasource/listing/table.rs:
##########
@@ -894,6 +1047,81 @@ impl ListingTable {
self.schema_source
}
+ /// Set the [`SchemaAdapterFactory`] for this [`ListingTable`]
+ ///
+ /// The schema adapter factory is used to create schema adapters that can
+ /// handle schema evolution and type conversions when reading files with
+ /// different schemas than the table schema.
+ ///
+ /// # Example: Adding Schema Evolution Support
+ /// ```rust
+ /// # use std::sync::Arc;
+ /// # use datafusion::datasource::listing::{ListingTable,
ListingTableConfig, ListingOptions, ListingTableUrl};
+ /// # use datafusion::datasource::schema_adapter::{SchemaAdapterFactory,
SchemaAdapter};
+ /// # use datafusion::datasource::file_format::parquet::ParquetFormat;
+ /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType};
+ /// # #[derive(Debug)]
+ /// # struct EvolutionAdapterFactory;
+ /// # impl SchemaAdapterFactory for EvolutionAdapterFactory {
+ /// # fn create(&self, _projected_table_schema: SchemaRef,
_file_schema: SchemaRef) -> Box<dyn SchemaAdapter> {
+ /// # unimplemented!()
+ /// # }
+ /// # }
+ /// # let table_path =
ListingTableUrl::parse("file:///path/to/data").unwrap();
+ /// # let options =
ListingOptions::new(Arc::new(ParquetFormat::default()));
+ /// # let schema = Arc::new(Schema::new(vec![Field::new("id",
DataType::Int64, false)]));
+ /// # let config =
ListingTableConfig::new(table_path).with_listing_options(options).with_schema(schema);
+ /// # let table = ListingTable::try_new(config).unwrap();
+ /// let table_with_evolution = table
+ /// .with_schema_adapter_factory(Arc::new(EvolutionAdapterFactory));
+ /// ```
+ pub fn with_schema_adapter_factory(
+ self,
+ schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
+ ) -> Self {
+ Self {
+ schema_adapter_factory: Some(schema_adapter_factory),
+ ..self
+ }
+ }
+
+ /// Get the [`SchemaAdapterFactory`] for this table
+ pub fn schema_adapter_factory(&self) -> Option<&Arc<dyn
SchemaAdapterFactory>> {
+ self.schema_adapter_factory.as_ref()
+ }
+
+ /// Creates a schema adapter for mapping between file and table schemas
+ ///
+ /// Uses the configured schema adapter factory if available, otherwise
falls back
+ /// to the default implementation.
+ fn create_schema_adapter(&self) -> Box<dyn SchemaAdapter> {
+ let table_schema = self.schema();
+ match &self.schema_adapter_factory {
+ Some(factory) => {
+ factory.create(Arc::clone(&table_schema),
Arc::clone(&table_schema))
+ }
+ None =>
DefaultSchemaAdapterFactory::from_schema(Arc::clone(&table_schema)),
+ }
+ }
+
+ /// Creates a file source and applies schema adapter factory if available
+ fn create_file_source_with_schema_adapter(&self) -> Result<Arc<dyn
FileSource>> {
+ let mut source = self.options.format.file_source();
+ // Apply schema adapter to source if available
+ //
+ // NOTE: This may layer the ListingTable's schema adapter factory on top of any
+ // existing factory that the file source already has. The composition semantics are:
+ // 1. The file format's existing adapter (if any) handles format-specific schema mapping
+ // 2. Our adapter handles table-level schema evolution requirements
+ //
+ // This layering is intentional but may need adjustment if the underlying source
+ // already handles the same schema evolution cases we're trying to address.
+ if let Some(factory) = &self.schema_adapter_factory {
+ source = source.with_schema_adapter_factory(Arc::clone(factory))?;
+ }
+ Ok(source)
Review Comment:
Okay that makes sense.
```suggestion
let mut source = self.options.format.file_source();
// Apply schema adapter to source if available
//
// The source will use this SchemaAdapter to adapt data batches as they flow up the plan.
// Note: ListingTable also creates a SchemaAdapter in `scan()` but that is only used to adapt collected statistics.
if let Some(factory) = &self.schema_adapter_factory {
source =
source.with_schema_adapter_factory(Arc::clone(factory))?;
}
Ok(source)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]