tustvold commented on code in PR #6458:
URL: https://github.com/apache/arrow-datafusion/pull/6458#discussion_r1212977474


##########
datafusion/core/src/physical_plan/file_format/mod.rs:
##########
@@ -489,38 +491,93 @@ impl SchemaAdapter {
             field_mappings,
         })
     }
+
+    /// Creates a `SchemaMapping` that can be used to cast or map the columns 
from the file schema
+    /// to the table schema, taking into account the provided projections.
+    pub fn map_schema_with_projection(
+        &self,
+        file_schema: &Schema,
+        projections: &[usize],
+    ) -> Result<(SchemaMapping, Vec<usize>)> {
+        let mut field_mappings: Vec<(usize, Option<DataType>)> = Vec::new();
+        let mut mapped: Vec<usize> = vec![];
+
+        for idx in projections {
+            let field = self.table_schema.field(*idx);
+            match file_schema.index_of(field.name().as_str()) {
+                Ok(mapped_idx)
+                    if can_cast_types(
+                        file_schema.field(mapped_idx).data_type(),
+                        field.data_type(),
+                    ) =>
+                {
+                    field_mappings.push((*idx, 
Some(field.data_type().clone())));
+                    mapped.push(mapped_idx);
+                }
+                Ok(mapped_idx) => {
+                    return Err(DataFusionError::Plan(format!(
+                        "Cannot cast file schema field {} of type {:?} to 
table schema field of type {:?}",
+                        field.name(),
+                        file_schema.field(mapped_idx).data_type(),
+                        field.data_type()
+                    )));
+                }
+                Err(_) => {
+                    field_mappings.push((*idx, None));
+                }
+            }
+        }
+        Ok((
+            SchemaMapping {
+                table_schema: self.table_schema.clone(),
+                field_mappings,
+            },
+            mapped,
+        ))
+    }
 }
 
 /// The SchemaMapping struct holds a mapping from the file schema to the table 
schema
 /// and any necessary type conversions that need to be applied.
 #[derive(Debug)]
 pub struct SchemaMapping {
-    #[allow(dead_code)]
     table_schema: SchemaRef,
-    #[allow(dead_code)]
-    field_mappings: Vec<(usize, DataType)>,
+    field_mappings: Vec<(usize, Option<DataType>)>,

Review Comment:
   I think some documentation of what these values mean would go a long 
way. I would have expected something like the following, but this doesn't 
appear to be the case:
   ```suggestion
       /// The index in the batch schema matching the corresponding field in 
table_schema
       /// i.e. table_schema[i] = file_schema[field_mappings[i].0]
       field_mappings: Vec<Option<usize>>,
   ```
   
   `map_batch` can then do:
   
   ```
   let cols = self.table_schema.fields().zip(&self.field_mappings).map(|(field, 
mapping)| {
       match mapping {
           Some(idx) => cast(batch.columns()[idx], field.data_type()),
           None => new_null_array(field.data_type(), batch.num_rows()),
       }
   }).collect::<Result<Vec<_>>>()?;
   
   let record_batch = 
RecordBatch::try_new_with_options(self.table_schema.clone(), cols, &options)?;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to