This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2f51d5fdc1 Document schema merging. (#17249)
2f51d5fdc1 is described below
commit 2f51d5fdc1d5aa33d241969c3f9bb1c6079f9421
Author: wiedld <[email protected]>
AuthorDate: Fri Aug 22 12:14:22 2025 -0700
Document schema merging. (#17249)
* chore: add docs explaining FieldMetadata::merge_options
* chore: document DFSchema::merge, which is used in logical plan
construction & modification (e.g. LP optimizers)
* chore: merge_schema utils method
* chore: clarify wording
---
datafusion/common/src/dfschema.rs | 14 ++++++++++++
datafusion/expr/src/expr.rs | 45 ++++++++++++++++++++++++++++++++++++++-
datafusion/expr/src/utils.rs | 3 +++
3 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index daf4e19ce0..d3dda28882 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -297,6 +297,20 @@ impl DFSchema {
/// Modify this schema by appending the fields from the supplied schema, ignoring any
/// duplicate fields.
+ ///
+ /// ## Merge Precedence
+ ///
+ /// **Schema-level metadata**: Metadata from both schemas is merged.
+ /// If both schemas have the same metadata key, the value from the `other_schema` parameter takes precedence.
+ ///
+ /// **Field-level merging**: Only non-duplicate fields are added. This means that the
+ /// `self` fields will always take precedence over the `other_schema` fields.
+ /// Duplicate field detection is based on:
+ /// - For qualified fields: both qualifier and field name must match
+ /// - For unqualified fields: only field name needs to match
+ ///
+ /// Take note how the precedence for fields & metadata merging differs;
+ /// merging prefers fields from `self` but prefers metadata from `other_schema`.
pub fn merge(&mut self, other_schema: &DFSchema) {
if other_schema.inner.fields.is_empty() {
return;
diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index 9e2ac794de..2324ae79c0 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -469,7 +469,50 @@ impl FieldMetadata {
}
/// Merges two optional `FieldMetadata` instances, overwriting any existing
- /// keys in `m` with keys from `n` if present
+ /// keys in `m` with keys from `n` if present.
+ ///
+ /// This function is commonly used in alias operations, particularly for literals
+ /// with metadata. When creating an alias expression, the metadata from the original
+ /// expression (such as a literal) is combined with any metadata specified on the alias.
+ ///
+ /// # Arguments
+ ///
+ /// * `m` - The first metadata (typically from the original expression like a literal)
+ /// * `n` - The second metadata (typically from the alias definition)
+ ///
+ /// # Merge Strategy
+ ///
+ /// - If both metadata instances exist, they are merged with `n` taking precedence
+ /// - Keys from `n` will overwrite keys from `m` if they have the same name
+ /// - If only one metadata instance exists, it is returned unchanged
+ /// - If neither exists, `None` is returned
+ ///
+ /// # Example usage
+ /// ```rust
+ /// use datafusion_expr::expr::FieldMetadata;
+ /// use std::collections::BTreeMap;
+ ///
+ /// // Create metadata for a literal expression
+ /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([
+ /// ("source".to_string(), "constant".to_string()),
+ /// ("type".to_string(), "int".to_string()),
+ /// ])));
+ ///
+ /// // Create metadata for an alias
+ /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([
+ /// ("description".to_string(), "answer".to_string()),
+ /// ("source".to_string(), "user".to_string()), // This will override literal's "source"
+ /// ])));
+ ///
+ /// // Merge the metadata
+ /// let merged = FieldMetadata::merge_options(
+ /// literal_metadata.as_ref(),
+ /// alias_metadata.as_ref(),
+ /// );
+ ///
+ /// // Result contains: {"source": "user", "type": "int", "description": "answer"}
+ /// assert!(merged.is_some());
+ /// ```
pub fn merge_options(
m: Option<&FieldMetadata>,
n: Option<&FieldMetadata>,
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 7a612b6fe6..2e364d0d2b 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -1225,6 +1225,9 @@ pub fn only_or_err<T>(slice: &[T]) -> Result<&T> {
}
/// merge inputs schema into a single schema.
+///
+/// This function merges schemas from multiple logical plan inputs using [`DFSchema::merge`].
+/// Refer to that documentation for details on precedence and metadata handling.
pub fn merge_schema(inputs: &[&LogicalPlan]) -> DFSchema {
if inputs.len() == 1 {
inputs[0].schema().as_ref().clone()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]