This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 670dbf481c fix: prevent duplicate alias collision with user-provided 
__datafusion_extracted names (#20432)
670dbf481c is described below

commit 670dbf481cff6bae9ae0892ac1ecc1b1ab90bea3
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Tue Feb 24 15:02:59 2026 +0000

    fix: prevent duplicate alias collision with user-provided 
__datafusion_extracted names (#20432)
    
    ## Summary
    - Fixes a bug where the optimizer's `AliasGenerator` could produce alias
    names that collide with`__datafusion_extracted_N` aliases, causing a
    "Schema contains duplicate unqualified field name" error
    - I don't expect users themselves to create these aliases, but if you
    run the optimizers twice (with different `AliasGenerator` instances)
    you'll hit this.
    - Adds `AliasGenerator::update_min_id()` to advance the counter past
    existing aliases
    - Scans each plan node's expressions during `ExtractLeafExpressions`
    traversal to seed the generator before any extraction occurs
    - Switches to controlling the traversal which also means the
    config-based short circuit more clearly skips the entire rule.
    
    Closes https://github.com/apache/datafusion/issues/20430
    
    ## Test plan
    - [x] Unit test: `test_user_provided_extracted_alias_no_collision` in
    `extract_leaf_expressions`
    - [x] SLT regression test in `projection_pushdown.slt` with explicit
    `__datafusion_extracted_2` alias
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 <[email protected]>
---
 datafusion/common/src/alias.rs                     | 10 ++++++
 .../optimizer/src/extract_leaf_expressions.rs      | 41 +++++++++++++++++++---
 .../test_files/projection_pushdown.slt             | 41 ++++++++++++++++++++++
 3 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs
index 2ee2cb4dc7..99f6447a6a 100644
--- a/datafusion/common/src/alias.rs
+++ b/datafusion/common/src/alias.rs
@@ -37,6 +37,16 @@ impl AliasGenerator {
         Self::default()
     }
 
+    /// Advance the counter to at least `min_id`, ensuring future aliases
+    /// won't collide with already-existing ones.
+    ///
+    /// For example, if the query already contains an alias `alias_42`, then 
calling
+    /// `update_min_id(42)` will ensure that future aliases generated by this
+    /// [`AliasGenerator`] will start from `alias_43`.
+    pub fn update_min_id(&self, min_id: usize) {
+        self.next_id.fetch_max(min_id + 1, Ordering::Relaxed);
+    }
+
     /// Return a unique alias with the provided prefix
     pub fn next(&self, prefix: &str) -> String {
         let id = self.next_id.fetch_add(1, Ordering::Relaxed);
diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs 
b/datafusion/optimizer/src/extract_leaf_expressions.rs
index f5f4982e38..922ea79337 100644
--- a/datafusion/optimizer/src/extract_leaf_expressions.rs
+++ b/datafusion/optimizer/src/extract_leaf_expressions.rs
@@ -114,10 +114,6 @@ impl OptimizerRule for ExtractLeafExpressions {
         "extract_leaf_expressions"
     }
 
-    fn apply_order(&self) -> Option<ApplyOrder> {
-        Some(ApplyOrder::TopDown)
-    }
-
     fn rewrite(
         &self,
         plan: LogicalPlan,
@@ -127,10 +123,45 @@ impl OptimizerRule for ExtractLeafExpressions {
             return Ok(Transformed::no(plan));
         }
         let alias_generator = config.alias_generator();
-        extract_from_plan(plan, alias_generator)
+
+        // Advance the alias generator past any user-provided 
__datafusion_extracted_N
+        // aliases to prevent collisions when generating new extraction 
aliases.
+        advance_generator_past_existing(&plan, alias_generator)?;
+
+        plan.transform_down_with_subqueries(|plan| {
+            extract_from_plan(plan, alias_generator)
+        })
     }
 }
 
+/// Scans the current plan node's expressions for pre-existing
+/// `__datafusion_extracted_N` aliases and advances the generator
+/// counter past them to avoid collisions with user-provided aliases.
+fn advance_generator_past_existing(
+    plan: &LogicalPlan,
+    alias_generator: &AliasGenerator,
+) -> Result<()> {
+    plan.apply(|plan| {
+        plan.expressions().iter().try_for_each(|expr| {
+            expr.apply(|e| {
+                if let Expr::Alias(alias) = e
+                    && let Some(id) = alias
+                        .name
+                        .strip_prefix(EXTRACTED_EXPR_PREFIX)
+                        .and_then(|s| s.strip_prefix('_'))
+                        .and_then(|s| s.parse().ok())
+                {
+                    alias_generator.update_min_id(id);
+                }
+                Ok(TreeNodeRecursion::Continue)
+            })?;
+            Ok::<(), datafusion_common::error::DataFusionError>(())
+        })?;
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .map(|_| ())
+}
+
 /// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node.
 ///
 /// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes
diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt 
b/datafusion/sqllogictest/test_files/projection_pushdown.slt
index c25b80a0d7..dbb77b33c2 100644
--- a/datafusion/sqllogictest/test_files/projection_pushdown.slt
+++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt
@@ -1949,3 +1949,44 @@ ORDER BY simple_struct.id;
 3 3
 4 4
 5 5
+
+# =========================================================================
+# Regression: user-provided __datafusion_extracted aliases must not
+# collide with optimizer-generated ones
+# (https://github.com/apache/datafusion/issues/20430)
+# =========================================================================
+
+statement ok
+COPY ( select {f1: 1, f2: 2} as s
+) TO 'test_files/scratch/projection_pushdown/test.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/test.parquet';
+
+# Verify that the user-provided __datafusion_extracted_2 alias is preserved
+# and the optimizer skips to _3 and _4 for its generated aliases.
+query TT
+EXPLAIN SELECT
+    get_field(s, 'f1') AS __datafusion_extracted_2
+FROM t
+WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
+----
+logical_plan
+01)Projection: __datafusion_extracted_2
+02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN 
__datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1)
+03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, 
get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, 
Utf8("f1")) AS __datafusion_extracted_2
+04)------TableScan: t projection=[s], partial_filters=[CASE WHEN 
get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE 
get_field(t.s, Utf8("f2")) END = Int64(1)]
+physical_plan
+01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN 
__datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, 
projection=[__datafusion_extracted_2@2]
+02)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]},
 projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) 
as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], 
file_type=parquet
+
+query I
+SELECT
+    get_field(s, 'f1') AS __datafusion_extracted_2
+FROM t
+WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
+----
+1


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to