[I] Partial AggregateMode will generate duplicate field names which will fail DFSchema construct [datafusion]

via GitHub Fri, 19 Sep 2025 01:25:34 -0700


zhuqi-lucas opened a new issue, #17662:
URL: https://github.com/apache/datafusion/issues/17662


   ### Describe the bug
   
   Since Apache DataFusion 50.0.0, constructing a `DFSchema` checks for duplicate field names (see the PR discussion linked below):
   
   https://github.com/apache/datafusion/pull/17189/files#r2277001021
   
   
   However, `AggregateMode::Partial` generates duplicate field names in its output schema, which makes `DFSchema` construction fail. In production we build these plans with the lower-level APIs directly (as in the reproducer below), so we need `AggregateMode::Partial` to stop generating duplicate field names.
   
   ### To Reproduce
   
   Code to reproduce:
   
   
   ```rust
   /// Reproducer: the output schema of a `Partial`-mode `AggregateExec` that
   /// holds both `first_value` and `last_value` (ordered by the same column)
   /// cannot be converted into a `DFSchema`.
   ///
   /// As the printed schema shows, each of the two aggregates contributes state
   /// fields named `timestamp@0` and `is_set`, so both names occur twice and
   /// `DFSchema::try_from` is expected to return a duplicate-field-name error.
   #[test]
   fn test_duplicate_state_fields_fails_for_dfschema_construct() -> Result<()> {
       let ctx = SessionContext::new();
   
       // Simple schema with just the fields we need.
       let file_schema = Arc::new(Schema::new(vec![
           Field::new(
               "timestamp",
               DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
               true,
           ),
           Field::new("ticker", DataType::Utf8, true),
           Field::new("value", DataType::Float64, true),
           Field::new("date", DataType::Utf8, false),
       ]));
   
       let df_schema = DFSchema::try_from(Arc::clone(&file_schema))?;
   
       let timestamp = col("timestamp");
       let value = col("value");
   
       // Empty input plan: the test only inspects the aggregate's output schema
       // and never executes the plan.
       let mock_exec = Arc::new(EmptyExec::new(Arc::clone(&file_schema)));
   
       // Build the first_value aggregate, ordered by `timestamp`.
       let first_value = Arc::new(
           AggregateExprBuilder::new(
               datafusion_functions_aggregate::first_last::first_value_udaf(),
               vec![ctx.create_physical_expr(value.clone(), &df_schema)?],
           )
           .alias("first_value(value)")
           .order_by(vec![PhysicalSortExpr::new(
               ctx.create_physical_expr(timestamp.clone(), &df_schema)?,
               SortOptions::new(false, false),
           )])
           .schema(Arc::clone(&file_schema))
           .build()
           .expect("Failed to build first_value"),
       );
   
       // Build the last_value aggregate with the same argument and ordering.
       let last_value = Arc::new(
           AggregateExprBuilder::new(
               datafusion_functions_aggregate::first_last::last_value_udaf(),
               vec![ctx.create_physical_expr(value, &df_schema)?],
           )
           .alias("last_value(value)")
           .order_by(vec![PhysicalSortExpr::new(
               ctx.create_physical_expr(timestamp, &df_schema)?,
               SortOptions::new(false, false),
           )])
           .schema(Arc::clone(&file_schema))
           .build()
           .expect("Failed to build last_value"),
       );
   
       let partial_agg = AggregateExec::try_new(
           AggregateMode::Partial,
           PhysicalGroupBy::new_single(vec![
               (
                   ctx.create_physical_expr(col("date"), &df_schema)?,
                   "date".to_string(),
               ),
               (
                   ctx.create_physical_expr(col("ticker"), &df_schema)?,
                   "ticker".to_string(),
               ),
           ]),
           vec![first_value, last_value], // Both aggregates together
           vec![None, None],
           mock_exec,
           file_schema,
       )
       .expect("Failed to build partial agg");
   
       println!("{:?}", partial_agg.schema());
   
       for field in partial_agg.schema().fields() {
           println!("Field: {}", field.name());
       }
   
       let partial_agg_exec_schema = DFSchema::try_from(partial_agg.schema());
   
       // This should fail due to duplicate state field names.
       assert!(
           partial_agg_exec_schema.is_err(),
           "Expected DFSchema construction from the AggregateExec schema to fail due to duplicate state field names"
       );
   
       if let Err(e) = partial_agg_exec_schema {
           // Render the error once and reuse it for printing and checking.
           let message = e.to_string();
           println!("Expected error due to duplicate state fields: {}", message);
           // Verify it's the specific duplicate field error we expect.
           assert!(message.contains("duplicate") || message.contains("Duplicate"));
       }
   
       Ok(())
   }
   ```
   
   Output:
   
   ```text
   Schema { fields: [Field { name: "date", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "ticker", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "first_value(value)[first_value]", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "timestamp@0", data_type: Timestamp(Nanosecond, Some("UTC")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "is_set", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "last_value(value)[last_value]", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "timestamp@0", data_type: Timestamp(Nanosecond, Some("UTC")), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "is_set", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }
   Field: date
   Field: ticker
   Field: first_value(value)[first_value]
   Field: timestamp@0
   Field: is_set
   Field: last_value(value)[last_value]
   Field: timestamp@0
   Field: is_set
   Expected error due to duplicate state fields: Schema error: Schema contains duplicate unqualified field name "timestamp@0"
   ```
   
   
   ### Expected behavior
   
   _No response_
   
   ### Additional context
   
   _No response_


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

[I] Partial AggregateMode will generate duplicate field names which will fail DFSchema construct [datafusion]

Reply via email to