kosiew commented on code in PR #22169:
URL: https://github.com/apache/datafusion/pull/22169#discussion_r3286092903
##########
datafusion/sql/src/select.rs:
##########
@@ -377,39 +438,94 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
plan
};
- // Try processing unnest expression or do the final projection
- let plan = self.try_process_unnest(plan, select_exprs_post_aggr)?;
-
- // Process distinct clause
+ // Process distinct clause. For `DISTINCT ON` combined with
+ // aggregation, GROUP BY, or window functions we apply DistinctOn
+ // *before* the final projection so grouping columns and ORDER BY
+ // tie-breakers that aren't in the user SELECT stay in scope.
+ // DistinctOn provides the projection in that case (its select_expr
+ // list is wrapped in FIRST_VALUE during lowering).
let plan = match select.distinct {
- None => Ok(plan),
- Some(Distinct::All) => Ok(plan),
+ None | Some(Distinct::All) => {
+ self.try_process_unnest(plan, select_exprs_post_aggr)?
+ }
Some(Distinct::Distinct) => {
- LogicalPlanBuilder::from(plan).distinct()?.build()
+ let plan = self.try_process_unnest(plan,
select_exprs_post_aggr)?;
+ LogicalPlanBuilder::from(plan).distinct()?.build()?
}
- Some(Distinct::On(on_expr)) => {
- if !aggr_exprs.is_empty()
- || !group_by_exprs.is_empty()
- || !window_func_exprs.is_empty()
+ Some(Distinct::On(_)) => {
+ if aggr_exprs.is_empty()
+ && group_by_exprs.is_empty()
+ && window_func_exprs.is_empty()
{
- return not_impl_err!(
- "DISTINCT ON expressions with GROUP BY, aggregation or
window functions are not supported "
- );
- }
-
- let on_expr = on_expr
- .into_iter()
- .map(|e| {
- self.sql_expr_to_logical_expr(e, plan.schema(),
planner_context)
- })
- .collect::<Result<Vec<_>>>()?;
+ // Fast path: no aggregation context. Fuse projection
+ // and deduplication into a single DistinctOn over
+ // `base_plan`. The sort attached to DistinctOn via
+ // `with_sort_expr` later normalizes against base_plan,
+ // so a bare ORDER BY alias (e.g. `ORDER BY x` where
+ // SELECT has `a AS x`) must be swapped back to the
+ // underlying input expression first.
+ if !alias_map.is_empty() {
+ order_by_rex = order_by_rex
+ .into_iter()
+ .map(|s| {
+ let expr = substitute_top_level_alias(
+ s.expr.clone(),
+ &alias_map,
+ );
+ s.with_expr(expr)
+ })
+ .collect();
+ }
+ LogicalPlanBuilder::from(base_plan)
+ .distinct_on(on_exprs_post_aggr, select_exprs, None)?
+ .build()?
+ } else {
+ // General path: DistinctOn layered over the post-
+ // aggregate / post-window plan (no extra Projection
+ // node — DistinctOn's lowering wraps each select_expr
+ // in FIRST_VALUE, which acts as the projection).
+ //
+ // The DistinctOn input has the post-aggregate raw
+ // column names (e.g. `max(t.c4)`), not the user-facing
+ // SELECT aliases (`agg2`). ORDER BY may reference
+ // those aliases — substitute them back to the
+ // underlying post-aggregate expression so they
+ // resolve against the DistinctOn input.
+ let select_alias_map: datafusion_common::HashMap<String,
Expr> =
Review Comment:
Small refactoring suggestion: it may be worth extracting the top-level alias
substitution used for `ORDER BY` / `DISTINCT ON` into a helper that operates on
`SortExpr`s plus the alias map.
The same logic now appears in both the fast and general `DISTINCT ON` paths,
and centralizing it would make the "bare alias only" invariant easier to
preserve.
##########
datafusion/sql/src/select.rs:
##########
@@ -377,39 +438,94 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
plan
};
- // Try processing unnest expression or do the final projection
- let plan = self.try_process_unnest(plan, select_exprs_post_aggr)?;
-
- // Process distinct clause
+ // Process distinct clause. For `DISTINCT ON` combined with
+ // aggregation, GROUP BY, or window functions we apply DistinctOn
+ // *before* the final projection so grouping columns and ORDER BY
+ // tie-breakers that aren't in the user SELECT stay in scope.
+ // DistinctOn provides the projection in that case (its select_expr
+ // list is wrapped in FIRST_VALUE during lowering).
let plan = match select.distinct {
- None => Ok(plan),
- Some(Distinct::All) => Ok(plan),
+ None | Some(Distinct::All) => {
+ self.try_process_unnest(plan, select_exprs_post_aggr)?
+ }
Some(Distinct::Distinct) => {
- LogicalPlanBuilder::from(plan).distinct()?.build()
+ let plan = self.try_process_unnest(plan,
select_exprs_post_aggr)?;
+ LogicalPlanBuilder::from(plan).distinct()?.build()?
}
- Some(Distinct::On(on_expr)) => {
- if !aggr_exprs.is_empty()
- || !group_by_exprs.is_empty()
- || !window_func_exprs.is_empty()
+ Some(Distinct::On(_)) => {
+ if aggr_exprs.is_empty()
+ && group_by_exprs.is_empty()
+ && window_func_exprs.is_empty()
{
- return not_impl_err!(
- "DISTINCT ON expressions with GROUP BY, aggregation or
window functions are not supported "
- );
- }
-
- let on_expr = on_expr
- .into_iter()
- .map(|e| {
- self.sql_expr_to_logical_expr(e, plan.schema(),
planner_context)
- })
- .collect::<Result<Vec<_>>>()?;
+ // Fast path: no aggregation context. Fuse projection
+ // and deduplication into a single DistinctOn over
+ // `base_plan`. The sort attached to DistinctOn via
+ // `with_sort_expr` later normalizes against base_plan,
+ // so a bare ORDER BY alias (e.g. `ORDER BY x` where
+ // SELECT has `a AS x`) must be swapped back to the
+ // underlying input expression first.
+ if !alias_map.is_empty() {
+ order_by_rex = order_by_rex
+ .into_iter()
+ .map(|s| {
+ let expr = substitute_top_level_alias(
+ s.expr.clone(),
+ &alias_map,
+ );
+ s.with_expr(expr)
+ })
+ .collect();
+ }
+ LogicalPlanBuilder::from(base_plan)
+ .distinct_on(on_exprs_post_aggr, select_exprs, None)?
+ .build()?
+ } else {
+ // General path: DistinctOn layered over the post-
+ // aggregate / post-window plan (no extra Projection
+ // node — DistinctOn's lowering wraps each select_expr
+ // in FIRST_VALUE, which acts as the projection).
+ //
+ // The DistinctOn input has the post-aggregate raw
+ // column names (e.g. `max(t.c4)`), not the user-facing
+ // SELECT aliases (`agg2`). ORDER BY may reference
+ // those aliases — substitute them back to the
+ // underlying post-aggregate expression so they
+ // resolve against the DistinctOn input.
+ let select_alias_map: datafusion_common::HashMap<String,
Expr> =
+ select_exprs_post_aggr
+ .iter()
+ .filter_map(|e| match e {
+ Expr::Alias(a) => {
+ Some((a.name.clone(), (*a.expr).clone()))
+ }
+ _ => None,
+ })
+ .collect();
+
+ if !select_alias_map.is_empty() {
+ // Only swap an alias for its underlying expression
+ // when the ORDER BY item is a bare alias name —
+ // matches PostgreSQL's behavior. `b` resolves to
+ // the alias; `b + 0` keeps `b` as the input
+ // column.
+ order_by_rex = order_by_rex
+ .into_iter()
+ .map(|s| {
+ let expr = substitute_top_level_alias(
+ s.expr.clone(),
+ &select_alias_map,
+ );
+ s.with_expr(expr)
+ })
+ .collect();
+ }
- // Build the final plan
- LogicalPlanBuilder::from(base_plan)
- .distinct_on(on_expr, select_exprs, None)?
- .build()
+ LogicalPlanBuilder::from(plan)
Review Comment:
I think this new post-aggregation `DISTINCT ON` path accidentally bypasses
`try_process_unnest`, so SELECT-list `unnest(...)` expressions that currently
work after aggregation start failing once `DISTINCT ON` is added.
Repro:
```sql
WITH t(a,b) AS (VALUES (1,10),(1,20),(2,30))
SELECT DISTINCT ON (a)
a,
unnest(array_agg(b)) AS b
FROM t
GROUP BY a
ORDER BY a;
```
This currently returns:
```text
This feature is not implemented: Unnest should be rewritten to
LogicalPlan::Unnest before type coercion
```
The same aggregate query without `DISTINCT ON` returns rows successfully.
Since this path intentionally replaces the final projection with
`DistinctOn`, it looks like it still needs an equivalent unnest
rewrite/projection step before building `DistinctOn`. Otherwise, we should
probably reject this combination explicitly with a planned limitation error.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]