This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8ab83425c9 Implement Substrait Support for `GROUPING SET CUBE` (#18798)
8ab83425c9 is described below
commit 8ab83425c95035ab10b3e05f6728decb869b6c5c
Author: kosiew <[email protected]>
AuthorDate: Wed Nov 26 08:48:18 2025 +0800
Implement Substrait Support for `GROUPING SET CUBE` (#18798)
## Which issue does this PR close?
* Closes #16294.
## Rationale for this change
This change adds complete Substrait round‑trip support for `GROUPING SET
CUBE`, allowing logical plans containing cubes to successfully convert
to and from Substrait. Previously, cube handling in the Substrait
producer returned a hard `NotImplemented` error, preventing several
SQLLogicTest cases from running under round‑trip mode.
Supporting `CUBE` brings consistency with existing `ROLLUP` and
`GROUPING SETS` handling, ensures correct logical plan serialization,
and enables successful execution of tests that rely on cube semantics.
### Before
```
❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52
Finished `test` profile [unoptimized + debuginfo] target(s) in 0.60s
Running bin/sqllogictests.rs
(target/debug/deps/sqllogictests-917e139464eeea33)
Completed 1 test files in 0 seconds
External error: 1 errors in file
/Users/kosiew/GitHub/datafusion/datafusion/sqllogictest/test_files/grouping.slt
1. query failed: DataFusion error: This feature is not implemented:
GroupingSet CUBE is not yet supported
```
### After
```
❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52
Finished `test` profile [unoptimized + debuginfo] target(s) in 0.65s
Running bin/sqllogictests.rs
(target/debug/deps/sqllogictests-917e139464eeea33)
Completed 1 test files in 0 seconds
```
## What changes are included in this PR?
* Introduces a shared internal helper `powerset_indices` for efficient
subset generation.
* Refactors `powerset` to use DataFusion error types and removes the
string‑based error.
* Adds a new `powerset_cloned` function for owned‑value subsets needed
in the Substrait adapter.
* Implements full Substrait producer support for `GroupingSet::Cube`
using `powerset_cloned`.
* Updates aggregate Substrait translation to correctly assemble grouping
sets derived from cube expansions.
* Adds a new Substrait round‑trip test case for `GROUP BY CUBE`.
## Are these changes tested?
Yes. A new Substrait round‑trip test (`aggregate_grouping_cube`)
validates the logical plan after translation. Existing
grouping/aggregate tests continue to pass, covering other grouping‑set
variants.
## Are there any user-facing changes?
There are no user‑facing API changes. The behavior of `GROUP BY CUBE` is
now consistent under Substrait round‑trip mode, which may allow
previously failing queries to succeed.
## LLM-generated code disclosure
This PR includes LLM‑generated code and comments. All LLM‑generated
content has been manually reviewed and tested.
---
datafusion/expr/src/utils.rs | 46 ++++++++++++----------
.../src/logical_plan/producer/rel/aggregate_rel.rs | 30 +++++++++-----
.../tests/cases/roundtrip_logical_plan.rs | 20 ++++++++++
3 files changed, 66 insertions(+), 30 deletions(-)
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 8e8483bc2a..cffeccb6e8 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -34,8 +34,8 @@ use datafusion_common::tree_node::{
};
use datafusion_common::utils::get_at_indices;
use datafusion_common::{
- internal_err, plan_datafusion_err, plan_err, Column, DFSchema,
DFSchemaRef, HashMap,
- Result, TableReference,
+ internal_err, plan_err, Column, DFSchema, DFSchemaRef, HashMap, Result,
+ TableReference,
};
#[cfg(not(feature = "sql"))]
@@ -66,6 +66,23 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) ->
Result<usize> {
}
}
+/// Internal helper that generates indices for powerset subsets using bitset
iteration.
+/// Returns an iterator of index vectors, where each vector contains the
indices
+/// of elements to include in that subset.
+fn powerset_indices(len: usize) -> impl Iterator<Item = Vec<usize>> {
+ (0..(1 << len)).map(move |mask| {
+ let mut indices = vec![];
+ let mut bitset = mask;
+ while bitset > 0 {
+ let rightmost: u64 = bitset & !(bitset - 1);
+ let idx = rightmost.trailing_zeros() as usize;
+ indices.push(idx);
+ bitset &= bitset - 1;
+ }
+ indices
+ })
+}
+
/// The [power set] (or powerset) of a set S is the set of all subsets of S, \
/// including the empty set and S itself.
///
@@ -83,26 +100,14 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) ->
Result<usize> {
/// and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y,
z}, {x, y, z}}.
///
/// [power set]: https://en.wikipedia.org/wiki/Power_set
-fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>, String> {
+pub fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>> {
if slice.len() >= 64 {
- return Err("The size of the set must be less than 64.".into());
+ return plan_err!("The size of the set must be less than 64");
}
- let mut v = Vec::new();
- for mask in 0..(1 << slice.len()) {
- let mut ss = vec![];
- let mut bitset = mask;
- while bitset > 0 {
- let rightmost: u64 = bitset & !(bitset - 1);
- let idx = rightmost.trailing_zeros();
- let item = slice.get(idx as usize).unwrap();
- ss.push(item);
- // zero the trailing bit
- bitset &= bitset - 1;
- }
- v.push(ss);
- }
- Ok(v)
+ Ok(powerset_indices(slice.len())
+ .map(|indices| indices.iter().map(|&idx| &slice[idx]).collect())
+ .collect())
}
/// check the number of expressions contained in the grouping_set
@@ -207,8 +212,7 @@ pub fn enumerate_grouping_sets(group_expr: Vec<Expr>) ->
Result<Vec<Expr>> {
grouping_sets.iter().map(|e| e.iter().collect()).collect()
}
Expr::GroupingSet(GroupingSet::Cube(group_exprs)) => {
- let grouping_sets = powerset(group_exprs)
- .map_err(|e| plan_datafusion_err!("{}", e))?;
+ let grouping_sets = powerset(group_exprs)?;
check_grouping_sets_size_limit(grouping_sets.len())?;
grouping_sets
}
diff --git
a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
index 917959ea7d..9861401cb1 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
@@ -18,8 +18,9 @@
use crate::logical_plan::producer::{
from_aggregate_function, substrait_field_ref, SubstraitProducer,
};
-use datafusion::common::{internal_err, not_impl_err, DFSchemaRef,
DataFusionError};
+use datafusion::common::{internal_err, not_impl_err, DFSchemaRef};
use datafusion::logical_expr::expr::Alias;
+use datafusion::logical_expr::utils::powerset;
use datafusion::logical_expr::{Aggregate, Distinct, Expr, GroupingSet};
use substrait::proto::aggregate_rel::{Grouping, Measure};
use substrait::proto::rel::RelType;
@@ -91,10 +92,22 @@ pub fn to_substrait_groupings(
let groupings = match exprs.len() {
1 => match &exprs[0] {
Expr::GroupingSet(gs) => match gs {
- GroupingSet::Cube(_) => Err(DataFusionError::NotImplemented(
- "GroupingSet CUBE is not yet supported".to_string(),
- )),
- GroupingSet::GroupingSets(sets) => Ok(sets
+ GroupingSet::Cube(set) => {
+ // Generate power set of grouping expressions
+ let cube_sets = powerset(set)?;
+ cube_sets
+ .iter()
+ .map(|set| {
+ parse_flat_grouping_exprs(
+ producer,
+ &set.iter().map(|v|
(*v).clone()).collect::<Vec<_>>(),
+ schema,
+ &mut ref_group_exprs,
+ )
+ })
+ .collect::<datafusion::common::Result<Vec<_>>>()
+ }
+ GroupingSet::GroupingSets(sets) => sets
.iter()
.map(|set| {
parse_flat_grouping_exprs(
@@ -104,14 +117,13 @@ pub fn to_substrait_groupings(
&mut ref_group_exprs,
)
})
- .collect::<datafusion::common::Result<Vec<_>>>()?),
+ .collect::<datafusion::common::Result<Vec<_>>>(),
GroupingSet::Rollup(set) => {
let mut sets: Vec<Vec<Expr>> = vec![vec![]];
for i in 0..set.len() {
sets.push(set[..=i].to_vec());
}
- Ok(sets
- .iter()
+ sets.iter()
.rev()
.map(|set| {
parse_flat_grouping_exprs(
@@ -121,7 +133,7 @@ pub fn to_substrait_groupings(
&mut ref_group_exprs,
)
})
- .collect::<datafusion::common::Result<Vec<_>>>()?)
+ .collect::<datafusion::common::Result<Vec<_>>>()
}
},
_ => Ok(vec![parse_flat_grouping_exprs(
diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
index 56fa767b61..0ce2c43570 100644
--- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
@@ -322,6 +322,26 @@ async fn aggregate_grouping_rollup() -> Result<()> {
Ok(())
}
+#[tokio::test]
+async fn aggregate_grouping_cube() -> Result<()> {
+ let plan = generate_plan_from_sql(
+ "SELECT a, c, avg(b) FROM data GROUP BY CUBE (a, c)",
+ true,
+ true,
+ )
+ .await?;
+
+ assert_snapshot!(
+ plan,
+ @r#"
+ Projection: data.a, data.c, avg(data.b)
+ Aggregate: groupBy=[[GROUPING SETS ((), (data.a), (data.c), (data.a,
data.c))]], aggr=[[avg(data.b)]]
+ TableScan: data projection=[a, b, c]
+ "#
+ );
+ Ok(())
+}
+
#[tokio::test]
async fn multilayer_aggregate() -> Result<()> {
let plan = generate_plan_from_sql(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]