(datafusion) branch main updated: Implement Substrait Support for `GROUPING SET CUBE` (#18798)

github-bot Tue, 25 Nov 2025 16:48:50 -0800

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 8ab83425c9 Implement Substrait Support for `GROUPING SET CUBE` (#18798)
8ab83425c9 is described below

commit 8ab83425c95035ab10b3e05f6728decb869b6c5c
Author: kosiew <[email protected]>
AuthorDate: Wed Nov 26 08:48:18 2025 +0800

    Implement Substrait Support for `GROUPING SET CUBE` (#18798)
    
    ## Which issue does this PR close?
    
    * Closes #16294.
    
    ## Rationale for this change
    
    This change adds complete Substrait round‑trip support for `GROUPING SET
    CUBE`, allowing logical plans containing cubes to successfully convert
    to and from Substrait. Previously, cube handling in the Substrait
    producer returned a hard `NotImplemented` error, preventing several
    SQLLogicTest cases from running under round‑trip mode.
    
    Supporting `CUBE` brings consistency with existing `ROLLUP` and
    `GROUPING SETS` handling, ensures correct logical plan serialization,
    and enables successful execution of tests that rely on cube semantics.
    
    ### Before
    ```
    ❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52
        Finished `test` profile [unoptimized + debuginfo] target(s) in 0.60s
         Running bin/sqllogictests.rs 
(target/debug/deps/sqllogictests-917e139464eeea33)
    Completed 1 test files in 0 seconds                                         
     External error: 1 errors in file 
/Users/kosiew/GitHub/datafusion/datafusion/sqllogictest/test_files/grouping.slt
    
    1. query failed: DataFusion error: This feature is not implemented: 
GroupingSet CUBE is not yet supported
    
    ```
    
    ### After
    ```
    ❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52
        Finished `test` profile [unoptimized + debuginfo] target(s) in 0.65s
         Running bin/sqllogictests.rs 
(target/debug/deps/sqllogictests-917e139464eeea33)
    Completed 1 test files in 0 seconds
    ```
    
    ## What changes are included in this PR?
    
    * Introduces a shared internal helper `powerset_indices` for efficient
    subset generation.
    * Refactors `powerset` to use DataFusion error types and removes the
    string‑based error.
    * Adds a new `powerset_cloned` function for owned‑value subsets needed
    in the Substrait adapter.
    * Implements full Substrait producer support for `GroupingSet::Cube`
    using `powerset_cloned`.
    * Updates aggregate Substrait translation to correctly assemble grouping
    sets derived from cube expansions.
    * Adds a new Substrait round‑trip test case for `GROUP BY CUBE`.
    
    ## Are these changes tested?
    
    Yes. A new Substrait round‑trip test (`aggregate_grouping_cube`)
    validates the logical plan after translation. Existing
    grouping/aggregate tests continue to pass, covering other grouping‑set
    variants.
    
    ## Are there any user-facing changes?
    
    There are no user‑facing API changes. The behavior of `GROUP BY CUBE` is
    now consistent under Substrait round‑trip mode, which may allow
    previously failing queries to succeed.
    
    ## LLM-generated code disclosure
    
    This PR includes LLM‑generated code and comments. All LLM‑generated
    content has been manually reviewed and tested.
---
 datafusion/expr/src/utils.rs                       | 46 ++++++++++++----------
 .../src/logical_plan/producer/rel/aggregate_rel.rs | 30 +++++++++-----
 .../tests/cases/roundtrip_logical_plan.rs          | 20 ++++++++++
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 8e8483bc2a..cffeccb6e8 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -34,8 +34,8 @@ use datafusion_common::tree_node::{
 };
 use datafusion_common::utils::get_at_indices;
 use datafusion_common::{
-    internal_err, plan_datafusion_err, plan_err, Column, DFSchema, 
DFSchemaRef, HashMap,
-    Result, TableReference,
+    internal_err, plan_err, Column, DFSchema, DFSchemaRef, HashMap, Result,
+    TableReference,
 };
 
 #[cfg(not(feature = "sql"))]
@@ -66,6 +66,23 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> 
Result<usize> {
     }
 }
 
+/// Internal helper that generates indices for powerset subsets using bitset 
iteration.
+/// Returns an iterator of index vectors, where each vector contains the 
indices
+/// of elements to include in that subset.
+fn powerset_indices(len: usize) -> impl Iterator<Item = Vec<usize>> {
+    (0..(1 << len)).map(move |mask| {
+        let mut indices = vec![];
+        let mut bitset = mask;
+        while bitset > 0 {
+            let rightmost: u64 = bitset & !(bitset - 1);
+            let idx = rightmost.trailing_zeros() as usize;
+            indices.push(idx);
+            bitset &= bitset - 1;
+        }
+        indices
+    })
+}
+
 /// The [power set] (or powerset) of a set S is the set of all subsets of S, \
 /// including the empty set and S itself.
 ///
@@ -83,26 +100,14 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> 
Result<usize> {
 ///  and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y, 
z}, {x, y, z}}.
 ///
 /// [power set]: https://en.wikipedia.org/wiki/Power_set
-fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>, String> {
+pub fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>> {
     if slice.len() >= 64 {
-        return Err("The size of the set must be less than 64.".into());
+        return plan_err!("The size of the set must be less than 64");
     }
 
-    let mut v = Vec::new();
-    for mask in 0..(1 << slice.len()) {
-        let mut ss = vec![];
-        let mut bitset = mask;
-        while bitset > 0 {
-            let rightmost: u64 = bitset & !(bitset - 1);
-            let idx = rightmost.trailing_zeros();
-            let item = slice.get(idx as usize).unwrap();
-            ss.push(item);
-            // zero the trailing bit
-            bitset &= bitset - 1;
-        }
-        v.push(ss);
-    }
-    Ok(v)
+    Ok(powerset_indices(slice.len())
+        .map(|indices| indices.iter().map(|&idx| &slice[idx]).collect())
+        .collect())
 }
 
 /// check the number of expressions contained in the grouping_set
@@ -207,8 +212,7 @@ pub fn enumerate_grouping_sets(group_expr: Vec<Expr>) -> 
Result<Vec<Expr>> {
                     grouping_sets.iter().map(|e| e.iter().collect()).collect()
                 }
                 Expr::GroupingSet(GroupingSet::Cube(group_exprs)) => {
-                    let grouping_sets = powerset(group_exprs)
-                        .map_err(|e| plan_datafusion_err!("{}", e))?;
+                    let grouping_sets = powerset(group_exprs)?;
                     check_grouping_sets_size_limit(grouping_sets.len())?;
                     grouping_sets
                 }
diff --git 
a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs 
b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
index 917959ea7d..9861401cb1 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
@@ -18,8 +18,9 @@
 use crate::logical_plan::producer::{
     from_aggregate_function, substrait_field_ref, SubstraitProducer,
 };
-use datafusion::common::{internal_err, not_impl_err, DFSchemaRef, 
DataFusionError};
+use datafusion::common::{internal_err, not_impl_err, DFSchemaRef};
 use datafusion::logical_expr::expr::Alias;
+use datafusion::logical_expr::utils::powerset;
 use datafusion::logical_expr::{Aggregate, Distinct, Expr, GroupingSet};
 use substrait::proto::aggregate_rel::{Grouping, Measure};
 use substrait::proto::rel::RelType;
@@ -91,10 +92,22 @@ pub fn to_substrait_groupings(
     let groupings = match exprs.len() {
         1 => match &exprs[0] {
             Expr::GroupingSet(gs) => match gs {
-                GroupingSet::Cube(_) => Err(DataFusionError::NotImplemented(
-                    "GroupingSet CUBE is not yet supported".to_string(),
-                )),
-                GroupingSet::GroupingSets(sets) => Ok(sets
+                GroupingSet::Cube(set) => {
+                    // Generate power set of grouping expressions
+                    let cube_sets = powerset(set)?;
+                    cube_sets
+                        .iter()
+                        .map(|set| {
+                            parse_flat_grouping_exprs(
+                                producer,
+                                &set.iter().map(|v| 
(*v).clone()).collect::<Vec<_>>(),
+                                schema,
+                                &mut ref_group_exprs,
+                            )
+                        })
+                        .collect::<datafusion::common::Result<Vec<_>>>()
+                }
+                GroupingSet::GroupingSets(sets) => sets
                     .iter()
                     .map(|set| {
                         parse_flat_grouping_exprs(
@@ -104,14 +117,13 @@ pub fn to_substrait_groupings(
                             &mut ref_group_exprs,
                         )
                     })
-                    .collect::<datafusion::common::Result<Vec<_>>>()?),
+                    .collect::<datafusion::common::Result<Vec<_>>>(),
                 GroupingSet::Rollup(set) => {
                     let mut sets: Vec<Vec<Expr>> = vec![vec![]];
                     for i in 0..set.len() {
                         sets.push(set[..=i].to_vec());
                     }
-                    Ok(sets
-                        .iter()
+                    sets.iter()
                         .rev()
                         .map(|set| {
                             parse_flat_grouping_exprs(
@@ -121,7 +133,7 @@ pub fn to_substrait_groupings(
                                 &mut ref_group_exprs,
                             )
                         })
-                        .collect::<datafusion::common::Result<Vec<_>>>()?)
+                        .collect::<datafusion::common::Result<Vec<_>>>()
                 }
             },
             _ => Ok(vec![parse_flat_grouping_exprs(
diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs 
b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
index 56fa767b61..0ce2c43570 100644
--- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
@@ -322,6 +322,26 @@ async fn aggregate_grouping_rollup() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn aggregate_grouping_cube() -> Result<()> {
+    let plan = generate_plan_from_sql(
+        "SELECT a, c, avg(b) FROM data GROUP BY CUBE (a, c)",
+        true,
+        true,
+    )
+    .await?;
+
+    assert_snapshot!(
+    plan,
+    @r#"
+        Projection: data.a, data.c, avg(data.b)
+          Aggregate: groupBy=[[GROUPING SETS ((), (data.a), (data.c), (data.a, 
data.c))]], aggr=[[avg(data.b)]]
+            TableScan: data projection=[a, b, c]
+        "#
+    );
+    Ok(())
+}
+
 #[tokio::test]
 async fn multilayer_aggregate() -> Result<()> {
     let plan = generate_plan_from_sql(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: Implement Substrait Support for `GROUPING SET CUBE` (#18798)

Reply via email to