neilconway commented on code in PR #21058:
URL: https://github.com/apache/datafusion/pull/21058#discussion_r2991557200


##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -1937,15 +1937,53 @@ fn evaluate_optional(
         .collect()
 }
 
-fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result<ArrayRef> {
+fn group_id_array(
+    group: &[bool],
+    group_ordinal: usize,
+    batch: &RecordBatch,
+) -> Result<ArrayRef> {
     if group.len() > 64 {
         return not_impl_err!(
             "Grouping sets with more than 64 columns are not supported"
         );
     }
+    let width_bits = if group.len() <= 8 {
+        8
+    } else if group.len() <= 16 {
+        16
+    } else if group.len() <= 32 {
+        32
+    } else {
+        64
+    };
+    let extra_bits = width_bits - group.len();
+    if extra_bits == 0 && group_ordinal > 0 {
+        return not_impl_err!(
+            "Duplicate grouping sets with more than {} grouping columns are 
not supported",

Review Comment:
   If `group.len() == 64`, seems like this will produce an inaccurate error 
message ("more than 64" vs "64").



##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -1937,15 +1937,53 @@ fn evaluate_optional(
         .collect()
 }
 
-fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result<ArrayRef> {
+fn group_id_array(
+    group: &[bool],
+    group_ordinal: usize,
+    batch: &RecordBatch,
+) -> Result<ArrayRef> {
     if group.len() > 64 {
         return not_impl_err!(
             "Grouping sets with more than 64 columns are not supported"
         );
     }
+    let width_bits = if group.len() <= 8 {
+        8
+    } else if group.len() <= 16 {
+        16
+    } else if group.len() <= 32 {
+        32
+    } else {
+        64
+    };
+    let extra_bits = width_bits - group.len();
+    if extra_bits == 0 && group_ordinal > 0 {
+        return not_impl_err!(
+            "Duplicate grouping sets with more than {} grouping columns are 
not supported",
+            width_bits
+        );
+    }
+    if extra_bits < usize::BITS as usize {
+        let max_group_ordinal = 1usize << extra_bits;
+        if group_ordinal >= max_group_ordinal {
+            return not_impl_err!(
+                "Duplicate grouping sets exceed the supported grouping id 
capacity"
+            );
+        }
+    }
     let group_id = group.iter().fold(0u64, |acc, &is_null| {
         (acc << 1) | if is_null { 1 } else { 0 }
     });
+    let group_id = if group.len() == 64 {

Review Comment:
   Personally, I would opt for tightening the check on `group.len() > 64` to 
`group.len() >= 64` at the top of the function, which would simplify this logic.



##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -1972,6 +2010,7 @@ pub fn evaluate_group_by(
     group_by: &PhysicalGroupBy,
     batch: &RecordBatch,
 ) -> Result<Vec<Vec<ArrayRef>>> {
+    let mut group_ordinals: HashMap<Vec<bool>, usize> = HashMap::new();

Review Comment:
   `HashMap<&[bool], usize>` instead?



##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -1937,15 +1937,53 @@ fn evaluate_optional(
         .collect()
 }
 
-fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result<ArrayRef> {
+fn group_id_array(

Review Comment:
   I think this function would benefit from some commentary: you need to read 
the implementation pretty carefully to understand the encoded bit format.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to