Rachelint commented on code in PR #16136:
URL: https://github.com/apache/datafusion/pull/16136#discussion_r2112691325


##########
datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive/mod.rs:
##########
@@ -116,42 +122,60 @@ where
 {
     fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> 
Result<()> {
         assert_eq!(cols.len(), 1);
+        let col = cols[0].as_primitive::<T>();
+
         groups.clear();
+        self.append_row_indices.clear();
 
-        for v in cols[0].as_primitive::<T>() {
+        let mut num_total_groups = self.values.len();
+        for (row_index, v) in col.iter().enumerate() {
             let group_id = match v {
                 None => *self.null_group.get_or_insert_with(|| {
-                    let group_id = self.values.len();
-                    self.values.push(Default::default());
+                    let group_id = num_total_groups;
+                    self.append_row_indices.push(row_index as u32);
+                    num_total_groups += 1;
                     group_id
                 }),
                 Some(key) => {
                     let state = &self.random_state;
                     let hash = key.hash(state);
                     let insert = self.map.entry(
                         hash,
-                        |&(g, _)| unsafe { 
self.values.get_unchecked(g).is_eq(key) },
-                        |&(_, h)| h,
+                        |&(_, v)| v.is_eq(key),
+                        |&(_, v)| v.hash(state),
                     );
 
                     match insert {
                         hashbrown::hash_table::Entry::Occupied(o) => o.get().0,
                         hashbrown::hash_table::Entry::Vacant(v) => {
-                            let g = self.values.len();
-                            v.insert((g, hash));
-                            self.values.push(key);
+                            let g = num_total_groups;
+                            v.insert((g, key));
+                            self.append_row_indices.push(row_index as u32);
+                            num_total_groups += 1;
                             g
                         }
                     }
                 }
             };
             groups.push(group_id)
         }
+
+        // If all are new groups, we just extend it
+        if self.append_row_indices.len() == col.len() {
+            self.values.extend_from_slice(col.values());
+        } else {
+            let col_values = col.values();
+            for &row_index in self.append_row_indices.iter() {

Review Comment:
   Actually... I found no obvious improvement when switching to `extend`...
   The bottleneck still the `hashtable`, I think it is better to just keep the 
original `push` logic because it may be simpler, and actually efficient enough.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to