Jefffrey commented on code in PR #18981:
URL: https://github.com/apache/datafusion/pull/18981#discussion_r2580553561


##########
datafusion/common/src/hash_utils.rs:
##########
@@ -1323,4 +1477,127 @@ mod tests {
         // 67 vs 67
         assert_eq!(hashes[0], hashes[4]);
     }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sliced_run_array() -> Result<()> {
+        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
+        let array = Arc::new(RunArray::try_new(&run_ends, 
values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let mut full_hashes = vec![0; array.len()];
+        create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            &mut full_hashes,
+        )?;
+
+        let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef;
+        let sliced_array = array_ref.slice(2, 3);
+
+        let mut sliced_hashes = vec![0; sliced_array.len()];
+        create_hashes(
+            std::slice::from_ref(&sliced_array),
+            &random_state,
+            &mut sliced_hashes,
+        )?;
+
+        assert_eq!(sliced_hashes.len(), 3);
+        assert_eq!(sliced_hashes[0], sliced_hashes[1]);
+        assert_eq!(sliced_hashes[1], sliced_hashes[2]);
+        assert_eq!(sliced_hashes[0], full_hashes[2]);
+        assert_eq!(sliced_hashes[1], full_hashes[3]);
+        assert_eq!(sliced_hashes[2], full_hashes[4]);

Review Comment:
   ```suggestion
           assert_eq!(&sliced_hashes, &full_hashes[2..5]);
   ```
   
   Makes it more obvious it's a subset.



##########
datafusion/common/src/hash_utils.rs:
##########
@@ -1323,4 +1477,127 @@ mod tests {
         // 67 vs 67
         assert_eq!(hashes[0], hashes[4]);
     }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sliced_run_array() -> Result<()> {
+        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
+        let array = Arc::new(RunArray::try_new(&run_ends, 
values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let mut full_hashes = vec![0; array.len()];
+        create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            &mut full_hashes,
+        )?;
+
+        let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef;
+        let sliced_array = array_ref.slice(2, 3);
+
+        let mut sliced_hashes = vec![0; sliced_array.len()];
+        create_hashes(
+            std::slice::from_ref(&sliced_array),
+            &random_state,
+            &mut sliced_hashes,
+        )?;
+
+        assert_eq!(sliced_hashes.len(), 3);
+        assert_eq!(sliced_hashes[0], sliced_hashes[1]);
+        assert_eq!(sliced_hashes[1], sliced_hashes[2]);
+        assert_eq!(sliced_hashes[0], full_hashes[2]);
+        assert_eq!(sliced_hashes[1], full_hashes[3]);
+        assert_eq!(sliced_hashes[2], full_hashes[4]);
+
+        Ok(())
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn test_sliced_run_array_only_hashes_needed_values() -> Result<()> {

Review Comment:
   How does this test verify we hash only the needed values?



##########
datafusion/common/src/hash_utils.rs:
##########
@@ -484,6 +484,90 @@ fn hash_fixed_list_array(
     Ok(())
 }
 
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_run_array<R: RunEndIndexType>(
+    array: &RunArray<R>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) -> Result<()> {
+    // We find the relevant runs that cover potentially sliced arrays, so we 
can only hash those
+    // values. Then we find the runs that refer to the original runs and 
ensure that we apply
+    // hashes correctly to the sliced, whether sliced at the start, end, or 
both.
+    let array_offset = array.offset();
+    let array_len = array.len();
+
+    if array_len == 0 {
+        return Ok(());
+    }
+
+    let run_ends = array.run_ends();
+    let run_ends_values = run_ends.values();
+    let values = array.values();
+
+    let start_physical_index = array.get_start_physical_index();
+    // get_end_physical_index returns the inclusive last index, but we need 
the exclusive range end
+    // for the operations we use below.
+    let end_physical_index = array.get_end_physical_index() + 1;
+
+    let sliced_values = values.slice(
+        start_physical_index,
+        end_physical_index - start_physical_index,
+    );
+    let mut values_hashes = vec![0u64; sliced_values.len()];
+    create_hashes(
+        std::slice::from_ref(&sliced_values),
+        random_state,
+        &mut values_hashes,
+    )?;
+
+    let mut logical_position = 0;
+    for (adjusted_physical_index, &absolute_run_end) in run_ends_values
+        [start_physical_index..end_physical_index]
+        .iter()
+        .enumerate()
+    {
+        let is_null_value = sliced_values.is_null(adjusted_physical_index);
+        let absolute_run_end = absolute_run_end.as_usize();
+
+        let start_in_slice = if absolute_run_end > array_offset {
+            logical_position
+        } else {
+            continue;
+        };

Review Comment:
   In what cases can this `else` branch occur? Is it for when the slice starts
mid-run?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to