gruuya commented on PR #7180:
URL: 
https://github.com/apache/arrow-datafusion/pull/7180#issuecomment-1665730913

   Fwiw, I was able to narrow down the reason why that [last test is 
failing](https://github.com/apache/arrow-datafusion/actions/runs/5762302725/job/15622067686?pr=7180#step:6:2911);
 TLDR: unstable sort and changed batch sizes result in different order of items 
with the same value in the sort column between main and this PR.
   
   ## Details
   
   The repro involves an array 1 with 100 elements (this is basically column C3 
from the [test 
file](https://github.com/apache/arrow-testing/blob/e81d0c6de35948b3be7984af8e00413b314cde6e/data/csv/aggregate_test_100.csv))
 and array 2 with all elements from array 1 except one (which doesn't affect 
the output, namely 120)
   ```rust
   use arrow::compute::SortOptions;
   use arrow::compute::{lexsort_to_indices, SortColumn};
   use arrow_array::{ArrayRef, Int32Array};
   let options = SortOptions {
       descending: false,
       nulls_first: false,
   };
   let get_indices = |sort_column: SortColumn, fetch: Option<usize>| -> 
Vec<u32> {
       lexsort_to_indices(&vec![sort_column], fetch)
           .unwrap()
           .into_iter()
           .map(|maybe_index| maybe_index.unwrap_or_else(|| panic!("No 
index!")))
           .collect::<Vec<u32>>()
   };
   let array_1: ArrayRef = Arc::new(Int32Array::from(vec![
       1, -40, 29, -85, -82, -111, 104, 13, 38, -38, 57, -54, 112, 113, 54, 
103, 49,
       -98, 77, 97, -56, -99, 36, -53, -29, -25, 123, -31, 45, 17, 97, -60, 36, 
-5,
       13, 41, 93, 73, -2, 22, 63, 102, -8, 17, 52, 68, 31, -24, 65, 125, 17, 
-106,
       -59, 55, -60, -76, 73, -117, -101, 62, -79, 68, 70, -61, 74, 122, 71, 
-94,
       -72, 71, 96, -48, -56, 52, -5, 12, 64, -90, -86, -117, 14, 29, -59, 83, 
-12,
       3, -72, -107, 118, 97, -101, -43, -101, -44, 5, 120, -95, 123, 47, 30,
   ]));
   let sort_column_1 = SortColumn {
       values: array_1,
       options: Some(options.clone()),
   };
   let indices_1_no_fetch = get_indices(sort_column_1.clone(), None);
   let indices_1_fetch_5 = get_indices(sort_column_1, Some(5));
   println!("Indices for sorting array 1");
   println!("{:#?}", indices_1_no_fetch[0..5].to_vec());
   println!("{:#?}", indices_1_fetch_5);
   
   let array_2: ArrayRef = Arc::new(Int32Array::from(vec![
       1, -40, 29, -85, -82, -111, 104, 13, 38, -38, 57, -54, 112, 113, 54, 
103, 49,
       -98, 77, 97, -56, -99, 36, -53, -29, -25, 123, -31, 45, 17, 97, -60, 36, 
-5,
       13, 41, 93, 73, -2, 22, 63, 102, -8, 17, 52, 68, 31, -24, 65, 125, 17, 
-106,
       -59, 55, -60, -76, 73, -117, -101, 62, -79, 68, 70, -61, 74, 122, 71, 
-94,
       -72, 71, 96, -48, -56, 52, -5, 12, 64, -90, -86, -117, 14, 29, -59, 83, 
-12,
       3, -72, -107, 118, 97, -101, -43, -101, -44, 5, -95, 123, 47, 30,
   ]));
   let sort_column_2 = SortColumn {
       values: array_2,
       options: Some(options),
   };
   let indices_2_no_fetch = get_indices(sort_column_2.clone(), None);
   let indices_2_fetch_5 = get_indices(sort_column_2, Some(5));
   println!("Indices for sorting array 2");
   println!("{:#?}", indices_2_no_fetch[0..5].to_vec());
   println!("{:#?}", indices_2_fetch_5);
   ```
   
   This prints out:
   ```text
   Indices for sorting array 1
   [
       57,
       79,
       5,
       87,
       51,
   ]
   [
       57,
       79,
       5,
       87,
       51,
   ]
   Indices for sorting array 2
   [
       57,
       79,
       5,
       87,
       51,
   ]
   [
       79,
       57,
       5,
       87,
       51,
   ]
   ```
   Note that in the case of the second array the first two indices are swapped 
when setting some fetch size, as then the [unstable 
sort](https://github.com/apache/arrow-rs/blob/master/arrow-ord/src/sort.rs#L711)
 gets used, and the two smallest array values are the same (-117).
   
   ##
   
   Perhaps it's ok to change the test output to match the one being obtained 
now?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to