This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 363d3fad2 Faster dictionary sort (#2993)
363d3fad2 is described below
commit 363d3fad2d279a8f641414a86f9c89d7c37193cf
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Tue Nov 1 11:28:31 2022 +1300
Faster dictionary sort (#2993)
---
arrow/src/compute/kernels/sort.rs | 51 ++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 28 deletions(-)
diff --git a/arrow/src/compute/kernels/sort.rs
b/arrow/src/compute/kernels/sort.rs
index a10e674ac..97b0758e5 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -24,7 +24,6 @@ use crate::datatypes::*;
use crate::downcast_dictionary_array;
use crate::error::{ArrowError, Result};
use std::cmp::Ordering;
-use std::collections::HashMap;
use TimeUnit::*;
/// Sort the `ArrayRef` using `SortOptions`.
@@ -114,7 +113,7 @@ where
}
}
-fn cmp<T>(l: T, r: T) -> std::cmp::Ordering
+fn cmp<T>(l: T, r: T) -> Ordering
where
T: Ord,
{
@@ -340,13 +339,13 @@ pub fn sort_to_indices(
dt if DataType::is_primitive(dt) => {
let dict_values = values.values();
let sorted_value_indices =
sort_to_indices(dict_values, value_options, None)?;
- let value_indices_map =
prepare_indices_map(&sorted_value_indices);
+ let value_indices_map =
sorted_rank(&sorted_value_indices);
sort_primitive_dictionary::<_, _>(values,
&value_indices_map, v, n, options, limit, cmp)
},
DataType::Utf8 => {
let dict_values = values.values();
let sorted_value_indices =
sort_to_indices(dict_values, value_options, None)?;
- let value_indices_map =
prepare_indices_map(&sorted_value_indices);
+ let value_indices_map =
sorted_rank(&sorted_value_indices);
sort_string_dictionary::<_>(values,
&value_indices_map, v, n, &options, limit)
},
t => return Err(ArrowError::ComputeError(format!(
@@ -490,8 +489,8 @@ fn sort_primitive<T, F>(
) -> UInt32Array
where
T: ArrowPrimitiveType,
- T::Native: std::cmp::PartialOrd,
- F: Fn(T::Native, T::Native) -> std::cmp::Ordering,
+ T::Native: PartialOrd,
+ F: Fn(T::Native, T::Native) -> Ordering,
{
// create tuples that are used for sorting
let valids = {
@@ -504,24 +503,22 @@ where
sort_primitive_inner(values.len(), null_indices, cmp, options, limit,
valids)
}
-/// A helper function used to convert sorted value indices to a map that we
can look up sorted order
-/// for a value index later.
-fn prepare_indices_map(sorted_value_indices: &UInt32Array) -> HashMap<usize,
u32> {
- sorted_value_indices
- .into_iter()
- .enumerate()
- .map(|(idx, index)| {
- // Indices don't have None value
- let index = index.unwrap();
- (index as usize, idx as u32)
- })
- .collect::<HashMap<usize, u32>>()
+/// Given a list of indices that yield a sorted order, returns the ordered
+/// rank of each index
+///
+/// e.g. [2, 4, 3, 1, 0] -> [4, 3, 0, 2, 1]
+fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec<u32> {
+ assert_eq!(sorted_value_indices.null_count(), 0);
+ let sorted_indices = sorted_value_indices.values();
+ let mut out: Vec<_> = (0..sorted_indices.len() as u32).collect();
+ out.sort_unstable_by_key(|x| sorted_indices[*x as usize]);
+ out
}
/// Sort dictionary encoded primitive values
fn sort_primitive_dictionary<K, F>(
values: &DictionaryArray<K>,
- value_indices_map: &HashMap<usize, u32>,
+ value_indices_map: &[u32],
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: SortOptions,
@@ -530,7 +527,7 @@ fn sort_primitive_dictionary<K, F>(
) -> UInt32Array
where
K: ArrowDictionaryKeyType,
- F: Fn(u32, u32) -> std::cmp::Ordering,
+ F: Fn(u32, u32) -> Ordering,
{
let keys: &PrimitiveArray<K> = values.keys();
@@ -539,8 +536,7 @@ where
.into_iter()
.map(|index| {
let key: K::Native = keys.value(index as usize);
- let value_order = value_indices_map.get(&key.as_usize()).unwrap();
- (index, *value_order)
+ (index, value_indices_map[key.as_usize()])
})
.collect::<Vec<(u32, u32)>>();
@@ -558,8 +554,8 @@ fn sort_primitive_inner<T, F>(
) -> UInt32Array
where
T: ArrowNativeType,
- T: std::cmp::PartialOrd,
- F: Fn(T, T) -> std::cmp::Ordering,
+ T: PartialOrd,
+ F: Fn(T, T) -> Ordering,
{
let mut nulls = null_indices;
@@ -651,7 +647,7 @@ fn sort_string<Offset: OffsetSizeTrait>(
/// Sort dictionary encoded strings
fn sort_string_dictionary<T: ArrowDictionaryKeyType>(
values: &DictionaryArray<T>,
- value_indices_map: &HashMap<usize, u32>,
+ value_indices_map: &[u32],
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: &SortOptions,
@@ -664,8 +660,7 @@ fn sort_string_dictionary<T: ArrowDictionaryKeyType>(
.into_iter()
.map(|index| {
let key: T::Native = keys.value(index as usize);
- let value_order = value_indices_map.get(&key.as_usize()).unwrap();
- (index, *value_order)
+ (index, value_indices_map[key.as_usize()])
})
.collect::<Vec<(u32, u32)>>();
@@ -723,7 +718,7 @@ fn sort_list<S, T>(
where
S: OffsetSizeTrait,
T: ArrowPrimitiveType,
- T::Native: std::cmp::PartialOrd,
+ T::Native: PartialOrd,
{
sort_list_inner::<S>(values, value_indices, null_indices, options, limit)
}