This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 6fa9c1ad11 Optimize hashing for StringView and ByteView (15-70%
faster) (#19374)
6fa9c1ad11 is described below
commit 6fa9c1ad11d234140006dc9a71da3486c4488aaa
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Dec 20 03:06:03 2025 -0500
Optimize hashing for StringView and ByteView (15-70% faster) (#19374)
## Which issue does this PR close?
- builds on https://github.com/apache/datafusion/pull/19373
- part of https://github.com/apache/datafusion/issues/18411
- Broken out of https://github.com/apache/datafusion/pull/19344
- Closes https://github.com/apache/datafusion/pull/19344
## Rationale for this change
While looking at performance as part of
https://github.com/apache/datafusion/issues/18411, I noticed we could
speed up string view hashing by optimizing for small strings
## What changes are included in this PR?
Optimize StringView hashing, specifically by using the inlined view for
short strings
## Are these changes tested?
Functionally by existing coverage
Performance by benchmarks (added in
https://github.com/apache/datafusion/pull/19373) which show
* 15%-20% faster for mixed short/long strings
* 50%-70% faster for "short" arrays where we know there are no strings
longer than 12 bytes
```
utf8_view (small): multiple, no nulls 1.00 47.9±1.71µs ?
?/sec 4.00 191.6±1.15µs ? ?/sec
utf8_view (small): multiple, nulls 1.00 78.4±0.48µs ?
?/sec 3.08 241.6±1.11µs ? ?/sec
utf8_view (small): single, no nulls 1.00 13.9±0.19µs ?
?/sec 4.29 59.7±0.30µs ? ?/sec
utf8_view (small): single, nulls 1.00 23.8±0.20µs ?
?/sec 3.10 73.7±1.03µs ? ?/sec
utf8_view: multiple, no nulls 1.00 235.4±2.14µs ?
?/sec 1.11 262.2±1.34µs ? ?/sec
utf8_view: multiple, nulls 1.00 227.2±2.11µs ?
?/sec 1.34 303.9±2.23µs ? ?/sec
utf8_view: single, no nulls 1.00 71.6±0.74µs ?
?/sec 1.05 75.2±1.27µs ? ?/sec
utf8_view: single, nulls 1.00 71.5±1.92µs ?
?/sec 1.28 91.6±4.65µs
```
<details><summary>Details</summary>
<p>
```
Gnuplot not found, using plotters backend
utf8_view: single, no nulls
time: [20.872 µs 20.906 µs 20.944 µs]
change: [−15.863% −15.614% −15.331%] (p = 0.00 <
0.05)
Performance has improved.
Found 13 outliers among 100 measurements (13.00%)
8 (8.00%) high mild
5 (5.00%) high severe
utf8_view: single, nulls
time: [22.968 µs 23.050 µs 23.130 µs]
change: [−17.796% −17.384% −16.918%] (p = 0.00 <
0.05)
Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
3 (3.00%) high mild
4 (4.00%) high severe
utf8_view: multiple, no nulls
time: [66.005 µs 66.155 µs 66.325 µs]
change: [−19.077% −18.785% −18.512%] (p = 0.00 <
0.05)
Performance has improved.
utf8_view: multiple, nulls
time: [72.155 µs 72.375 µs 72.649 µs]
change: [−17.944% −17.612% −17.266%] (p = 0.00 <
0.05)
Performance has improved.
Found 11 outliers among 100 measurements (11.00%)
6 (6.00%) high mild
5 (5.00%) high severe
utf8_view (small): single, no nulls
time: [6.1401 µs 6.1563 µs 6.1747 µs]
change: [−69.623% −69.484% −69.333%] (p = 0.00 <
0.05)
Performance has improved.
Found 6 outliers among 100 measurements (6.00%)
3 (3.00%) high mild
3 (3.00%) high severe
utf8_view (small): single, nulls
time: [10.234 µs 10.250 µs 10.270 µs]
change: [−53.969% −53.815% −53.666%] (p = 0.00 <
0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
5 (5.00%) high severe
utf8_view (small): multiple, no nulls
time: [20.853 µs 20.905 µs 20.961 µs]
change: [−66.006% −65.883% −65.759%] (p = 0.00 <
0.05)
Performance has improved.
Found 9 outliers among 100 measurements (9.00%)
7 (7.00%) high mild
2 (2.00%) high severe
utf8_view (small): multiple, nulls
time: [32.519 µs 32.600 µs 32.675 µs]
change: [−53.937% −53.581% −53.232%] (p = 0.00 <
0.05)
Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
1 (1.00%) high mild
```
</p>
</details>
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/common/src/hash_utils.rs | 136 +++++++++++++++++++++++++++++++++---
1 file changed, 125 insertions(+), 11 deletions(-)
diff --git a/datafusion/common/src/hash_utils.rs
b/datafusion/common/src/hash_utils.rs
index 6886220ccf..98dd1f235a 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -162,7 +162,7 @@ macro_rules! hash_value {
})+
};
}
-hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
+hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
macro_rules! hash_float_value {
@@ -269,6 +269,127 @@ fn hash_array<T>(
}
}
+/// Hash a StringView or BytesView array
+///
+/// Templated to optimize inner loop based on presence of nulls and external
buffers.
+///
+/// HAS_NULLS: do we have to check null in the inner loop
+/// HAS_BUFFERS: if true, array has external buffers; if false, all strings
are inlined/ less then 12 bytes
+/// REHASH: if true, combining with existing hash, otherwise initializing
+#[inline(never)]
+fn hash_string_view_array_inner<
+ T: ByteViewType,
+ const HAS_NULLS: bool,
+ const HAS_BUFFERS: bool,
+ const REHASH: bool,
+>(
+ array: &GenericByteViewArray<T>,
+ random_state: &RandomState,
+ hashes_buffer: &mut [u64],
+) {
+ assert_eq!(
+ hashes_buffer.len(),
+ array.len(),
+ "hashes_buffer and array should be of equal length"
+ );
+
+ let buffers = array.data_buffers();
+ let view_bytes = |view_len: u32, view: u128| {
+ let view = ByteView::from(view);
+ let offset = view.offset as usize;
+ // SAFETY: view is a valid view as it came from the array
+ unsafe {
+ let data = buffers.get_unchecked(view.buffer_index as usize);
+ data.get_unchecked(offset..offset + view_len as usize)
+ }
+ };
+
+ let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter());
+ for (i, (hash, &v)) in hashes_and_views.enumerate() {
+ if HAS_NULLS && array.is_null(i) {
+ continue;
+ }
+ let view_len = v as u32;
+ // all views are inlined, no need to access external buffers
+ if !HAS_BUFFERS || view_len <= 12 {
+ if REHASH {
+ *hash = combine_hashes(v.hash_one(random_state), *hash);
+ } else {
+ *hash = v.hash_one(random_state);
+ }
+ continue;
+ }
+ // view is not inlined, so we need to hash the bytes as well
+ let value = view_bytes(view_len, v);
+ if REHASH {
+ *hash = combine_hashes(value.hash_one(random_state), *hash);
+ } else {
+ *hash = value.hash_one(random_state);
+ }
+ }
+}
+
+/// Builds hash values for array views and writes them into `hashes_buffer`
+/// If `rehash==true` this combines the previous hash value in the buffer
+/// with the new hash using `combine_hashes`
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_generic_byte_view_array<T: ByteViewType>(
+ array: &GenericByteViewArray<T>,
+ random_state: &RandomState,
+ hashes_buffer: &mut [u64],
+ rehash: bool,
+) {
+ // instantiate the correct version based on presence of nulls and external
buffers
+ match (
+ array.null_count() != 0,
+ !array.data_buffers().is_empty(),
+ rehash,
+ ) {
+ // no nulls or buffers ==> hash the inlined views directly
+ // don't call the inner function as Rust seems better able to inline
this simpler code (2-3% faster)
+ (false, false, false) => {
+ for (hash, &view) in
hashes_buffer.iter_mut().zip(array.views().iter()) {
+ *hash = view.hash_one(random_state);
+ }
+ }
+ (false, false, true) => {
+ for (hash, &view) in
hashes_buffer.iter_mut().zip(array.views().iter()) {
+ *hash = combine_hashes(view.hash_one(random_state), *hash);
+ }
+ }
+ (false, true, false) => hash_string_view_array_inner::<T, false, true,
false>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ (false, true, true) => hash_string_view_array_inner::<T, false, true,
true>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ (true, false, false) => hash_string_view_array_inner::<T, true, false,
false>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ (true, false, true) => hash_string_view_array_inner::<T, true, false,
true>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ (true, true, false) => hash_string_view_array_inner::<T, true, true,
false>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ (true, true, true) => hash_string_view_array_inner::<T, true, true,
true>(
+ array,
+ random_state,
+ hashes_buffer,
+ ),
+ }
+}
+
/// Helper function to update hash for a dictionary key if the value is valid
#[cfg(not(feature = "force_hash_collisions"))]
#[inline]
@@ -568,10 +689,10 @@ fn hash_single_array(
DataType::Null => hash_null(random_state, hashes_buffer, rehash),
DataType::Boolean => hash_array(&as_boolean_array(array)?,
random_state, hashes_buffer, rehash),
DataType::Utf8 => hash_array(&as_string_array(array)?, random_state,
hashes_buffer, rehash),
- DataType::Utf8View => hash_array(&as_string_view_array(array)?,
random_state, hashes_buffer, rehash),
+ DataType::Utf8View =>
hash_generic_byte_view_array(as_string_view_array(array)?, random_state,
hashes_buffer, rehash),
DataType::LargeUtf8 => hash_array(&as_largestring_array(array),
random_state, hashes_buffer, rehash),
DataType::Binary =>
hash_array(&as_generic_binary_array::<i32>(array)?, random_state,
hashes_buffer, rehash),
- DataType::BinaryView => hash_array(&as_binary_view_array(array)?,
random_state, hashes_buffer, rehash),
+ DataType::BinaryView =>
hash_generic_byte_view_array(as_binary_view_array(array)?, random_state,
hashes_buffer, rehash),
DataType::LargeBinary =>
hash_array(&as_generic_binary_array::<i64>(array)?, random_state,
hashes_buffer, rehash),
DataType::FixedSizeBinary(_) => {
let array: &FixedSizeBinaryArray =
array.as_any().downcast_ref().unwrap();
@@ -767,8 +888,6 @@ mod tests {
let binary_array: ArrayRef =
Arc::new(binary.iter().cloned().collect::<$ARRAY>());
- let ref_array: ArrayRef =
- Arc::new(binary.iter().cloned().collect::<BinaryArray>());
let random_state = RandomState::with_seeds(0, 0, 0, 0);
@@ -776,9 +895,6 @@ mod tests {
create_hashes(&[binary_array], &random_state, &mut
binary_hashes)
.unwrap();
- let mut ref_hashes = vec![0; binary.len()];
- create_hashes(&[ref_array], &random_state, &mut
ref_hashes).unwrap();
-
// Null values result in a zero hash,
for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
match val {
@@ -787,9 +903,6 @@ mod tests {
}
}
- // same logical values should hash to the same hash value
- assert_eq!(binary_hashes, ref_hashes);
-
// Same values should map to same hash values
assert_eq!(binary[0], binary[5]);
assert_eq!(binary[4], binary[6]);
@@ -801,6 +914,7 @@ mod tests {
}
create_hash_binary!(binary_array, BinaryArray);
+ create_hash_binary!(large_binary_array, LargeBinaryArray);
create_hash_binary!(binary_view_array, BinaryViewArray);
#[test]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]