Copilot commented on code in PR #19975:
URL: https://github.com/apache/datafusion/pull/19975#discussion_r2725748302
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -267,28 +273,49 @@ where
};
observe_payload_fn(payload);
continue;
- };
+ }
- // get the value as bytes
- let value: &[u8] = value.as_ref();
+ // Extract length from the view (first 4 bytes of u128 in
little-endian)
+ let len = (view_u128 & 0xFFFFFFFF) as u32;
let entry = self.map.find_mut(hash, |header| {
if header.hash != hash {
return false;
}
- let v = self.builder.get_value(header.view_idx);
- v == value
+ // Fast path: for inline strings (<=12 bytes), the entire value
+ // is stored in the u128 view, so we can compare directly
+ // This avoids the expensive conversion back to bytes
+ if len <= 12 {
+ return header.view == view_u128;
+ }
+
+ // For larger strings: first compare the 4-byte prefix (bytes
4-7 of u128)
+ // The prefix is stored in the next 4 bytes after length
+ // Only dereference full bytes if prefixes match
+ let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF) as u32;
+ let input_prefix = ((view_u128 >> 32) & 0xFFFFFFFF) as u32;
Review Comment:
Prefix extraction via manual bit shifts (`(view >> 32) & 0xFFFFFFFF`)
duplicates knowledge of the `ByteView` layout and is harder to maintain. There
is already a pattern elsewhere to extract the prefix using Arrow helpers (e.g.
`GenericByteViewArray::<B>::inline_value(&view, 4)` in
`physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs:271-275`),
which avoids hardcoding masks/shifts and documents intent more clearly.
```suggestion
let stored_bytes = header.view.to_le_bytes();
let stored_prefix =
u32::from_le_bytes(stored_bytes[4..8].try_into().unwrap());
let input_bytes = view_u128.to_le_bytes();
let input_prefix =
u32::from_le_bytes(input_bytes[4..8].try_into().unwrap());
```
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -378,6 +405,10 @@ struct Entry<V>
where
V: Debug + PartialEq + Eq + Clone + Copy + Default,
{
+ /// The original u128 view for fast comparison of inline strings (<=12
bytes)
+ /// and prefix comparison for larger strings
+ view: u128,
+
Review Comment:
PR description claims "No additional memory overhead in Entry struct", but
this change adds `view: u128` to `Entry`, increasing per-entry size (and thus
memory usage / `map_size`) by at least 16 bytes plus padding. If avoiding
additional overhead is a requirement, consider storing only the needed metadata
(e.g., `len: u32` + `prefix: u32`) or deriving these from the builder’s stored
view via `view_idx`; otherwise, update the PR description to match the
implementation.
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -267,28 +273,49 @@ where
};
observe_payload_fn(payload);
continue;
- };
+ }
- // get the value as bytes
- let value: &[u8] = value.as_ref();
+ // Extract length from the view (first 4 bytes of u128 in
little-endian)
+ let len = (view_u128 & 0xFFFFFFFF) as u32;
let entry = self.map.find_mut(hash, |header| {
if header.hash != hash {
return false;
}
- let v = self.builder.get_value(header.view_idx);
- v == value
+ // Fast path: for inline strings (<=12 bytes), the entire value
+ // is stored in the u128 view, so we can compare directly
+ // This avoids the expensive conversion back to bytes
+ if len <= 12 {
+ return header.view == view_u128;
+ }
+
+ // For larger strings: first compare the 4-byte prefix (bytes
4-7 of u128)
+ // The prefix is stored in the next 4 bytes after length
+ // Only dereference full bytes if prefixes match
+ let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF) as u32;
+ let input_prefix = ((view_u128 >> 32) & 0xFFFFFFFF) as u32;
+ if stored_prefix != input_prefix {
+ return false;
+ }
+
+ // Prefix matched - must compare full bytes
Review Comment:
The non-inline comparison path skips a cheap length equality check before
doing prefix/full-byte comparisons. Since both input and stored views encode
length in the low 32 bits, checking `header.view as u32 == len` (or equivalent)
first would avoid calling `builder.get_value(...)` / `values.value(i)` when
lengths differ but hashes (and possibly prefixes) match, improving the hot path.
```suggestion
// For larger strings: first compare length using the low 32
bits.
// If lengths differ, the values cannot be equal, so we can
skip
// prefix and full-byte comparisons and avoid dereferencing
bytes.
if (header.view as u32) != len {
return false;
}
// Next compare the 4-byte prefix (bytes 4-7 of u128).
// The prefix is stored in the next 4 bytes after length.
// Only dereference full bytes if prefixes match.
let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF) as
u32;
let input_prefix = ((view_u128 >> 32) & 0xFFFFFFFF) as u32;
if stored_prefix != input_prefix {
return false;
}
// Prefix and length matched - now compare full bytes.
```
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -267,28 +273,49 @@ where
};
observe_payload_fn(payload);
continue;
- };
+ }
- // get the value as bytes
- let value: &[u8] = value.as_ref();
+ // Extract length from the view (first 4 bytes of u128 in
little-endian)
+ let len = (view_u128 & 0xFFFFFFFF) as u32;
let entry = self.map.find_mut(hash, |header| {
if header.hash != hash {
return false;
}
- let v = self.builder.get_value(header.view_idx);
- v == value
+ // Fast path: for inline strings (<=12 bytes), the entire value
+ // is stored in the u128 view, so we can compare directly
+ // This avoids the expensive conversion back to bytes
+ if len <= 12 {
+ return header.view == view_u128;
+ }
+
+ // For larger strings: first compare the 4-byte prefix (bytes
4-7 of u128)
+ // The prefix is stored in the next 4 bytes after length
+ // Only dereference full bytes if prefixes match
+ let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF) as u32;
Review Comment:
Several new comments describe behavior in terms of "strings" (e.g. "inline
strings" / "larger strings"), but this function is generic over `ByteViewType`
and is used for both `Utf8View` and `BinaryView`. Consider adjusting wording to
"values"/"byte sequences" to avoid confusion for the binary case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]