Dandandan commented on code in PR #19975:
URL: https://github.com/apache/datafusion/pull/19975#discussion_r2727452902
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -250,52 +265,92 @@ where
// step 2: insert each value into the set, if not already present
let values = values.as_byte_view::<B>();
+ // Get raw views buffer for direct comparison
+ let input_views = values.views();
+
// Ensure lengths are equivalent
- assert_eq!(values.len(), batch_hashes.len());
+ assert_eq!(values.len(), self.hashes_buffer.len());
+
+ for i in 0..values.len() {
+ let view_u128 = input_views[i];
+ let hash = self.hashes_buffer[i];
- for (value, &hash) in values.iter().zip(batch_hashes.iter()) {
- // handle null value
- let Some(value) = value else {
+ // handle null value via validity bitmap check
+ if !values.is_valid(i) {
let payload = if let Some(&(payload, _offset)) =
self.null.as_ref() {
payload
} else {
let payload = make_payload_fn(None);
- let null_index = self.builder.len();
- self.builder.append_null();
+ let null_index = self.views.len();
+ self.views.push(0);
+ self.nulls.append(false); // false = null in validity
buffer
self.null = Some((payload, null_index));
payload
};
observe_payload_fn(payload);
continue;
- };
+ }
- // get the value as bytes
- let value: &[u8] = value.as_ref();
+ // Extract length from the view (first 4 bytes of u128 in
little-endian)
+ let len = (view_u128 & 0xFFFFFFFF) as u32;
- let entry = self.map.find_mut(hash, |header| {
- if header.hash != hash {
- return false;
- }
- let v = self.builder.get_value(header.view_idx);
+ // Check if value already exists
+ let maybe_payload = {
+ // Borrow completed and in_progress for comparison
+ let completed = &self.completed;
+ let in_progress = &self.in_progress;
- v == value
- });
+ self.map
+ .find(hash, |header| {
+ if header.hash != hash {
+ return false;
+ }
+
+ // Fast path: inline strings can be compared directly
+ if len <= 12 {
+ return header.view == view_u128;
+ }
+
+ // For larger strings: first compare the 4-byte prefix
+ let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF)
as u32;
Review Comment:
`& 0xFFFFFFFF` is not needed
##########
datafusion/physical-expr-common/src/binary_view_map.rs:
##########
@@ -250,52 +265,92 @@ where
// step 2: insert each value into the set, if not already present
let values = values.as_byte_view::<B>();
+ // Get raw views buffer for direct comparison
+ let input_views = values.views();
+
// Ensure lengths are equivalent
- assert_eq!(values.len(), batch_hashes.len());
+ assert_eq!(values.len(), self.hashes_buffer.len());
+
+ for i in 0..values.len() {
+ let view_u128 = input_views[i];
+ let hash = self.hashes_buffer[i];
- for (value, &hash) in values.iter().zip(batch_hashes.iter()) {
- // handle null value
- let Some(value) = value else {
+ // handle null value via validity bitmap check
+ if !values.is_valid(i) {
let payload = if let Some(&(payload, _offset)) =
self.null.as_ref() {
payload
} else {
let payload = make_payload_fn(None);
- let null_index = self.builder.len();
- self.builder.append_null();
+ let null_index = self.views.len();
+ self.views.push(0);
+ self.nulls.append(false); // false = null in validity
buffer
self.null = Some((payload, null_index));
payload
};
observe_payload_fn(payload);
continue;
- };
+ }
- // get the value as bytes
- let value: &[u8] = value.as_ref();
+ // Extract length from the view (first 4 bytes of u128 in
little-endian)
+ let len = (view_u128 & 0xFFFFFFFF) as u32;
- let entry = self.map.find_mut(hash, |header| {
- if header.hash != hash {
- return false;
- }
- let v = self.builder.get_value(header.view_idx);
+ // Check if value already exists
+ let maybe_payload = {
+ // Borrow completed and in_progress for comparison
+ let completed = &self.completed;
+ let in_progress = &self.in_progress;
- v == value
- });
+ self.map
+ .find(hash, |header| {
+ if header.hash != hash {
+ return false;
+ }
+
+ // Fast path: inline strings can be compared directly
+ if len <= 12 {
+ return header.view == view_u128;
+ }
+
+ // For larger strings: first compare the 4-byte prefix
+ let stored_prefix = ((header.view >> 32) & 0xFFFFFFFF)
as u32;
+ let input_prefix = ((view_u128 >> 32) & 0xFFFFFFFF) as
u32;
Review Comment:
`& 0xFFFFFFFF` is not needed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]