jorgecarleitao commented on a change in pull request #24:
URL: https://github.com/apache/arrow-datafusion/pull/24#discussion_r620634002
##########
File path: datafusion/src/physical_plan/hash_join.rs
##########
@@ -781,58 +846,140 @@ pub fn create_hashes<'a>(
random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
+ // combine hashes with `combine_hashes` if we have more than 1 column
+ let multi_col = arrays.len() > 1;
+
for col in arrays {
match col.data_type() {
DataType::UInt8 => {
- hash_array!(UInt8Array, col, u8, hashes_buffer, random_state);
+ hash_array_primitive!(
+ UInt8Array,
+ col,
+ u8,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::UInt16 => {
- hash_array!(UInt16Array, col, u16, hashes_buffer,
random_state);
+ hash_array_primitive!(
+ UInt16Array,
+ col,
+ u16,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::UInt32 => {
- hash_array!(UInt32Array, col, u32, hashes_buffer,
random_state);
+ hash_array_primitive!(
+ UInt32Array,
+ col,
+ u32,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::UInt64 => {
- hash_array!(UInt64Array, col, u64, hashes_buffer,
random_state);
+ hash_array_primitive!(
+ UInt64Array,
+ col,
+ u64,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::Int8 => {
- hash_array!(Int8Array, col, i8, hashes_buffer, random_state);
+ hash_array_primitive!(
+ Int8Array,
+ col,
+ i8,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::Int16 => {
- hash_array!(Int16Array, col, i16, hashes_buffer, random_state);
+ hash_array_primitive!(
+ Int16Array,
+ col,
+ i16,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::Int32 => {
- hash_array!(Int32Array, col, i32, hashes_buffer, random_state);
+ hash_array_primitive!(
+ Int32Array,
+ col,
+ i32,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::Int64 => {
- hash_array!(Int64Array, col, i64, hashes_buffer, random_state);
+ hash_array_primitive!(
+ Int64Array,
+ col,
+ i64,
+ hashes_buffer,
+ random_state,
+ multi_col
+ );
}
DataType::Timestamp(TimeUnit::Microsecond, None) => {
- hash_array!(
+ hash_array_primitive!(
TimestampMicrosecondArray,
col,
i64,
hashes_buffer,
- random_state
+ random_state,
+ multi_col
);
}
DataType::Timestamp(TimeUnit::Nanosecond, None) => {
- hash_array!(
+ hash_array_primitive!(
TimestampNanosecondArray,
col,
i64,
hashes_buffer,
- random_state
+ random_state,
+ multi_col
);
}
DataType::Boolean => {
- hash_array!(BooleanArray, col, u8, hashes_buffer,
random_state);
+ hash_array_primitive!(
Review comment:
```suggestion
hash_array!(
```
otherwise this will be an iterator over the bytes of the bitmap, not over
the individual bits.
In general, I recommend using generics, as they make it more difficult for
these to happen, but I agree that the API is a bit funny on the arrow side. ^_^
##########
File path: datafusion/src/physical_plan/hash_join.rs
##########
@@ -708,7 +720,6 @@ macro_rules! equal_rows_elem {
let right_array = $r.as_any().downcast_ref::<$array_type>().unwrap();
match (left_array.is_null($left), left_array.is_null($right)) {
- (true, true) => true,
Review comment:
this was the line, right? Damm, so many wrong things for this one.
Really great finding, @Dandandan !
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]