jackwener commented on code in PR #4377: URL: https://github.com/apache/arrow-datafusion/pull/4377#discussion_r1032930615
########## datafusion/core/src/physical_plan/joins/hash_join.rs: ########## @@ -1441,44 +1181,147 @@ fn equal_rows( err.unwrap_or(Ok(res)) } -// Produces a batch for left-side rows that have/have not been matched during the whole join -fn produce_from_matched( - visited_left_side: &BooleanBufferBuilder, - schema: &SchemaRef, - column_indices: &[ColumnIndex], - left_data: &JoinLeftData, - unmatched: bool, -) -> ArrowResult<RecordBatch> { - let indices = if unmatched { - UInt64Array::from_iter_values( - (0..visited_left_side.len()) - .filter_map(|v| (!visited_left_side.get_bit(v)).then_some(v as u64)), - ) +// The input is the matched indices for left and right. +// Adjust the indices according to the join type +fn adjust_indices_by_join_type( + left_indices: UInt64Array, + right_indices: UInt32Array, + count_right_batch: usize, + join_type: JoinType, +) -> (UInt64Array, UInt32Array) { + match join_type { + JoinType::Inner => { + // matched + (left_indices, right_indices) + } + JoinType::Left => { + // matched + (left_indices, right_indices) + // unmatched left row will be produced in the end of loop, and it has been set in the left visited bitmap + } + JoinType::Right | JoinType::Full => { + // matched + // unmatched right row will be produced in this batch + let right_null_indices = get_anti_indices(count_right_batch, &right_indices); + // combine the matched and unmatched right result together + append_right_indices(left_indices, right_indices, right_null_indices) + } + JoinType::RightSemi => { + // need to remove the duplicated record in the right side + let right_indices = get_semi_indices(count_right_batch, &right_indices); + // the left_indices will not be used later for the `right semi` join + (left_indices, right_indices) + } + JoinType::RightAnti => { + // need to remove the duplicated record in the right side + // get the anti index for the right side + let right_indices = get_anti_indices(count_right_batch, &right_indices); + // the left_indices will not 
be used later for the `right anti` join + (left_indices, right_indices) + } + JoinType::LeftSemi | JoinType::LeftAnti => { + // matched or unmatched left row will be produced in the end of loop + ( + UInt64Array::from_iter_values(vec![]), + UInt32Array::from_iter_values(vec![]), Review Comment: 🤔 I don't understand why this code is needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org