deniskuzZ commented on code in PR #6048: URL: https://github.com/apache/hive/pull/6048#discussion_r2324194808
########## ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java: ########## @@ -656,32 +656,39 @@ private boolean convertJoinBucketMapJoin(JoinOperator joinOp, OptimizeTezProcCon // on small table(s). ReduceSinkOperator bigTableRS = (ReduceSinkOperator)joinOp.getParentOperators().get(bigTablePosition); OpTraits opTraits = bigTableRS.getOpTraits(); - List<List<String>> listBucketCols = opTraits.getBucketColNames(); + // It is guaranteed there is only 1 list within bigTableRS.getOpTraits().getBucketColNames(). + List<String> listBucketCols = opTraits.getBucketColNames().get(0); List<ExprNodeDesc> bigTablePartitionCols = bigTableRS.getConf().getPartitionCols(); - boolean updatePartitionCols = false; + boolean updatePartitionCols = listBucketCols.size() != bigTablePartitionCols.size(); List<Integer> positions = new ArrayList<>(); - CustomBucketFunction bucketFunction = opTraits.getCustomBucketFunctions().get(0); - if (listBucketCols.get(0).size() != bigTablePartitionCols.size()) { - updatePartitionCols = true; - // Prepare updated partition columns for small table(s). - // Get the positions of bucketed columns - - int bigTableExprPos = 0; - Map<String, ExprNodeDesc> colExprMap = bigTableRS.getColumnExprMap(); - final boolean[] retainedColumns = new boolean[listBucketCols.get(0).size()]; - for (ExprNodeDesc bigTableExpr : bigTablePartitionCols) { - // It is guaranteed there is only 1 list within listBucketCols. - for (int i = 0; i < listBucketCols.get(0).size(); i++) { - final String colName = listBucketCols.get(0).get(i); - if (colExprMap.get(colName).isSame(bigTableExpr)) { - positions.add(bigTableExprPos); - retainedColumns[i] = true; - } + // Compare the partition columns and the bucket columns of bigTableRS. + Map<String, ExprNodeDesc> colExprMap = bigTableRS.getColumnExprMap(); + final boolean[] retainedColumns = new boolean[listBucketCols.size()]; + for (int bucketColIdx = 0; bucketColIdx < listBucketCols.size(); bucketColIdx++) { + for (int bigTablePartIdx = 0; bigTablePartIdx < bigTablePartitionCols.size(); bigTablePartIdx++) { + ExprNodeDesc bigTablePartExpr = bigTablePartitionCols.get(bigTablePartIdx); + ExprNodeDesc bucketColExpr = colExprMap.get(listBucketCols.get(bucketColIdx)); + if (bigTablePartExpr.isSame(bucketColExpr)) { + positions.add(bigTablePartIdx); + retainedColumns[bucketColIdx] = true; + // If the positions of the partition column and the bucket column are not the same, + // then we need to update the position of the partition column in small tables. + updatePartitionCols = updatePartitionCols || bucketColIdx != bigTablePartIdx; + break; } - bigTableExprPos = bigTableExprPos + 1; } + } + // If the number of partition columns is less than the number of bucket columns, + // then we cannot properly distribute small tables onto bucketized map tasks. + // Bail out. + if (positions.size() < listBucketCols.size()) { + return false; + } + + CustomBucketFunction bucketFunction = opTraits.getCustomBucketFunctions().get(0); Review Comment: @ngsg, @okumin shouldn't we apply `retainedColumns` to a `bucketFunction`? ```` final Optional<CustomBucketFunction> selected = opTraits.getCustomBucketFunctions().get(0).select(retainedColumns); if (!selected.isPresent()) { LOG.info("{} can't keep itself only with {}", opTraits.getCustomBucketFunctions().get(0), retainedColumns); return false; } bucketFunction = selected.get(); ```` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org