[ https://issues.apache.org/jira/browse/DRILL-5375?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15953375#comment-15953375 ]
ASF GitHub Bot commented on DRILL-5375: --------------------------------------- Github user arina-ielchiieva commented on a diff in the pull request: https://github.com/apache/drill/pull/794#discussion_r109371613 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/NestedLoopJoinTemplate.java --- @@ -40,132 +41,133 @@ // Record count of the left batch currently being processed private int leftRecordCount = 0; - // List of record counts per batch in the hyper container + // List of record counts per batch in the hyper container private List<Integer> rightCounts = null; // Output batch private NestedLoopJoinBatch outgoing = null; - // Next right batch to process - private int nextRightBatchToProcess = 0; - - // Next record in the current right batch to process - private int nextRightRecordToProcess = 0; - - // Next record in the left batch to process - private int nextLeftRecordToProcess = 0; + // Iteration status tracker + private IterationStatusTracker tracker = new IterationStatusTracker(); /** * Method initializes necessary state and invokes the doSetup() to set the - * input and output value vector references + * input and output value vector references. + * * @param context Fragment context * @param left Current left input batch being processed * @param rightContainer Hyper container + * @param rightCounts Counts for each right container * @param outgoing Output batch */ - public void setupNestedLoopJoin(FragmentContext context, RecordBatch left, + public void setupNestedLoopJoin(FragmentContext context, + RecordBatch left, ExpandableHyperContainer rightContainer, LinkedList<Integer> rightCounts, NestedLoopJoinBatch outgoing) { this.left = left; - leftRecordCount = left.getRecordCount(); + this.leftRecordCount = left.getRecordCount(); this.rightCounts = rightCounts; this.outgoing = outgoing; doSetup(context, rightContainer, left, outgoing); } /** - * This method is the core of the nested loop join. For every record on the right we go over - * the left batch and produce the cross product output + * Main entry point for producing the output records. Thin wrapper around populateOutgoingBatch(), this method + * controls which left batch we are processing and fetches the next left input batch once we exhaust the current one. + * + * @param joinType join type (INNER ot LEFT) + * @return the number of records produced in the output batch + */ + public int outputRecords(JoinRelType joinType) { + int outputIndex = 0; + while (leftRecordCount != 0) { + outputIndex = populateOutgoingBatch(joinType, outputIndex); + if (outputIndex >= NestedLoopJoinBatch.MAX_BATCH_SIZE) { + break; + } + // reset state and get next left batch + resetAndGetNextLeft(); + } + return outputIndex; + } + + /** + * This method is the core of the nested loop join.For each left batch record looks for matching record + * from the list of right batches. Match is checked by calling {@link #doEval(int, int, int)} method. + * If matching record is found both left and right records are written into output batch, + * otherwise if join type is LEFT, than only left record is written, right batch record values will be null. + * + * @param joinType join type (INNER or LEFT) * @param outputIndex index to start emitting records at * @return final outputIndex after producing records in the output batch */ - private int populateOutgoingBatch(int outputIndex) { - - // Total number of batches on the right side - int totalRightBatches = rightCounts.size(); - - // Total number of records on the left - int localLeftRecordCount = leftRecordCount; - - /* - * The below logic is the core of the NLJ. To have better performance we copy the instance members into local - * method variables, once we are done with the loop we need to update the instance variables to reflect the new - * state. To avoid code duplication of resetting the instance members at every exit point in the loop we are using - * 'goto' - */ - int localNextRightBatchToProcess = nextRightBatchToProcess; - int localNextRightRecordToProcess = nextRightRecordToProcess; - int localNextLeftRecordToProcess = nextLeftRecordToProcess; - - outer: { - - for (; localNextRightBatchToProcess< totalRightBatches; localNextRightBatchToProcess++) { // for every batch on the right - int compositeIndexPart = localNextRightBatchToProcess << 16; - int rightRecordCount = rightCounts.get(localNextRightBatchToProcess); - - for (; localNextRightRecordToProcess < rightRecordCount; localNextRightRecordToProcess++) { // for every record in this right batch - for (; localNextLeftRecordToProcess < localLeftRecordCount; localNextLeftRecordToProcess++) { // for every record in the left batch - + private int populateOutgoingBatch(JoinRelType joinType, int outputIndex) { + // copy index and match counters as local variables to speed up processing + int nextRightBatchToProcess = tracker.getNextRightBatchToProcess(); + int nextRightRecordToProcess = tracker.getNextRightRecordToProcess(); + int nextLeftRecordToProcess = tracker.getNextLeftRecordToProcess(); + boolean rightRecordMatched = tracker.isRightRecordMatched(); + + outer: + // for every record in the left batch + for (; nextLeftRecordToProcess < leftRecordCount; nextLeftRecordToProcess++) { + // for every batch on the right + for (; nextRightBatchToProcess < rightCounts.size(); nextRightBatchToProcess++) { + int rightRecordCount = rightCounts.get(nextRightBatchToProcess); --- End diff -- No. I have created Jira https://issues.apache.org/jira/browse/DRILL-5407 to fix this issue since it was present before. > Nested loop join: return correct result for left join > ----------------------------------------------------- > > Key: DRILL-5375 > URL: https://issues.apache.org/jira/browse/DRILL-5375 > Project: Apache Drill > Issue Type: Bug > Affects Versions: 1.8.0 > Reporter: Arina Ielchiieva > Assignee: Arina Ielchiieva > Labels: doc-impacting > > Mini repro: > 1. Create 2 Hive tables with data > {code} > CREATE TABLE t1 ( > FYQ varchar(999), > dts varchar(999), > dte varchar(999) > ) > ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; > 2016-Q1,2016-06-01,2016-09-30 > 2016-Q2,2016-09-01,2016-12-31 > 2016-Q3,2017-01-01,2017-03-31 > 2016-Q4,2017-04-01,2017-06-30 > CREATE TABLE t2 ( > who varchar(999), > event varchar(999), > dt varchar(999) > ) > ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; > aperson,did somthing,2017-01-06 > aperson,did somthing else,2017-01-12 > aperson,had chrsitmas,2016-12-26 > aperson,went wild,2016-01-01 > {code} > 2. Impala Query shows correct result > {code} > select t2.dt, t1.fyq, t2.who, t2.event > from t2 > left join t1 on t2.dt between t1.dts and t1.dte > order by t2.dt; > +------------+---------+---------+-------------------+ > | dt | fyq | who | event | > +------------+---------+---------+-------------------+ > | 2016-01-01 | NULL | aperson | went wild | > | 2016-12-26 | 2016-Q2 | aperson | had chrsitmas | > | 2017-01-06 | 2016-Q3 | aperson | did somthing | > | 2017-01-12 | 2016-Q3 | aperson | did somthing else | > +------------+---------+---------+-------------------+ > {code} > 3. Drill query shows wrong results: > {code} > alter session set planner.enable_nljoin_for_scalar_only=false; > use hive; > select t2.dt, t1.fyq, t2.who, t2.event > from t2 > left join t1 on t2.dt between t1.dts and t1.dte > order by t2.dt; > +-------------+----------+----------+--------------------+ > | dt | fyq | who | event | > +-------------+----------+----------+--------------------+ > | 2016-12-26 | 2016-Q2 | aperson | had chrsitmas | > | 2017-01-06 | 2016-Q3 | aperson | did somthing | > | 2017-01-12 | 2016-Q3 | aperson | did somthing else | > +-------------+----------+----------+--------------------+ > 3 rows selected (2.523 seconds) > {code} -- This message was sent by Atlassian JIRA (v6.3.15#6346)