[ 
https://issues.apache.org/jira/browse/CALCITE-3846?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ruben Q L updated CALCITE-3846:
-------------------------------
    Description: 
The problem can be reproduced with the following test in 
{{EnumerablesJoinTest.java}}:
{code}
  @Test public void testMergeJoinWithCompositeKeyAndNull() {
    tester(false, new JdbcTest.HrSchema())
        .query("?")
        .withHook(Hook.PLANNER, (Consumer<RelOptPlanner>) planner -> {
          planner.addRule(EnumerableRules.ENUMERABLE_MERGE_JOIN_RULE);
          planner.removeRule(EnumerableRules.ENUMERABLE_JOIN_RULE);
        })
        .withRel(builder -> builder
            .scan("s", "emps")
            .sort(builder.field("deptno"), builder.field("commission"))
            .scan("s", "emps")
            .sort(builder.field("deptno"), builder.field("commission"))
            .join(JoinRelType.INNER,
                builder.and(
                    builder.equals(
                        builder.field(2, 0, "deptno"),
                        builder.field(2, 1, "deptno")),
                    builder.equals(
                        builder.field(2, 0, "commission"),
                        builder.field(2, 1, "commission"))))
            .project(
                builder.field("empid"))
            .build())
        .explainHookMatches("" // It is important that we have MergeJoin in the 
plan
            + "EnumerableCalc(expr#0..4=[{inputs}], empid=[$t0])\n"
            + "  EnumerableMergeJoin(condition=[AND(=($1, $3), =($2, $4))], 
joinType=[inner])\n"
            + "    EnumerableSort(sort0=[$1], sort1=[$2], dir0=[ASC], 
dir1=[ASC])\n"
            + "      EnumerableCalc(expr#0..4=[{inputs}], proj#0..1=[{exprs}], 
commission=[$t4])\n"
            + "        EnumerableTableScan(table=[[s, emps]])\n"
            + "    EnumerableSort(sort0=[$0], sort1=[$1], dir0=[ASC], 
dir1=[ASC])\n"
            + "      EnumerableCalc(expr#0..4=[{inputs}], deptno=[$t1], 
commission=[$t4])\n"
            + "        EnumerableTableScan(table=[[s, emps]])\n")
        .returnsUnordered("empid=100\nempid=110\nempid=150\nempid=200");
  }
{code}

The test fails with the following exception:
{code}
java.lang.IllegalStateException: mergeJoin assumes inputs sorted in ascending 
order, however [10, 1000] is greater than [10, null]
{code}

The problem is that {{EnumerableMergeJoin}} implementation (i.e. 
{{EnumerableDefaults#mergeJoin}}) expects its inputs to be sorted in ascending 
order, nulls last [1] (see {{EnumerableMergeJoinRule}} [2]). In case of a 
composite key, {{EnumerableMergeJoin}} will represent keys as 
{{JavaRowFormat.LIST}} [3], which is a comparable list, whose comparison is 
implemented via {{FlatLists.ComparableListImpl#compare}}. This method will 
compare both lists, item by item, but it will consider that a null item is 
less-than a non-null item [4]. This is a de-facto nulls-first collation, which 
contradicts the pre-requisite of the mergeJoin algorithm.


[1] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java#L1966
[2] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoinRule.java#L72
[3] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoin.java#L154
[4] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/runtime/FlatLists.java#L1350

  was:
The problem can be reproduced with the following test in 
{{EnumerablesJoinTest.java}}:
{code}
  @Test public void testMergeJoinWithCompositeKeyAndNull() {
    tester(false, new JdbcTest.HrSchema())
        .query("?")
        .withHook(Hook.PLANNER, (Consumer<RelOptPlanner>) planner -> {
          planner.addRule(EnumerableRules.ENUMERABLE_MERGE_JOIN_RULE);
          planner.removeRule(EnumerableRules.ENUMERABLE_JOIN_RULE);
        })
        .withRel(builder -> builder
            .scan("s", "emps")
            .sort(builder.field("deptno"), builder.field("commission"))
            .scan("s", "emps")
            .sort(builder.field("deptno"), builder.field("commission"))
            .join(JoinRelType.INNER,
                builder.and(
                    builder.equals(
                        builder.field(2, 0, "deptno"),
                        builder.field(2, 1, "deptno")),
                    builder.equals(
                        builder.field(2, 0, "commission"),
                        builder.field(2, 1, "commission"))))
            .project(
                builder.field("empid"))
            .build())
        .explainHookMatches("" // It is important that we have MergeJoin in the 
plan
            + "EnumerableCalc(expr#0..4=[{inputs}], empid=[$t0])\n"
            + "  EnumerableMergeJoin(condition=[AND(=($1, $3), =($2, $4))], 
joinType=[inner])\n"
            + "    EnumerableSort(sort0=[$1], sort1=[$2], dir0=[ASC], 
dir1=[ASC])\n"
            + "      EnumerableCalc(expr#0..4=[{inputs}], proj#0..1=[{exprs}], 
commission=[$t4])\n"
            + "        EnumerableTableScan(table=[[s, emps]])\n"
            + "    EnumerableSort(sort0=[$0], sort1=[$1], dir0=[ASC], 
dir1=[ASC])\n"
            + "      EnumerableCalc(expr#0..4=[{inputs}], deptno=[$t1], 
commission=[$t4])\n"
            + "        EnumerableTableScan(table=[[s, emps]])\n")
        .returnsUnordered("empid=100\nempid=110\nempid=150\nempid=200");
  }
{code}

The test fails with the following exception:
{code}
java.lang.IllegalStateException: mergeJoin assumes inputs sorted in ascending 
order, however [10, 1000] is greater than [10, null]
{code}

The problem is that {{EnumerableMergeJoin}} implementation (i.e. 
{{EnumerableDefaults#mergeJoin}}) expects its inputs to be sorted in ascending 
order, nulls last [1] (see {{EnumerableMergeJoinRule}} [2]). In case of a 
composite key, {{EnumerableMergeJoin}} will represent keys as 
{{JavaRowFormat.LIST}} [3], which is a comparable list, whose comparison is 
implemented via {{FlatLists.ComparableListImpl#compare}}. This method will 
compare both lists, item by item, but in will consider that a null item is 
less-than a non-null item [4]. This is a de-facto nulls-first collation, which 
contradicts the pre-requisite of the mergeJoin algorithm.


[1] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java#L1966
[2] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoinRule.java#L72
[3] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoin.java#L154
[4] 
https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/runtime/FlatLists.java#L1350


> EnumerableMergeJoin: wrong comparison of composite key with null values
> -----------------------------------------------------------------------
>
>                 Key: CALCITE-3846
>                 URL: https://issues.apache.org/jira/browse/CALCITE-3846
>             Project: Calcite
>          Issue Type: Bug
>          Components: core
>    Affects Versions: 1.22.0
>            Reporter: Ruben Q L
>            Assignee: Ruben Q L
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> The problem can be reproduced with the following test in 
> {{EnumerablesJoinTest.java}}:
> {code}
>   @Test public void testMergeJoinWithCompositeKeyAndNull() {
>     tester(false, new JdbcTest.HrSchema())
>         .query("?")
>         .withHook(Hook.PLANNER, (Consumer<RelOptPlanner>) planner -> {
>           planner.addRule(EnumerableRules.ENUMERABLE_MERGE_JOIN_RULE);
>           planner.removeRule(EnumerableRules.ENUMERABLE_JOIN_RULE);
>         })
>         .withRel(builder -> builder
>             .scan("s", "emps")
>             .sort(builder.field("deptno"), builder.field("commission"))
>             .scan("s", "emps")
>             .sort(builder.field("deptno"), builder.field("commission"))
>             .join(JoinRelType.INNER,
>                 builder.and(
>                     builder.equals(
>                         builder.field(2, 0, "deptno"),
>                         builder.field(2, 1, "deptno")),
>                     builder.equals(
>                         builder.field(2, 0, "commission"),
>                         builder.field(2, 1, "commission"))))
>             .project(
>                 builder.field("empid"))
>             .build())
>         .explainHookMatches("" // It is important that we have MergeJoin in 
> the plan
>             + "EnumerableCalc(expr#0..4=[{inputs}], empid=[$t0])\n"
>             + "  EnumerableMergeJoin(condition=[AND(=($1, $3), =($2, $4))], 
> joinType=[inner])\n"
>             + "    EnumerableSort(sort0=[$1], sort1=[$2], dir0=[ASC], 
> dir1=[ASC])\n"
>             + "      EnumerableCalc(expr#0..4=[{inputs}], 
> proj#0..1=[{exprs}], commission=[$t4])\n"
>             + "        EnumerableTableScan(table=[[s, emps]])\n"
>             + "    EnumerableSort(sort0=[$0], sort1=[$1], dir0=[ASC], 
> dir1=[ASC])\n"
>             + "      EnumerableCalc(expr#0..4=[{inputs}], deptno=[$t1], 
> commission=[$t4])\n"
>             + "        EnumerableTableScan(table=[[s, emps]])\n")
>         .returnsUnordered("empid=100\nempid=110\nempid=150\nempid=200");
>   }
> {code}
> The test fails with the following exception:
> {code}
> java.lang.IllegalStateException: mergeJoin assumes inputs sorted in ascending 
> order, however [10, 1000] is greater than [10, null]
> {code}
> The problem is that {{EnumerableMergeJoin}} implementation (i.e. 
> {{EnumerableDefaults#mergeJoin}}) expects its inputs to be sorted in 
> ascending order, nulls last [1] (see {{EnumerableMergeJoinRule}} [2]). In 
> case of a composite key, {{EnumerableMergeJoin}} will represent keys as 
> {{JavaRowFormat.LIST}} [3], which is a comparable list, whose comparison is 
> implemented via {{FlatLists.ComparableListImpl#compare}}. This method will 
> compare both lists, item by item, but it will consider that a null item is 
> less-than a non-null item [4]. This is a de-facto nulls-first collation, 
> which contradicts the pre-requisite of the mergeJoin algorithm.
> [1] 
> https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java#L1966
> [2] 
> https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoinRule.java#L72
> [3] 
> https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/adapter/enumerable/EnumerableMergeJoin.java#L154
> [4] 
> https://github.com/apache/calcite/blob/200a136dd3b2ca04a1d1283eb5a03a04388b1947/core/src/main/java/org/apache/calcite/runtime/FlatLists.java#L1350



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to