This is an automated email from the ASF dual-hosted git repository.
asolimando pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new aa19cc1ba0 [CALCITE-7173] Improve RelMdDistinctRowCount estimation for
lossless casts
aa19cc1ba0 is described below
commit aa19cc1ba0db5ed94002cbae4b562fd7cdb5cd47
Author: Alessandro Solimando <[email protected]>
AuthorDate: Mon Sep 8 14:09:49 2025 +0200
[CALCITE-7173] Improve RelMdDistinctRowCount estimation for lossless casts
---
.../org/apache/calcite/rel/metadata/RelMdUtil.java | 3 +
.../org/apache/calcite/test/RelMetadataTest.java | 72 +++++++++++++++++++++-
core/src/test/resources/sql/sub-query.iq | 63 +++++++++----------
3 files changed, 104 insertions(+), 34 deletions(-)
diff --git a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
index 17059c8b7d..dc12983f3f 100644
--- a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
+++ b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
@@ -602,6 +602,9 @@ public static void splitCols(
final RexNode e = projExprs.get(bit);
if (e instanceof RexInputRef) {
baseCols.set(((RexInputRef) e).getIndex());
+ } else if (RexUtil.isLosslessCast(e)
+ && ((RexCall) e).getOperands().get(0).isA(SqlKind.INPUT_REF)) {
+ baseCols.set(((RexInputRef) ((RexCall)
e).getOperands().get(0)).getIndex());
} else {
projCols.set(bit);
}
diff --git a/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
b/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
index 38f8ac587f..72f0cd958d 100644
--- a/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
+++ b/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
@@ -4666,7 +4666,7 @@ static <K, V> List<String> toSortedStringList(Map<K, V>
map) {
/** Test case of
* <a
href="https://issues.apache.org/jira/browse/CALCITE-7083">[CALCITE-7083]
* RelMdDistinctRowCount aggregates implementation problems</a>. */
- @Test void testAggregateDistinctRowcount() {
+ @Test void testAggregateDistinctRowCount() {
// test case of groupKey contains aggregated column
sql("select name, sum(sal) from (values ('b', 10), ('b', 20), ('b', 30))
as t(name, sal) "
+ "group by name")
@@ -4694,6 +4694,76 @@ static <K, V> List<String> toSortedStringList(Map<K, V>
map) {
.assertThatDistinctRowCount(bitSetOf(0), is(1d));
}
+ @Test void testAggregateDistinctRowCountLosslessCast() {
+ final String sql =
+ "select name, sal, cast(sal as varchar(11)) "
+ + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+ + "group by name, sal, cast(sal as varchar(11))";
+
+ // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(3d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(3d));
+ }
+
+ @Test void testAggregateDistinctRowCountLosslessCastFilterOnField() {
+ final String sql =
+ "select name, sal, cast(sal as varchar(11))"
+ + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+ + "where sal = 10 "
+ + "group by name, sal, cast(sal as varchar(11))";
+
+ // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(1d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(1d));
+ }
+
+ @Test void testAggregateDistinctRowCountLosslessCastFilterOnCastedField() {
+ final String sql =
+ "select name, sal, cast(sal as varchar(11))"
+ + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+ + "where cast(sal as varchar(11)) = 10 "
+ + "group by name, sal, cast(sal as varchar(11))";
+
+ // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(1d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(1d));
+ }
+
+ @Test void testAggregateDistinctRowCountLossyCast() {
+ final String sql =
+ "select name, sal, cast(sal as int) "
+ + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name,
sal) "
+ + "group by name, sal, cast(sal as int)";
+
+ // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(3d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2),
closeTo(1.6439107033725735d, 0.1d));
+ }
+
+ @Test void testAggregateDistinctRowCountLossyCastFilterOnField() {
+ final String sql =
+ "select name, sal, cast(sal as int) "
+ + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name,
sal) "
+ + "where sal > 20 "
+ + "group by name, sal, cast(sal as int)";
+
+ // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1),
closeTo(1.367006838144548, 0.1d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2),
closeTo(1.0744407789565844, 0.1d));
+ }
+
+ @Test void testAggregateDistinctRowCountLossyCastFilterOnCastedField() {
+ final String sql =
+ "select name, sal, cast(sal as int) "
+ + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name,
sal) "
+ + "where cast(sal as int) > 20 "
+ + "group by name, sal, cast(sal as int)";
+
+ // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+ sql(sql).assertThatDistinctRowCount(bitSetOf(1),
closeTo(1.367006838144548, 0.1d));
+ sql(sql).assertThatDistinctRowCount(bitSetOf(2),
closeTo(1.0744407789565844, 0.1d));
+ }
+
private void checkInputForCollationAndLimit(RelOptCluster cluster,
RelOptTable empTable,
RelOptTable deptTable) {
final RexBuilder rexBuilder = cluster.getRexBuilder();
diff --git a/core/src/test/resources/sql/sub-query.iq
b/core/src/test/resources/sql/sub-query.iq
index 08e924ee6a..330e3d990b 100644
--- a/core/src/test/resources/sql/sub-query.iq
+++ b/core/src/test/resources/sql/sub-query.iq
@@ -2684,17 +2684,16 @@ e.deptno > ANY(
select 2 from "scott".dept e2 where e2.deptno = e.empno) from "scott".emp as
e;
EnumerableCalc(expr#0..6=[{inputs}], expr#7=[>($t1, $t2)], expr#8=[IS
TRUE($t7)], expr#9=[IS NULL($t5)], expr#10=[0], expr#11=[=($t3, $t10)],
expr#12=[OR($t9, $t11)], expr#13=[IS NOT TRUE($t12)], expr#14=[AND($t8, $t13)],
expr#15=[>($t3, $t4)], expr#16=[IS TRUE($t15)], expr#17=[null:BOOLEAN],
expr#18=[IS NOT TRUE($t7)], expr#19=[AND($t16, $t17, $t13, $t18)], expr#20=[IS
NOT TRUE($t15)], expr#21=[AND($t7, $t13, $t18, $t20)], expr#22=[OR($t14, $t19,
$t21)], EMPNO=[$t0], EXPR$1=[$t22])
- EnumerableMergeJoin(condition=[=($0, $6)], joinType=[left])
+ EnumerableHashJoin(condition=[=($0, $6)], joinType=[left])
EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
- EnumerableSort(sort0=[$4], dir0=[ASC])
- EnumerableCalc(expr#0..4=[{inputs}], expr#5=[IS NOT NULL($t3)],
expr#6=[0], expr#7=[CASE($t5, $t3, $t6)], m=[$t2], c=[$t7], d=[$t7],
trueLiteral=[$t4], DEPTNO0=[$t0])
- EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
- EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
- EnumerableTableScan(table=[[scott, EMP]])
- EnumerableAggregate(group=[{0}], m=[MIN($1)], c=[COUNT()],
trueLiteral=[LITERAL_AGG(true)])
- EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t0):SMALLINT
NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
- EnumerableTableScan(table=[[scott, DEPT]])
+ EnumerableCalc(expr#0..4=[{inputs}], expr#5=[IS NOT NULL($t3)],
expr#6=[0], expr#7=[CASE($t5, $t3, $t6)], m=[$t2], c=[$t7], d=[$t7],
trueLiteral=[$t4], DEPTNO0=[$t0])
+ EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
+ EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+ EnumerableTableScan(table=[[scott, EMP]])
+ EnumerableAggregate(group=[{0}], m=[MIN($1)], c=[COUNT()],
trueLiteral=[LITERAL_AGG(true)])
+ EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t0):SMALLINT NOT
NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+ EnumerableTableScan(table=[[scott, DEPT]])
!plan
EMPNO | EXPR$1
-------+--------
@@ -2865,19 +2864,18 @@ select *
from "scott".emp as emp1
where empno <> some (select 2 from "scott".dept dept1 where dept1.deptno =
emp1.empno);
EnumerableCalc(expr#0..13=[{inputs}], expr#14=[<>($t9, $t8)], expr#15=[1],
expr#16=[<=($t10, $t15)], expr#17=[<>($t0, $t11)], expr#18=[IS NULL($t12)],
expr#19=[0], expr#20=[=($t8, $t19)], expr#21=[OR($t18, $t20)], expr#22=[IS NOT
TRUE($t21)], expr#23=[AND($t14, $t16, $t17, $t22)], expr#24=[=($t10, $t15)],
expr#25=[IS NOT NULL($t10)], expr#26=[AND($t14, $t25)], expr#27=[IS NOT
TRUE($t26)], expr#28=[AND($t24, $t17, $t22, $t27)], expr#29=[AND($t14, $t16)],
expr#30=[IS NOT TRUE($t29)], expr# [...]
- EnumerableMergeJoin(condition=[=($0, $13)], joinType=[left])
+ EnumerableHashJoin(condition=[=($0, $13)], joinType=[left])
EnumerableTableScan(table=[[scott, EMP]])
- EnumerableSort(sort0=[$5], dir0=[ASC])
- EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)],
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)],
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4],
trueLiteral=[$t5], DEPTNO0=[$t0])
- EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
- EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
- EnumerableTableScan(table=[[scott, EMP]])
- EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
- EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6],
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4)
FILTER $6])
- EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5,
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7],
$g_1=[$t9])
- EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]],
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0,
$1)])
- EnumerableCalc(expr#0..2=[{inputs}],
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
- EnumerableTableScan(table=[[scott, DEPT]])
+ EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)],
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)],
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4],
trueLiteral=[$t5], DEPTNO0=[$t0])
+ EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
+ EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+ EnumerableTableScan(table=[[scott, EMP]])
+ EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
+ EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6],
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4)
FILTER $6])
+ EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5,
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7],
$g_1=[$t9])
+ EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]],
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0,
$1)])
+ EnumerableCalc(expr#0..2=[{inputs}],
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+ EnumerableTableScan(table=[[scott, DEPT]])
!plan
+-------+-------+-----+-----+----------+-----+------+--------+
| EMPNO | ENAME | JOB | MGR | HIREDATE | SAL | COMM | DEPTNO |
@@ -2917,19 +2915,18 @@ select *
from "scott".emp as emp1
where comm <> some (select 2 from "scott".dept dept1 where dept1.deptno =
emp1.empno);
EnumerableCalc(expr#0..13=[{inputs}], expr#14=[<>($t9, $t8)], expr#15=[1],
expr#16=[<=($t10, $t15)], expr#17=[AND($t14, $t16)], expr#18=[=($t10, $t15)],
expr#19=[OR($t17, $t18)], expr#20=[<>($t6, $t11)], expr#21=[IS NULL($t12)],
expr#22=[IS NULL($t6)], expr#23=[0], expr#24=[=($t8, $t23)], expr#25=[OR($t21,
$t22, $t24)], expr#26=[IS NOT TRUE($t25)], expr#27=[AND($t19, $t20, $t26)],
expr#28=[IS NOT TRUE($t19)], expr#29=[AND($t26, $t28)], expr#30=[OR($t27,
$t29)], proj#0..7=[{exprs}], $cond [...]
- EnumerableMergeJoin(condition=[=($0, $13)], joinType=[left])
+ EnumerableHashJoin(condition=[=($0, $13)], joinType=[left])
EnumerableTableScan(table=[[scott, EMP]])
- EnumerableSort(sort0=[$5], dir0=[ASC])
- EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)],
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)],
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4],
trueLiteral=[$t5], DEPTNO0=[$t0])
- EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
- EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
- EnumerableTableScan(table=[[scott, EMP]])
- EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
- EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6],
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4)
FILTER $6])
- EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5,
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7],
$g_1=[$t9])
- EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]],
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0,
$1)])
- EnumerableCalc(expr#0..2=[{inputs}],
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
- EnumerableTableScan(table=[[scott, DEPT]])
+ EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)],
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)],
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4],
trueLiteral=[$t5], DEPTNO0=[$t0])
+ EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)],
joinType=[left])
+ EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+ EnumerableTableScan(table=[[scott, EMP]])
+ EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
+ EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6],
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4)
FILTER $6])
+ EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5,
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7],
$g_1=[$t9])
+ EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]],
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0,
$1)])
+ EnumerableCalc(expr#0..2=[{inputs}],
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+ EnumerableTableScan(table=[[scott, DEPT]])
!plan
+-------+-------+-----+-----+----------+-----+------+--------+
| EMPNO | ENAME | JOB | MGR | HIREDATE | SAL | COMM | DEPTNO |