(calcite) branch main updated: [CALCITE-7173] Improve RelMdDistinctRowCount estimation for lossless casts

asolimando Wed, 17 Sep 2025 22:30:52 -0700

This is an automated email from the ASF dual-hosted git repository.

asolimando pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git



The following commit(s) were added to refs/heads/main by this push:
     new aa19cc1ba0 [CALCITE-7173] Improve RelMdDistinctRowCount estimation for 
lossless casts
aa19cc1ba0 is described below

commit aa19cc1ba0db5ed94002cbae4b562fd7cdb5cd47
Author: Alessandro Solimando <[email protected]>
AuthorDate: Mon Sep 8 14:09:49 2025 +0200

    [CALCITE-7173] Improve RelMdDistinctRowCount estimation for lossless casts
---
 .../org/apache/calcite/rel/metadata/RelMdUtil.java |  3 +
 .../org/apache/calcite/test/RelMetadataTest.java   | 72 +++++++++++++++++++++-
 core/src/test/resources/sql/sub-query.iq           | 63 +++++++++----------
 3 files changed, 104 insertions(+), 34 deletions(-)

diff --git a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java 
b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
index 17059c8b7d..dc12983f3f 100644
--- a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
+++ b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
@@ -602,6 +602,9 @@ public static void splitCols(
       final RexNode e = projExprs.get(bit);
       if (e instanceof RexInputRef) {
         baseCols.set(((RexInputRef) e).getIndex());
+      } else if (RexUtil.isLosslessCast(e)
+          && ((RexCall) e).getOperands().get(0).isA(SqlKind.INPUT_REF)) {
+        baseCols.set(((RexInputRef) ((RexCall) 
e).getOperands().get(0)).getIndex());
       } else {
         projCols.set(bit);
       }
diff --git a/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java 
b/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
index 38f8ac587f..72f0cd958d 100644
--- a/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
+++ b/core/src/test/java/org/apache/calcite/test/RelMetadataTest.java
@@ -4666,7 +4666,7 @@ static <K, V> List<String> toSortedStringList(Map<K, V> 
map) {
   /** Test case of
    * <a 
href="https://issues.apache.org/jira/browse/CALCITE-7083";>[CALCITE-7083]
    * RelMdDistinctRowCount aggregates implementation problems</a>. */
-  @Test void testAggregateDistinctRowcount() {
+  @Test void testAggregateDistinctRowCount() {
     // test case of groupKey contains aggregated column
     sql("select name, sum(sal) from (values ('b', 10), ('b', 20), ('b', 30)) 
as t(name, sal) "
         + "group by name")
@@ -4694,6 +4694,76 @@ static <K, V> List<String> toSortedStringList(Map<K, V> 
map) {
         .assertThatDistinctRowCount(bitSetOf(0), is(1d));
   }
 
+  @Test void testAggregateDistinctRowCountLosslessCast() {
+    final String sql =
+        "select name, sal, cast(sal as varchar(11)) "
+            + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+            + "group by name, sal, cast(sal as varchar(11))";
+
+    // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(3d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(3d));
+  }
+
+  @Test void testAggregateDistinctRowCountLosslessCastFilterOnField() {
+    final String sql =
+        "select name, sal, cast(sal as varchar(11))"
+            + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+            + "where sal = 10 "
+            + "group by name, sal, cast(sal as varchar(11))";
+
+    // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(1d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(1d));
+  }
+
+  @Test void testAggregateDistinctRowCountLosslessCastFilterOnCastedField() {
+    final String sql =
+        "select name, sal, cast(sal as varchar(11))"
+            + "from (values ('b', 10), ('b', 20), ('b', 30)) t(name, sal) "
+            + "where cast(sal as varchar(11)) = 10 "
+            + "group by name, sal, cast(sal as varchar(11))";
+
+    // we expect NDV($i) = NDV(CAST($i)), if cast is lossless
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(1d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), is(1d));
+  }
+
+  @Test void testAggregateDistinctRowCountLossyCast() {
+    final String sql =
+        "select name, sal, cast(sal as int) "
+            + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name, 
sal) "
+            + "group by name, sal, cast(sal as int)";
+
+    // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), is(3d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), 
closeTo(1.6439107033725735d, 0.1d));
+  }
+
+  @Test void testAggregateDistinctRowCountLossyCastFilterOnField() {
+    final String sql =
+        "select name, sal, cast(sal as int) "
+            + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name, 
sal) "
+            + "where sal > 20 "
+            + "group by name, sal, cast(sal as int)";
+
+    // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), 
closeTo(1.367006838144548, 0.1d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), 
closeTo(1.0744407789565844, 0.1d));
+  }
+
+  @Test void testAggregateDistinctRowCountLossyCastFilterOnCastedField() {
+    final String sql =
+        "select name, sal, cast(sal as int) "
+            + "from (values ('b', 10.1), ('b', 20.2), ('b', 30.3)) t(name, 
sal) "
+            + "where cast(sal as int) > 20 "
+            + "group by name, sal, cast(sal as int)";
+
+    // we expect NDV($i) >= NDV(CAST($i)), if cast is lossy
+    sql(sql).assertThatDistinctRowCount(bitSetOf(1), 
closeTo(1.367006838144548, 0.1d));
+    sql(sql).assertThatDistinctRowCount(bitSetOf(2), 
closeTo(1.0744407789565844, 0.1d));
+  }
+
   private void checkInputForCollationAndLimit(RelOptCluster cluster, 
RelOptTable empTable,
       RelOptTable deptTable) {
     final RexBuilder rexBuilder = cluster.getRexBuilder();
diff --git a/core/src/test/resources/sql/sub-query.iq 
b/core/src/test/resources/sql/sub-query.iq
index 08e924ee6a..330e3d990b 100644
--- a/core/src/test/resources/sql/sub-query.iq
+++ b/core/src/test/resources/sql/sub-query.iq
@@ -2684,17 +2684,16 @@ e.deptno > ANY(
   select 2 from "scott".dept e2 where e2.deptno = e.empno) from "scott".emp as 
e;
 
 EnumerableCalc(expr#0..6=[{inputs}], expr#7=[>($t1, $t2)], expr#8=[IS 
TRUE($t7)], expr#9=[IS NULL($t5)], expr#10=[0], expr#11=[=($t3, $t10)], 
expr#12=[OR($t9, $t11)], expr#13=[IS NOT TRUE($t12)], expr#14=[AND($t8, $t13)], 
expr#15=[>($t3, $t4)], expr#16=[IS TRUE($t15)], expr#17=[null:BOOLEAN], 
expr#18=[IS NOT TRUE($t7)], expr#19=[AND($t16, $t17, $t13, $t18)], expr#20=[IS 
NOT TRUE($t15)], expr#21=[AND($t7, $t13, $t18, $t20)], expr#22=[OR($t14, $t19, 
$t21)], EMPNO=[$t0], EXPR$1=[$t22])
-  EnumerableMergeJoin(condition=[=($0, $6)], joinType=[left])
+  EnumerableHashJoin(condition=[=($0, $6)], joinType=[left])
     EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], DEPTNO=[$t7])
       EnumerableTableScan(table=[[scott, EMP]])
-    EnumerableSort(sort0=[$4], dir0=[ASC])
-      EnumerableCalc(expr#0..4=[{inputs}], expr#5=[IS NOT NULL($t3)], 
expr#6=[0], expr#7=[CASE($t5, $t3, $t6)], m=[$t2], c=[$t7], d=[$t7], 
trueLiteral=[$t4], DEPTNO0=[$t0])
-        EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
-          EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
-            EnumerableTableScan(table=[[scott, EMP]])
-          EnumerableAggregate(group=[{0}], m=[MIN($1)], c=[COUNT()], 
trueLiteral=[LITERAL_AGG(true)])
-            EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t0):SMALLINT 
NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
-              EnumerableTableScan(table=[[scott, DEPT]])
+    EnumerableCalc(expr#0..4=[{inputs}], expr#5=[IS NOT NULL($t3)], 
expr#6=[0], expr#7=[CASE($t5, $t3, $t6)], m=[$t2], c=[$t7], d=[$t7], 
trueLiteral=[$t4], DEPTNO0=[$t0])
+      EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
+        EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+          EnumerableTableScan(table=[[scott, EMP]])
+        EnumerableAggregate(group=[{0}], m=[MIN($1)], c=[COUNT()], 
trueLiteral=[LITERAL_AGG(true)])
+          EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t0):SMALLINT NOT 
NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+            EnumerableTableScan(table=[[scott, DEPT]])
 !plan
  EMPNO | EXPR$1
 -------+--------
@@ -2865,19 +2864,18 @@ select *
 from "scott".emp as emp1
 where empno <> some (select 2 from "scott".dept dept1 where dept1.deptno = 
emp1.empno);
 EnumerableCalc(expr#0..13=[{inputs}], expr#14=[<>($t9, $t8)], expr#15=[1], 
expr#16=[<=($t10, $t15)], expr#17=[<>($t0, $t11)], expr#18=[IS NULL($t12)], 
expr#19=[0], expr#20=[=($t8, $t19)], expr#21=[OR($t18, $t20)], expr#22=[IS NOT 
TRUE($t21)], expr#23=[AND($t14, $t16, $t17, $t22)], expr#24=[=($t10, $t15)], 
expr#25=[IS NOT NULL($t10)], expr#26=[AND($t14, $t25)], expr#27=[IS NOT 
TRUE($t26)], expr#28=[AND($t24, $t17, $t22, $t27)], expr#29=[AND($t14, $t16)], 
expr#30=[IS NOT TRUE($t29)], expr# [...]
-  EnumerableMergeJoin(condition=[=($0, $13)], joinType=[left])
+  EnumerableHashJoin(condition=[=($0, $13)], joinType=[left])
     EnumerableTableScan(table=[[scott, EMP]])
-    EnumerableSort(sort0=[$5], dir0=[ASC])
-      EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)], 
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)], 
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4], 
trueLiteral=[$t5], DEPTNO0=[$t0])
-        EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
-          EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
-            EnumerableTableScan(table=[[scott, EMP]])
-          EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT 
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT 
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
-            EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6], 
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4) 
FILTER $6])
-              EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5, 
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7], 
$g_1=[$t9])
-                EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]], 
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0, 
$1)])
-                  EnumerableCalc(expr#0..2=[{inputs}], 
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
-                    EnumerableTableScan(table=[[scott, DEPT]])
+    EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)], 
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)], 
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4], 
trueLiteral=[$t5], DEPTNO0=[$t0])
+      EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
+        EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+          EnumerableTableScan(table=[[scott, EMP]])
+        EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT 
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT 
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
+          EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6], 
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4) 
FILTER $6])
+            EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5, 
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7], 
$g_1=[$t9])
+              EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]], 
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0, 
$1)])
+                EnumerableCalc(expr#0..2=[{inputs}], 
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+                  EnumerableTableScan(table=[[scott, DEPT]])
 !plan
 +-------+-------+-----+-----+----------+-----+------+--------+
 | EMPNO | ENAME | JOB | MGR | HIREDATE | SAL | COMM | DEPTNO |
@@ -2917,19 +2915,18 @@ select *
 from "scott".emp as emp1
 where comm <> some (select 2 from "scott".dept dept1 where dept1.deptno = 
emp1.empno);
 EnumerableCalc(expr#0..13=[{inputs}], expr#14=[<>($t9, $t8)], expr#15=[1], 
expr#16=[<=($t10, $t15)], expr#17=[AND($t14, $t16)], expr#18=[=($t10, $t15)], 
expr#19=[OR($t17, $t18)], expr#20=[<>($t6, $t11)], expr#21=[IS NULL($t12)], 
expr#22=[IS NULL($t6)], expr#23=[0], expr#24=[=($t8, $t23)], expr#25=[OR($t21, 
$t22, $t24)], expr#26=[IS NOT TRUE($t25)], expr#27=[AND($t19, $t20, $t26)], 
expr#28=[IS NOT TRUE($t19)], expr#29=[AND($t26, $t28)], expr#30=[OR($t27, 
$t29)], proj#0..7=[{exprs}], $cond [...]
-  EnumerableMergeJoin(condition=[=($0, $13)], joinType=[left])
+  EnumerableHashJoin(condition=[=($0, $13)], joinType=[left])
     EnumerableTableScan(table=[[scott, EMP]])
-    EnumerableSort(sort0=[$5], dir0=[ASC])
-      EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)], 
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)], 
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4], 
trueLiteral=[$t5], DEPTNO0=[$t0])
-        EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
-          EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
-            EnumerableTableScan(table=[[scott, EMP]])
-          EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT 
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT 
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
-            EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6], 
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4) 
FILTER $6])
-              EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5, 
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7], 
$g_1=[$t9])
-                EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]], 
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0, 
$1)])
-                  EnumerableCalc(expr#0..2=[{inputs}], 
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
-                    EnumerableTableScan(table=[[scott, DEPT]])
+    EnumerableCalc(expr#0..5=[{inputs}], expr#6=[IS NOT NULL($t2)], 
expr#7=[0], expr#8=[CASE($t6, $t2, $t7)], expr#9=[IS NOT NULL($t3)], 
expr#10=[CASE($t9, $t3, $t7)], c=[$t8], d=[$t8], dd=[$t10], m=[$t4], 
trueLiteral=[$t5], DEPTNO0=[$t0])
+      EnumerableNestedLoopJoin(condition=[IS NOT DISTINCT FROM($0, $1)], 
joinType=[left])
+        EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0])
+          EnumerableTableScan(table=[[scott, EMP]])
+        EnumerableCalc(expr#0..4=[{inputs}], expr#5=[CAST($t1):BIGINT NOT 
NULL], expr#6=[CAST($t3):INTEGER NOT NULL], expr#7=[CAST($t4):BOOLEAN NOT 
NULL], DEPTNO0=[$t0], c=[$t5], dd=[$t2], m=[$t6], trueLiteral=[$t7])
+          EnumerableAggregate(group=[{0}], c=[MIN($2) FILTER $6], 
dd=[COUNT($1) FILTER $5], m=[MIN($3) FILTER $6], trueLiteral=[MIN(true, $4) 
FILTER $6])
+            EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[=($t5, 
$t6)], expr#8=[1], expr#9=[=($t5, $t8)], proj#0..4=[{exprs}], $g_0=[$t7], 
$g_1=[$t9])
+              EnumerableAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}]], 
c=[COUNT()], m=[MAX($1)], trueLiteral=[LITERAL_AGG(true)], $g=[GROUPING($0, 
$1)])
+                EnumerableCalc(expr#0..2=[{inputs}], 
expr#3=[CAST($t0):SMALLINT NOT NULL], expr#4=[2], DEPTNO0=[$t3], EXPR$0=[$t4])
+                  EnumerableTableScan(table=[[scott, DEPT]])
 !plan
 +-------+-------+-----+-----+----------+-----+------+--------+
 | EMPNO | ENAME | JOB | MGR | HIREDATE | SAL | COMM | DEPTNO |

(calcite) branch main updated: [CALCITE-7173] Improve RelMdDistinctRowCount estimation for lossless casts

Reply via email to