(cloudberry) branch main updated: Add option to avoid additional generated EquivalenceCalss for RelabelType in cdb_pull_up_eclass.

avamingli Tue, 12 Aug 2025 23:58:02 -0700

This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git



The following commit(s) were added to refs/heads/main by this push:
     new 91c28e9a97e Add option to avoid additional generated EquivalenceCalss 
for RelabelType in cdb_pull_up_eclass.
91c28e9a97e is described below

commit 91c28e9a97e8e8444f42c6e5435b71ed292ada5f
Author: Zhang Mingli <[email protected]>
AuthorDate: Mon Aug 11 15:34:28 2025 +0800

    Add option to avoid additional generated EquivalenceCalss for RelabelType 
in cdb_pull_up_eclass.
    
    Fix issue https://github.com/apache/cloudberry/issues/1301
    
    For partition tables, cdb_pull_up_eclass will call get_eclass_for_sort_expr
    to find an existing equivalence class it is a member of.
    It will fail and build a new single-member EquivalenceClass for it when 
there is RelabelType.
    That will cause cdbpath_match_preds_to_both_distkeys return false even the 
locus are compatible
    because the RestrictInfo's left_ec/right_ec and rel's distribution key's 
dk_eclasses are two
    different pointers.
    And unnecessary Motions will be added in that case.
    
    CREATE TABLE t1 (id varchar(32), date date) DISTRIBUTED BY (id)
    PARTITION BY RANGE (date)
    (START (date '2016-01-01') INCLUSIVE END (date '2016-01-02') EXCLUSIVE 
EVERY (INTERVAL '1 day'));
    CREATE TABLE t2 (id varchar(32)) DISTRIBUTED BY (id);
    EXPLAIN(COSTS OFF) SELECT COUNT(*) FROM t1 JOIN t2 USING(id);
                                QUERY PLAN
    ------------------------------------------------------------------
     Aggregate
       ->  Gather Motion 3:1  (slice1; segments: 3)
             ->  Hash Join
                   Hash Cond: ((t1.id)::text = (t2.id)::text)
                   ->  Redistribute Motion 3:3  (slice2; segments: 3)
                         Hash Key: t1.id
                         ->  Seq Scan on t1_1_prt_1 t1
                   ->  Hash
                         ->  Seq Scan on t2
     Optimizer: Postgres query optimizer
    (10 rows)
    
    cdbpullup_findEclassInTargetList() will ignore the presence or absence of a 
RelabelType node atop
    either expr in determining whether an EC member expr matches a targetlist 
expr.
    But get_eclass_for_sort_expr() didn't handle additional RelabelType and a 
new EC may be generated.
    We should not disturb get_eclass_for_sort_expr() as it's used at many 
places and UPSTREAM didn't
    add RelabelType to EC for that case.
    If find a sub_distkeyexpr from cdbpullup_findEclassInTargetList and we know 
that there is RelabelType
    stripped inside, we don't need to try to fetch or generate new EC in 
get_eclass_for_sort_expr().
    So, add options for cdb_pull_up_eclass to ignore RelabelType when needed.
    
    Authored-by: Zhang Mingli [email protected]
---
 src/backend/cdb/cdbpathlocus.c               |  8 ++-
 src/backend/cdb/cdbpullup.c                  | 12 +++-
 src/backend/optimizer/path/pathkeys.c        |  9 ++-
 src/include/cdb/cdbpullup.h                  |  2 +-
 src/include/optimizer/paths.h                |  3 +-
 src/test/regress/expected/partition_join.out | 95 ++++++++++++++++++++++++++++
 src/test/regress/sql/partition_join.sql      | 40 ++++++++++++
 7 files changed, 160 insertions(+), 9 deletions(-)

diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c
index 04fba4b367a..29930085429 100644
--- a/src/backend/cdb/cdbpathlocus.c
+++ b/src/backend/cdb/cdbpathlocus.c
@@ -518,7 +518,8 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root,
                                                                                
          rel->relids,
                                                                                
          usable_subtlist,
                                                                                
          new_vars,
-                                                                               
          -1 /* not used */);
+                                                                               
          -1 /* not used */,
+                                                                               
          false/* not used */);
                                if (outer_ec)
                                {
                                        outer_dk = makeNode(DistributionKey);
@@ -589,7 +590,7 @@ cdbpathlocus_get_distkey_exprs(CdbPathLocus locus,
                                EquivalenceClass *dk_eclass = (EquivalenceClass 
*) lfirst(ec_cell);
 
                                item = 
cdbpullup_findEclassInTargetList(dk_eclass, targetlist,
-                                                                               
                                distkey->dk_opfamily);
+                                                                               
                                distkey->dk_opfamily, NULL);
 
                                if (item)
                                        break;
@@ -668,7 +669,8 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root,
                                                                                
        relids,
                                                                                
        targetlist,
                                                                                
        newvarlist,
-                                                                               
        newrelid);
+                                                                               
        newrelid,
+                                                                               
        true /* ignore RelabelType */);
                                if (new_ec)
                                        break;
                        }
diff --git a/src/backend/cdb/cdbpullup.c b/src/backend/cdb/cdbpullup.c
index c6f243ae08a..a7aa6486856 100644
--- a/src/backend/cdb/cdbpullup.c
+++ b/src/backend/cdb/cdbpullup.c
@@ -241,7 +241,7 @@ cdbpullup_expr(Expr *expr, List *targetlist, List 
*newvarlist, Index newvarno)
  */
 Expr *
 cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, List *targetlist,
-                                                                Oid 
hashOpFamily)
+                                                                Oid 
hashOpFamily, bool *relabel_stripped)
 {
        ListCell   *lc;
 
@@ -276,7 +276,11 @@ cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, 
List *targetlist,
                 *-------
                 */
                while (IsA(key, RelabelType))
+               {
                        key = (Expr *) ((RelabelType *) key)->arg;
+                       if(relabel_stripped && (!*relabel_stripped))
+                               *relabel_stripped = true;
+               }
 
                foreach(lc_tle, targetlist)
                {
@@ -293,7 +297,11 @@ cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, 
List *targetlist,
                        /* ignore RelabelType nodes on both sides */
                        naked_tlexpr = tlexpr;
                        while (naked_tlexpr && IsA(naked_tlexpr, RelabelType))
+                       {
                                naked_tlexpr = (Node *) ((RelabelType *) 
naked_tlexpr)->arg;
+                               if(relabel_stripped && (!*relabel_stripped))
+                                       *relabel_stripped = true;
+                       }
 
                        if (IsA(key, Var))
                        {
@@ -342,7 +350,7 @@ cdbpullup_truncatePathKeysForTargetList(List *pathkeys, 
List *targetlist)
        {
                PathKey    *pk = (PathKey *) lfirst(lc);
 
-               if (!cdbpullup_findEclassInTargetList(pk->pk_eclass, 
targetlist, InvalidOid))
+               if (!cdbpullup_findEclassInTargetList(pk->pk_eclass, 
targetlist, InvalidOid, NULL))
                        break;
 
                new_pathkeys = lappend(new_pathkeys, pk);
diff --git a/src/backend/optimizer/path/pathkeys.c 
b/src/backend/optimizer/path/pathkeys.c
index ece5f3cde82..48a60c137d3 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -1551,23 +1551,28 @@ cdb_pull_up_eclass(PlannerInfo *root,
                                   Relids relids,
                                   List *targetlist,
                                   List *newvarlist,
-                                  Index newrelid)
+                                  Index newrelid,
+                                  bool ignore_relabel)
 {
        Expr       *sub_distkeyexpr;
        EquivalenceClass *outer_ec;
        Expr       *newexpr = NULL;
        Index sortref = 0;
+       bool    relabel_stripped = false;
 
        Assert(eclass);
        Assert(!newvarlist ||
                   list_length(newvarlist) == list_length(targetlist));
 
        /* Find an expr that we can rewrite to use the projected columns. */
-       sub_distkeyexpr = cdbpullup_findEclassInTargetList(eclass, targetlist, 
InvalidOid);
+       sub_distkeyexpr = cdbpullup_findEclassInTargetList(eclass, targetlist, 
InvalidOid, &relabel_stripped);
 
        /* Replace expr's Var nodes with new ones referencing the targetlist. */
        if (sub_distkeyexpr)
        {
+               if (ignore_relabel && relabel_stripped)
+                       return eclass;
+
                newexpr = cdbpullup_expr(sub_distkeyexpr,
                                                                 targetlist,
                                                                 newvarlist,
diff --git a/src/include/cdb/cdbpullup.h b/src/include/cdb/cdbpullup.h
index 535b2987ce1..8c23a4a64f2 100644
--- a/src/include/cdb/cdbpullup.h
+++ b/src/include/cdb/cdbpullup.h
@@ -55,7 +55,7 @@
 Expr *
 cdbpullup_expr(Expr *expr, List *targetlist, List *newvarlist, Index newvarno);
 
-extern Expr *cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, List 
*targetlist, Oid hashOpFamily);
+extern Expr *cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, List 
*targetlist, Oid hashOpFamily, bool *relabel_stripped);
 
 extern List *cdbpullup_truncatePathKeysForTargetList(List *pathkeys, List 
*targetlist);
 
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 9b890e95566..a90c654cea6 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -283,7 +283,8 @@ cdb_pull_up_eclass(PlannerInfo    *root,
                     Relids          relids,
                     List           *targetlist,
                     List           *newvarlist,
-                    Index           newrelid);
+                    Index           newrelid,
+                                       bool                    ignore_relabel);
 
 extern List *make_pathkeys_for_sortclauses(PlannerInfo *root,
                                                                                
   List *sortclauses,
diff --git a/src/test/regress/expected/partition_join.out 
b/src/test/regress/expected/partition_join.out
index 80bde64b3db..107cbc746e5 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -5700,3 +5700,98 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON 
(t1.a = t2.a AND t1.b = t2
   1 | 209 | 0009 |  1 | 209 | 0009
 (8 rows)
 
+--
+-- test issue https://github.com/apache/cloudberry/issues/1301
+--
+begin;
+create table t_issue_1301_big(
+       id varchar(32),
+       t varchar(32)
+) distributed by (id)
+partition by range(t)
+(
+partition p1 start ('0') end ('5'),
+partition p2 start ('5') end ('9999999999999999999')
+);
+create index idx_t_issue_1301_big_id on t_issue_1301_big(id);
+insert into t_issue_1301_big select seq, seq from generate_series(1, 100000) 
as seq;
+create table t_issue_1301_small(
+  id varchar(32),
+  t varchar(32)
+) distributed by (id);
+insert into t_issue_1301_small select seq*10000, seq*10000 from 
generate_series(1, 100) as seq;
+set local optimizer = off;
+set local enable_nestloop to on;
+analyze t_issue_1301_big;
+analyze t_issue_1301_small;
+explain(costs off) select a.* from t_issue_1301_small a left join 
t_issue_1301_big b on a.id=b.id;
+                                                QUERY PLAN                     
                            
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Nested Loop Left Join
+         ->  Seq Scan on t_issue_1301_small a
+         ->  Append
+               ->  Index Only Scan using t_issue_1301_big_1_prt_p1_id_idx on 
t_issue_1301_big_1_prt_p1 b_1
+                     Index Cond: (id = (a.id)::text)
+               ->  Index Only Scan using t_issue_1301_big_1_prt_p2_id_idx on 
t_issue_1301_big_1_prt_p2 b_2
+                     Index Cond: (id = (a.id)::text)
+ Optimizer: Postgres query optimizer
+(9 rows)
+
+abort;
+BEGIN;
+CREATE TABLE t1 (id varchar(32), date date) DISTRIBUTED BY (id)
+PARTITION BY RANGE (date)
+(START (date '2016-01-01') INCLUSIVE END (date '2016-01-04') EXCLUSIVE EVERY 
(INTERVAL '1 day'));
+CREATE TABLE t2 (id varchar(32)) DISTRIBUTED BY (id);
+analyze t1;
+analyze t2;
+\d+ t1;
+                                      Partitioned table "public.t1"
+ Column |         Type          | Collation | Nullable | Default | Storage  | 
Stats target | Description 
+--------+-----------------------+-----------+----------+---------+----------+--------------+-------------
+ id     | character varying(32) |           |          |         | extended |  
            | 
+ date   | date                  |           |          |         | plain    |  
            | 
+Partition key: RANGE (date)
+Partitions: t1_1_prt_1 FOR VALUES FROM ('01-01-2016') TO ('01-02-2016'),
+            t1_1_prt_2 FOR VALUES FROM ('01-02-2016') TO ('01-03-2016'),
+            t1_1_prt_3 FOR VALUES FROM ('01-03-2016') TO ('01-04-2016')
+Distributed by: (id)
+
+\d+ t2;
+                                            Table "public.t2"
+ Column |         Type          | Collation | Nullable | Default | Storage  | 
Stats target | Description 
+--------+-----------------------+-----------+----------+---------+----------+--------------+-------------
+ id     | character varying(32) |           |          |         | extended |  
            | 
+Distributed by: (id)
+
+EXPLAIN(COSTS OFF) SELECT COUNT(*) FROM t1_1_prt_1 JOIN t2 USING(id);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Aggregate
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         ->  Hash Join
+               Hash Cond: ((t1_1_prt_1.id)::text = (t2.id)::text)
+               ->  Seq Scan on t1_1_prt_1
+               ->  Hash
+                     ->  Seq Scan on t2
+ Optimizer: Postgres query optimizer
+(8 rows)
+
+EXPLAIN(COSTS OFF) SELECT COUNT(*) FROM t1 JOIN t2 USING(id);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Aggregate
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         ->  Hash Join
+               Hash Cond: ((t1.id)::text = (t2.id)::text)
+               ->  Append
+                     ->  Seq Scan on t1_1_prt_1 t1_1
+                     ->  Seq Scan on t1_1_prt_2 t1_2
+                     ->  Seq Scan on t1_1_prt_3 t1_3
+               ->  Hash
+                     ->  Seq Scan on t2
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+ABORT;
diff --git a/src/test/regress/sql/partition_join.sql 
b/src/test/regress/sql/partition_join.sql
index 24baa458b1d..404f9242b16 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1173,3 +1173,43 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON 
(t1.a = t2.a AND t1.c = t2
 EXPLAIN (COSTS OFF)
 SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = 
t2.b AND t1.c = t2.c) WHERE ((t1.b >= 100 AND t1.b < 110) OR (t1.b >= 200 AND 
t1.b < 210)) AND ((t2.b >= 100 AND t2.b < 110) OR (t2.b >= 200 AND t2.b < 210)) 
AND t1.c IN ('0004', '0009') ORDER BY t1.a, t1.b;
 SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = 
t2.b AND t1.c = t2.c) WHERE ((t1.b >= 100 AND t1.b < 110) OR (t1.b >= 200 AND 
t1.b < 210)) AND ((t2.b >= 100 AND t2.b < 110) OR (t2.b >= 200 AND t2.b < 210)) 
AND t1.c IN ('0004', '0009') ORDER BY t1.a, t1.b;
+
+--
+-- test issue https://github.com/apache/cloudberry/issues/1301
+--
+begin;
+create table t_issue_1301_big(
+       id varchar(32),
+       t varchar(32)
+) distributed by (id)
+partition by range(t)
+(
+partition p1 start ('0') end ('5'),
+partition p2 start ('5') end ('9999999999999999999')
+);
+create index idx_t_issue_1301_big_id on t_issue_1301_big(id);
+insert into t_issue_1301_big select seq, seq from generate_series(1, 100000) 
as seq;
+create table t_issue_1301_small(
+  id varchar(32),
+  t varchar(32)
+) distributed by (id);
+insert into t_issue_1301_small select seq*10000, seq*10000 from 
generate_series(1, 100) as seq;
+set local optimizer = off;
+set local enable_nestloop to on;
+analyze t_issue_1301_big;
+analyze t_issue_1301_small;
+explain(costs off) select a.* from t_issue_1301_small a left join 
t_issue_1301_big b on a.id=b.id;
+abort;
+
+BEGIN;
+CREATE TABLE t1 (id varchar(32), date date) DISTRIBUTED BY (id)
+PARTITION BY RANGE (date)
+(START (date '2016-01-01') INCLUSIVE END (date '2016-01-04') EXCLUSIVE EVERY 
(INTERVAL '1 day'));
+CREATE TABLE t2 (id varchar(32)) DISTRIBUTED BY (id);
+analyze t1;
+analyze t2;
+\d+ t1;
+\d+ t2;
+EXPLAIN(COSTS OFF) SELECT COUNT(*) FROM t1_1_prt_1 JOIN t2 USING(id);
+EXPLAIN(COSTS OFF) SELECT COUNT(*) FROM t1 JOIN t2 USING(id);
+ABORT;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) branch main updated: Add option to avoid additional generated EquivalenceCalss for RelabelType in cdb_pull_up_eclass.

Reply via email to