(cloudberry) branch main updated: Fix ORCA choosing wrong column type for CTAS with UNION ALL (#1431) (#1645)

yjhjstz Wed, 01 Apr 2026 05:43:31 -0700

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git



The following commit(s) were added to refs/heads/main by this push:
     new 95486365241 Fix ORCA choosing wrong column type for CTAS with UNION 
ALL (#1431) (#1645)
95486365241 is described below

commit 95486365241206725901fb2e1c5fa97149f155fa
Author: Jianghua.yjh <[email protected]>
AuthorDate: Wed Apr 1 05:43:21 2026 -0700

    Fix ORCA choosing wrong column type for CTAS with UNION ALL (#1431) (#1645)
    
    * Fix ORCA choosing wrong column type for CTAS with UNION ALL (#1431)
    
    When removing redundant Result nodes in the ORCA post-processing,
    push_down_expr_mutator replaces parent Var nodes with child expressions.
    It already propagates typmod for Const child expressions, but missed the
    case where the child expression is also a Var. This caused the correctly
    resolved common typmod (e.g. -1 for varchar without length) to be
    overwritten by the child's original typmod (e.g. varchar(1)), resulting
    in wrong column types in the created table.
    
    * Add regression test for CTAS with UNION ALL typmod fix (#1431)
    
    Verify that ORCA produces the correct column type (character varying
    without length limit) when creating a table from UNION ALL of branches
    with different varchar lengths.
    
    ---------
    
    Co-authored-by: reshke <[email protected]>
---
 src/backend/optimizer/plan/orca.c                |   4 +
 src/test/regress/expected/union_gp.out           |  30 +++++
 src/test/regress/expected/union_gp_optimizer.out | 134 +++++++++++++++--------
 src/test/regress/sql/union_gp.sql                |  25 ++++-
 4 files changed, 147 insertions(+), 46 deletions(-)

diff --git a/src/backend/optimizer/plan/orca.c 
b/src/backend/optimizer/plan/orca.c
index 97f63f7a334..514385cc2e9 100644
--- a/src/backend/optimizer/plan/orca.c
+++ b/src/backend/optimizer/plan/orca.c
@@ -545,6 +545,10 @@ push_down_expr_mutator(Node *node, List *child_tlist)
                        {
                                ((Const *) child_tle->expr)->consttypmod = 
((Var *) node)->vartypmod;
                        }
+                       else if (IsA(child_tle->expr, Var))
+                       {
+                               ((Var *) child_tle->expr)->vartypmod = ((Var *) 
node)->vartypmod;
+                       }
 
                        return (Node *) child_tle->expr;
                }
diff --git a/src/test/regress/expected/union_gp.out 
b/src/test/regress/expected/union_gp.out
index 5bdae3e887c..d134f223502 100644
--- a/src/test/regress/expected/union_gp.out
+++ b/src/test/regress/expected/union_gp.out
@@ -2342,6 +2342,36 @@ with result as (update r_1240 set a = a +1 where a < 5 
returning *) select * fro
 drop table r_1240;
 drop table p1_1240;
 --
+-- Test CTAS with UNION ALL when branches have different typmods (issue #1431).
+-- ORCA should resolve the output column type to character varying (no length),
+-- same as the Postgres planner, instead of picking the first branch's typmod.
+--
+create table union_ctas_t1(id int, name varchar(1));
+create table union_ctas_t2(id int, name varchar(2));
+insert into union_ctas_t1 values (1, 'a');
+insert into union_ctas_t2 values (1, 'ab');
+create table union_ctas_result as
+  (select id, name from union_ctas_t1)
+  union all
+  (select id, name from union_ctas_t2);
+-- name column should be "character varying" without length, not varchar(1)
+select atttypmod from pg_attribute
+where attrelid = 'union_ctas_result'::regclass and attname = 'name';
+ atttypmod 
+-----------
+        -1
+(1 row)
+
+-- data should not be truncated
+select * from union_ctas_result order by name;
+ id | name 
+----+------
+  1 | a
+  1 | ab
+(2 rows)
+
+drop table union_ctas_t1, union_ctas_t2, union_ctas_result;
+--
 -- Clean up
 --
 DROP TABLE IF EXISTS T_a1 CASCADE;
diff --git a/src/test/regress/expected/union_gp_optimizer.out 
b/src/test/regress/expected/union_gp_optimizer.out
index 8ff8655591d..8704f0fe7a7 100644
--- a/src/test/regress/expected/union_gp_optimizer.out
+++ b/src/test/regress/expected/union_gp_optimizer.out
@@ -1,7 +1,7 @@
 -- Additional GPDB-added tests for UNION
 SET optimizer_trace_fallback=on;
 create temp table t_union1 (a int, b int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 select distinct a, null::integer as c from t_union1 union select a, b from 
t_union1;
  a | c 
@@ -44,8 +44,8 @@ LINE 1: select 1 intersect (select 1, 2 union all select 3, 
4);
 select 1 a, row_number() over (partition by 'a') union all (select 1 a , 2 b);
  a | row_number 
 ---+------------
- 1 |          2
  1 |          1
+ 1 |          2
 (2 rows)
 
 -- This should preserve domain types
@@ -104,8 +104,7 @@ DETAIL:  Falling back to Postgres-based planner because 
GPORCA does not support
 (1 row)
 
 CREATE TABLE union_ctas (a, b) AS SELECT 1, 2 UNION SELECT 1, 1 UNION SELECT 
1, 1;
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'a' as the Greenplum Database data distribution key for this table.
-HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
 SELECT * FROM union_ctas;
  a | b 
 ---+---
@@ -116,11 +115,9 @@ SELECT * FROM union_ctas;
 DROP TABLE union_ctas;
 -- MPP-21075: push quals below union
 CREATE TABLE union_quals1 (a, b) AS SELECT i, i%2 from generate_series(1,10) i;
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'a' as the Greenplum Database data distribution key for this table.
-HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
 CREATE TABLE union_quals2 (a, b) AS SELECT i%2, i from generate_series(1,10) i;
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'a' as the Greenplum Database data distribution key for this table.
-HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
 SELECT * FROM (SELECT a, b from union_quals1 UNION SELECT b, a from 
union_quals2) as foo(a,b) where a > b order by a;
  a  | b 
 ----+---
@@ -225,7 +222,7 @@ select distinct a from (select  distinct 'A' from (select 
'C' from (select disti
 -- on a single QE.
 --
 CREATE TABLE test1 (id int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 insert into test1 values (1);
 CREATE EXTERNAL WEB TABLE test2 (id int) EXECUTE 'echo 2' ON COORDINATOR 
FORMAT 'csv';
@@ -234,8 +231,8 @@ union
 (SELECT 'test2' as branch, id FROM test2);
  branch | id 
 --------+----
- test1  |  1
  test2  |  2
+ test1  |  1
 (2 rows)
 
 explain (SELECT 'test1' as branch, id FROM test1 LIMIT 1)
@@ -243,10 +240,10 @@ union
 (SELECT 'test2' as branch, id FROM test2);
                                                           QUERY PLAN           
                                               
 
------------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..984.78 rows=1125 
width=12)
-   ->  HashAggregate  (cost=0.00..984.73 rows=375 width=12)
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..985.86 rows=1125 
width=12)
+   ->  HashAggregate  (cost=0.00..985.81 rows=375 width=12)
          Group Key: ('test1'::text), test1.id
-         ->  Append  (cost=0.00..984.65 rows=334 width=12)
+         ->  Append  (cost=0.00..985.73 rows=334 width=12)
                ->  Redistribute Motion 1:3  (slice2)  (cost=0.00..431.00 
rows=1 width=12)
                      Hash Key: ('test1'::text), test1.id
                      ->  GroupAggregate  (cost=0.00..431.00 rows=1 width=12)
@@ -257,16 +254,16 @@ union
                                        ->  Result  (cost=0.00..431.00 rows=1 
width=12)
                                              ->  Gather Motion 3:1  (slice3; 
segments: 3)  (cost=0.00..431.00 rows=1 width=4)
                                                    ->  Seq Scan on test1  
(cost=0.00..431.00 rows=1 width=4)
-               ->  HashAggregate  (cost=0.00..553.64 rows=334 width=12)
-                     Group Key: ('test2'::text), test2.id
-                     ->  Redistribute Motion 3:3  (slice4; segments: 3)  
(cost=0.00..553.56 rows=334 width=12)
-                           Hash Key: ('test2'::text), test2.id
-                           ->  Streaming HashAggregate  (cost=0.00..553.55 
rows=334 width=12)
-                                 Group Key: 'test2'::text, test2.id
-                                 ->  Result  (cost=0.00..471.53 rows=333334 
width=12)
-                                       ->  Redistribute Motion 1:3  (slice5)  
(cost=0.00..467.53 rows=333334 width=4)
-                                             ->  Foreign Scan on test2  
(cost=0.00..449.70 rows=1000000 width=4)
- Optimizer: Pivotal Optimizer (GPORCA)
+               ->  HashAggregate  (cost=0.00..554.73 rows=334 width=12)
+                     Group Key: ('test2'::text), id
+                     ->  Redistribute Motion 3:3  (slice4; segments: 3)  
(cost=0.00..554.64 rows=334 width=12)
+                           Hash Key: ('test2'::text), id
+                           ->  Streaming HashAggregate  (cost=0.00..554.63 
rows=334 width=12)
+                                 Group Key: 'test2'::text, id
+                                 ->  Result  (cost=0.00..473.73 rows=333334 
width=12)
+                                       ->  Redistribute Motion 1:3  (slice5)  
(cost=0.00..469.73 rows=333334 width=4)
+                                             ->  Foreign Scan on test2  
(cost=0.00..451.90 rows=1000000 width=4)
+ Optimizer: GPORCA
 (24 rows)
 
 --
@@ -320,8 +317,8 @@ INFO:  GPORCA failed to produce a plan, falling back to 
Postgres-based planner
 DETAIL:  Unknown error: Partially Distributed Data
                                       QUERY PLAN                               
        
 
---------------------------------------------------------------------------------------
- Gather Motion 1:1  (slice1; segments: 1)  (cost=1922.00..1922.00 rows=172200 
width=8)
-   ->  Append  (cost=0.00..1922.00 rows=172200 width=8)
+ Gather Motion 1:1  (slice1; segments: 1)  (cost=2783.00..2783.00 rows=172200 
width=8)
+   ->  Append  (cost=0.00..2783.00 rows=172200 width=8)
          ->  Seq Scan on rep2  (cost=0.00..961.00 rows=86100 width=8)
          ->  Seq Scan on rep3  (cost=0.00..961.00 rows=86100 width=8)
  Optimizer: Postgres query optimizer
@@ -353,7 +350,7 @@ INSERT INTO T_a1 SELECT i, i%5 from generate_series(1,10) i;
 CREATE TABLE T_b2 (b1 int, b2 int) DISTRIBUTED BY(b2);
 INSERT INTO T_b2 SELECT i, i%5 from generate_series(1,20) i;
 CREATE TABLE T_random (c1 int, c2 int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 INSERT INTO T_random SELECT i, i%5 from generate_series(1,30) i;
 --start_ignore
@@ -2079,14 +2076,18 @@ insert into t1_ncols values (1, 11, 'one', 
'2001-01-01');
 insert into t2_ncols values (2, 22, 'two', '2002-02-02');
 insert into t2_ncols values (4, 44, 'four','2004-04-04');
 select b from t1_ncols union all select a from t2_ncols;
+NOTICE:  One or more columns in the following table(s) do not have statistics: 
t2_ncols
+HINT:  For non-partitioned tables, run analyze <table_name>(<column_list>). 
For partitioned tables, run analyze rootpartition <table_name>(<column_list>). 
See log for columns missing statistics.
  b  
 ----
-  4
-  2
  11
+  2
+  4
 (3 rows)
 
 select a+100, b, d from t1_ncols union select b, a+200, d from t2_ncols order 
by 1;
+NOTICE:  One or more columns in the following table(s) do not have statistics: 
t2_ncols
+HINT:  For non-partitioned tables, run analyze <table_name>(<column_list>). 
For partitioned tables, run analyze rootpartition <table_name>(<column_list>). 
See log for columns missing statistics.
  ?column? |  b  |     d      
 ----------+-----+------------
        22 | 202 | 02-02-2002
@@ -2095,15 +2096,19 @@ select a+100, b, d from t1_ncols union select b, a+200, 
d from t2_ncols order by
 (3 rows)
 
 select c, a from v1_ncols;
+NOTICE:  One or more columns in the following table(s) do not have statistics: 
t2_ncols
+HINT:  For non-partitioned tables, run analyze <table_name>(<column_list>). 
For partitioned tables, run analyze rootpartition <table_name>(<column_list>). 
See log for columns missing statistics.
   c   | a 
 ------+---
  one  | 1
- four | 4
  two  | 2
+ four | 4
 (3 rows)
 
 with cte1(aa, b, c, d) as (select a*100, b, c, d from t1_ncols union select * 
from t2_ncols)
 select x.aa/100 aaa, x.c, y.c from cte1 x join cte1 y on x.aa=y.aa;
+NOTICE:  One or more columns in the following table(s) do not have statistics: 
t2_ncols
+HINT:  For non-partitioned tables, run analyze <table_name>(<column_list>). 
For partitioned tables, run analyze rootpartition <table_name>(<column_list>). 
See log for columns missing statistics.
  aaa |  c   |  c   
 -----+------+------
    0 | two  | two
@@ -2122,13 +2127,13 @@ NOTICE:  schema "union_schema" does not exist, skipping
 -- end_ignore
 create schema union_schema;
 create table union_schema.t1(a int, b int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 create table union_schema.t2(a int, b int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 create table union_schema.t3(a int, b int);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 set allow_system_table_mods = on;
 update gp_distribution_policy set numsegments = 1
@@ -2188,8 +2193,8 @@ INFO:  GPORCA failed to produce a plan, falling back to 
Postgres-based planner
 DETAIL:  Unknown error: Partially Distributed Data
                                              QUERY PLAN                        
                      
 
-----------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=1.23..1472.30 rows=86130 
width=8)
-   ->  Append  (cost=1.23..323.90 rows=28710 width=8)
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=1.23..1615.85 rows=86130 
width=8)
+   ->  Append  (cost=1.23..467.45 rows=28710 width=8)
          ->  Hash Join  (cost=1.23..2.80 rows=10 width=8)
                Hash Cond: (t2.b = t1.a)
                ->  Redistribute Motion 2:3  (slice2; segments: 2)  
(cost=0.00..1.40 rows=20 width=4)
@@ -2208,6 +2213,8 @@ INFO:  GPORCA failed to produce a plan, falling back to 
Postgres-based planner
 DETAIL:  Unknown error: Partially Distributed Data
  a  | b  | a  | b  
 ----+----+----+----
+  1 |  1 |  1 |  1
+  5 |  5 |  5 |  5
   2 |  2 |  2 |  2
   3 |  3 |  3 |  3
   4 |  4 |  4 |  4
@@ -2216,8 +2223,6 @@ DETAIL:  Unknown error: Partially Distributed Data
   8 |  8 |  8 |  8
   9 |  9 |  9 |  9
  10 | 10 | 10 | 10
-  1 |  1 |  1 |  1
-  5 |  5 |  5 |  5
 (10 rows)
 
 select union_schema.t1.a, union_schema.t2.b
@@ -2229,6 +2234,8 @@ INFO:  GPORCA failed to produce a plan, falling back to 
Postgres-based planner
 DETAIL:  Unknown error: Partially Distributed Data
  a  | b  
 ----+----
+  1 |  1
+  5 |  5
   2 |  2
   3 |  3
   4 |  4
@@ -2237,8 +2244,6 @@ DETAIL:  Unknown error: Partially Distributed Data
   8 |  8
   9 |  9
  10 | 10
-  1 |  1
-  5 |  5
 (10 rows)
 
 truncate union_schema.t1, union_schema.t2;
@@ -2276,8 +2281,8 @@ INFO:  GPORCA failed to produce a plan, falling back to 
Postgres-based planner
 DETAIL:  Unknown error: Partially Distributed Data
                                                 QUERY PLAN                     
                            
 
-----------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=1.32..1472.20 rows=86130 
width=8)
-   ->  Append  (cost=1.32..323.80 rows=28710 width=8)
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=1.32..1615.75 rows=86130 
width=8)
+   ->  Append  (cost=1.32..467.35 rows=28710 width=8)
          ->  Hash Join  (cost=1.32..2.70 rows=10 width=8)
                Hash Cond: (t1.a = t2.b)
                ->  Seq Scan on t1  (cost=0.00..1.20 rows=20 width=4)
@@ -2340,6 +2345,8 @@ reset allow_system_table_mods;
 create table rep (a int) distributed replicated;
 insert into rep select i from generate_series (1, 10) i; 
 create table dist (a int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
 insert into dist select i from generate_series (1, 1000) i;
 analyze dist;
 analyze rep;
@@ -2352,7 +2359,7 @@ explain select a from rep union all select a from dist;
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.03 rows=1010 
width=4)
    ->  Append  (cost=0.00..862.01 rows=337 width=4)
          ->  Result  (cost=0.00..431.00 rows=4 width=4)
-               One-Time Filter: (gp_execution_segment() = 2)
+               One-Time Filter: (gp_execution_segment() = 0)
                ->  Seq Scan on rep  (cost=0.00..431.00 rows=10 width=4)
          ->  Seq Scan on dist  (cost=0.00..431.01 rows=334 width=4)
  Optimizer: GPORCA
@@ -2368,12 +2375,12 @@ analyze rand;
 explain select i from generate_series(1,1000) i union all select a from rand;
                                        QUERY PLAN                              
         
 
----------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..431.28 rows=11000 
width=4)
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..431.29 rows=11000 
width=4)
    ->  Append  (cost=0.00..431.12 rows=3667 width=4)
          ->  Result  (cost=0.00..0.01 rows=334 width=4)
-               One-Time Filter: (gp_execution_segment() = 2)
+               One-Time Filter: (gp_execution_segment() = 0)
                ->  Function Scan on generate_series  (cost=0.00..0.00 rows=334 
width=4)
-         ->  Seq Scan on rand  (cost=0.00..431.06 rows=3334 width=4)
+         ->  Seq Scan on rand  (cost=0.00..431.07 rows=3334 width=4)
  Optimizer: GPORCA
 (7 rows)
 
@@ -2460,7 +2467,7 @@ DETAIL:  Falling back to Postgres-based planner because 
GPORCA does not support
          ->  Gather Motion 3:1  (slice2; segments: 3)
                ->  Subquery Scan on "*SELECT* 2"
                      ->  Seq Scan on p1_1240
- Optimizer: Postgres-based planner
+ Optimizer: Postgres query optimizer
 (11 rows)
 
 with result as (update r_1240 set a = a +1 where a < 5 returning *) select * 
from result except select * from p1_1240;
@@ -2475,6 +2482,43 @@ DETAIL:  Falling back to Postgres-based planner because 
GPORCA does not support
 drop table r_1240;
 drop table p1_1240;
 --
+-- Test CTAS with UNION ALL when branches have different typmods (issue #1431).
+-- ORCA should resolve the output column type to character varying (no length),
+-- same as the Postgres planner, instead of picking the first branch's typmod.
+--
+create table union_ctas_t1(id int, name varchar(1));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' 
as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table union_ctas_t2(id int, name varchar(2));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' 
as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into union_ctas_t1 values (1, 'a');
+insert into union_ctas_t2 values (1, 'ab');
+create table union_ctas_result as
+  (select id, name from union_ctas_t1)
+  union all
+  (select id, name from union_ctas_t2);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
+-- name column should be "character varying" without length, not varchar(1)
+select atttypmod from pg_attribute
+where attrelid = 'union_ctas_result'::regclass and attname = 'name';
+INFO:  GPORCA failed to produce a plan, falling back to Postgres-based planner
+DETAIL:  Falling back to Postgres-based planner because GPORCA does not 
support the following feature: Queries on master-only tables
+ atttypmod 
+-----------
+        -1
+(1 row)
+
+-- data should not be truncated
+select * from union_ctas_result order by name;
+ id | name 
+----+------
+  1 | a
+  1 | ab
+(2 rows)
+
+drop table union_ctas_t1, union_ctas_t2, union_ctas_result;
+--
 -- Clean up
 --
 DROP TABLE IF EXISTS T_a1 CASCADE;
diff --git a/src/test/regress/sql/union_gp.sql 
b/src/test/regress/sql/union_gp.sql
index e7cac952704..9e9e5c3a815 100644
--- a/src/test/regress/sql/union_gp.sql
+++ b/src/test/regress/sql/union_gp.sql
@@ -721,9 +721,32 @@ drop table r_1240;
 drop table p1_1240;
 
 --
--- Clean up
+-- Test CTAS with UNION ALL when branches have different typmods (issue #1431).
+-- ORCA should resolve the output column type to character varying (no length),
+-- same as the Postgres planner, instead of picking the first branch's typmod.
 --
+create table union_ctas_t1(id int, name varchar(1));
+create table union_ctas_t2(id int, name varchar(2));
+insert into union_ctas_t1 values (1, 'a');
+insert into union_ctas_t2 values (1, 'ab');
+
+create table union_ctas_result as
+  (select id, name from union_ctas_t1)
+  union all
+  (select id, name from union_ctas_t2);
+
+-- name column should be "character varying" without length, not varchar(1)
+select atttypmod from pg_attribute
+where attrelid = 'union_ctas_result'::regclass and attname = 'name';
+
+-- data should not be truncated
+select * from union_ctas_result order by name;
 
+drop table union_ctas_t1, union_ctas_t2, union_ctas_result;
+
+--
+-- Clean up
+--
 DROP TABLE IF EXISTS T_a1 CASCADE;
 DROP TABLE IF EXISTS T_b2 CASCADE;
 DROP TABLE IF EXISTS T_random CASCADE;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) branch main updated: Fix ORCA choosing wrong column type for CTAS with UNION ALL (#1431) (#1645)

Reply via email to