This is an automated email from the ASF dual-hosted git repository.
avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new 5cbac92d378 Fix duplicate distribution keys from subqueries.
5cbac92d378 is described below
commit 5cbac92d37881aabdd244edfb5c1ee9fbec60756
Author: Zhang Mingli <[email protected]>
AuthorDate: Mon Oct 20 10:52:06 2025 +0800
Fix duplicate distribution keys from subqueries.
The parser normally prevents duplicate distribution keys in main query
grammar, but they can still originate from subqueries - particularly in
window function PARTITION BY clauses with equivalent expressions.
When duplicate distribution keys are detected, we now properly fall back
to alternative processing. The caller already handles this fallback
scenario correctly; this fix ensures the fallback is triggered when
needed.
---
.../test/regress/expected/create_table_distpol.out | 2 --
src/backend/cdb/cdbllize.c | 19 +++++++++++++++----
src/test/regress/expected/create_table_distpol.out | 2 --
src/test/regress/expected/gp_create_table.out | 5 +++++
src/test/regress/sql/gp_create_table.sql | 4 ++++
5 files changed, 24 insertions(+), 8 deletions(-)
diff --git
a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
index 545d14625f4..46479ca2126 100644
--- a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
+++ b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
@@ -128,9 +128,7 @@ select distkey from gp_distribution_policy where localoid =
'distpol_person_copy
RESET gp_create_table_random_default_distribution;
-- Test duplicate distribute keys
CREATE TABLE ctas_dup_dk as SELECT distinct age as c1, age as c2 from
distpol_person;
-ERROR: duplicate DISTRIBUTED BY column 'c1'
SELECT distinct age c1, age c2 into ctas_dup_dk_1 from distpol_person;
-ERROR: duplicate DISTRIBUTED BY column 'c1'
--
-- Test deriving distribution key from the query's distribution in
-- CREATE TABLE AS
diff --git a/src/backend/cdb/cdbllize.c b/src/backend/cdb/cdbllize.c
index a4037c36158..da702806be5 100644
--- a/src/backend/cdb/cdbllize.c
+++ b/src/backend/cdb/cdbllize.c
@@ -161,6 +161,7 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path
*path)
ListCell *dk_cell;
ListCell *ec_cell;
ListCell *em_cell;
+ bool duplicate_keys_detected = false;
/*
* Is it a Hashed distribution?
@@ -184,6 +185,9 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path
*path)
DistributionKey *dk = lfirst(dk_cell);
bool found_expr = false;
+ if (duplicate_keys_detected)
+ return NULL;
+
foreach(ec_cell, dk->dk_eclasses)
{
EquivalenceClass *ec = lfirst(ec_cell);
@@ -230,10 +234,17 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path
*path)
Assert(list_length(policykeys)
< MaxPolicyAttributeNumber);
if (list_member_int(policykeys,
attno))
- ereport(ERROR,
-
(errcode(ERRCODE_DUPLICATE_COLUMN),
-
errmsg("duplicate DISTRIBUTED BY column '%s'",
-
target->resname ? target->resname : "???")));
+ {
+ /*
+ * Although the parser
prevents duplicate distribution keys in regular grammar,
+ * they can still occur
in subqueries(e.g., window function PARTITION BY columns
+ * with equivalent
expressions).
+ * We fall back to an
alternative approach when duplicate distribution keys are detected;
+ * the caller handles
this scenario appropriately.
+ */
+ duplicate_keys_detected
= true;
+ break;
+ }
/*
* We know the btree operator
family corresponding to
diff --git a/src/test/regress/expected/create_table_distpol.out
b/src/test/regress/expected/create_table_distpol.out
index 545d14625f4..46479ca2126 100644
--- a/src/test/regress/expected/create_table_distpol.out
+++ b/src/test/regress/expected/create_table_distpol.out
@@ -128,9 +128,7 @@ select distkey from gp_distribution_policy where localoid =
'distpol_person_copy
RESET gp_create_table_random_default_distribution;
-- Test duplicate distribute keys
CREATE TABLE ctas_dup_dk as SELECT distinct age as c1, age as c2 from
distpol_person;
-ERROR: duplicate DISTRIBUTED BY column 'c1'
SELECT distinct age c1, age c2 into ctas_dup_dk_1 from distpol_person;
-ERROR: duplicate DISTRIBUTED BY column 'c1'
--
-- Test deriving distribution key from the query's distribution in
-- CREATE TABLE AS
diff --git a/src/test/regress/expected/gp_create_table.out
b/src/test/regress/expected/gp_create_table.out
index 88b126e4e65..8a26a9a7e33 100644
--- a/src/test/regress/expected/gp_create_table.out
+++ b/src/test/regress/expected/gp_create_table.out
@@ -202,3 +202,8 @@ c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int,c8 int,c9
int,c10 int,c11 int,c
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,c44,c45,c46,c47,c48,c49,c50,c51,c52,c53,c54,c55,c56,c57,c58,c59,c60,c61,c62,c63,c64,c65,c66,c67,c68,c69,c70,c71,c72,c73,c74,c75,c76,c77,c78,c79,c80,c81,c82,c83,c84,c85,c86,c87,c88,c89,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100,c101,c102,c103,c104,c105,c106,c107,c108,c109,c110,c111,c112,c113,c114,c115,c116,c117,c118,c119,c120,
[...]
);
ERROR: tables can have at most 1600 columns
+create table dup_key_t(a int, b int, c int) distributed randomly;
+create temp table ctas_dump_key_t as select rn, a, b from
+ (select a, b, row_number() over (partition by a, b order by c) rn from
dup_key_t where dup_key_t.a = dup_key_t.b) x;
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named
'rn' as the Apache Cloudberry data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make
sure column(s) chosen are the optimal data distribution key to minimize skew.
diff --git a/src/test/regress/sql/gp_create_table.sql
b/src/test/regress/sql/gp_create_table.sql
index 503ccfa43e1..5b39b216d34 100644
--- a/src/test/regress/sql/gp_create_table.sql
+++ b/src/test/regress/sql/gp_create_table.sql
@@ -129,3 +129,7 @@ c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int,c8 int,c9
int,c10 int,c11 int,c
(
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,c44,c45,c46,c47,c48,c49,c50,c51,c52,c53,c54,c55,c56,c57,c58,c59,c60,c61,c62,c63,c64,c65,c66,c67,c68,c69,c70,c71,c72,c73,c74,c75,c76,c77,c78,c79,c80,c81,c82,c83,c84,c85,c86,c87,c88,c89,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100,c101,c102,c103,c104,c105,c106,c107,c108,c109,c110,c111,c112,c113,c114,c115,c116,c117,c118,c119,c120,
[...]
);
+
+create table dup_key_t(a int, b int, c int) distributed randomly;
+create temp table ctas_dump_key_t as select rn, a, b from
+ (select a, b, row_number() over (partition by a, b order by c) rn from
dup_key_t where dup_key_t.a = dup_key_t.b) x;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]