This is an automated email from the ASF dual-hosted git repository. yjhjstz pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 36ba03bfb67476ec98f58e999e784c0acd8bce32 Author: Chris Hajas <[email protected]> AuthorDate: Mon May 13 14:16:18 2024 -0700 Fix missing discard_output variable in shared scan node's functions (#17453) Commit 1b5be8c introduced the concept of discarding output, but forgot to add the node variable to the node's copy/out/read functions. This meant that this wasn't populated, and didn't give us the performance improvement we wanted. Now, we see the correct output in explain analyze where the output rows of the shared producer is 0 for Orca plans: ``` Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..24112.07 rows=1 width=8) (actual time=47805.782..47805.843 rows=1 loops=1) -> Sequence (cost=0.00..24112.07 rows=1 width=8) (actual time=47804.357..47804.359 rows=1 loops=1) -> Shared Scan (share slice:id 1:0) (cost=0.00..1657.02 rows=33333939 width=1) (actual time=0.000..6079.135 rows=0 loops=1) -> Seq Scan on foo (cost=0.00..1127.68 rows=33333939 width=8) (actual time=0.023..2617.554 rows=33335145 loops=1) ``` Note the shared scan outputting "rows=0" As a test case, the following went from ~6s to 3.8s on my laptop: ``` create table foo (a int, b int); insert into foo select i,i from generate_series(1,100000000)i; with cte1 as (select * from foo) select count(1) from (select * from cte1 limit 10) ab left join (select * from cte1 limit 10) ac on ab.a=ac.a; ``` --- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs_common.c | 1 + src/backend/nodes/readfast.c | 1 + src/test/regress/expected/gporca.out | 16 ++++++++++++++++ src/test/regress/expected/gporca_optimizer.out | 19 +++++++++++++++++++ src/test/regress/sql/gporca.sql | 5 +++++ 6 files changed, 43 insertions(+) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 1fecefc425..f0d1513147 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1239,6 +1239,7 @@ _copyShareInputScan(const ShareInputScan *from) COPY_SCALAR_FIELD(producer_slice_id); COPY_SCALAR_FIELD(this_slice_id); COPY_SCALAR_FIELD(nconsumers); + COPY_SCALAR_FIELD(discard_output); return newnode; } diff --git a/src/backend/nodes/outfuncs_common.c b/src/backend/nodes/outfuncs_common.c index 584029ec8a..a454c11da0 100644 --- a/src/backend/nodes/outfuncs_common.c +++ b/src/backend/nodes/outfuncs_common.c @@ -398,6 +398,7 @@ _outShareInputScan(StringInfo str, const ShareInputScan *node) WRITE_INT_FIELD(producer_slice_id); WRITE_INT_FIELD(this_slice_id); WRITE_INT_FIELD(nconsumers); + WRITE_BOOL_FIELD(discard_output); _outPlanInfo(str, (Plan *) node); } diff --git a/src/backend/nodes/readfast.c b/src/backend/nodes/readfast.c index d659263dcf..e82d84126b 100644 --- a/src/backend/nodes/readfast.c +++ b/src/backend/nodes/readfast.c @@ -1000,6 +1000,7 @@ _readShareInputScan(void) READ_INT_FIELD(producer_slice_id); READ_INT_FIELD(this_slice_id); READ_INT_FIELD(nconsumers); + READ_BOOL_FIELD(discard_output); ReadCommonPlan(&local_node->scan.plan); diff --git a/src/test/regress/expected/gporca.out b/src/test/regress/expected/gporca.out index dd7932e40b..8d2bcbf221 100644 --- a/src/test/regress/expected/gporca.out +++ b/src/test/regress/expected/gporca.out @@ -14637,3 +14637,19 @@ explain select * from baz where baz.a::bpchar='b' or baz.a='c'; (4 rows) drop table baz; +-- ensure shared scan producer returns 0 rows +create table cte_test(a int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into cte_test select i from generate_series(1,10)i; +analyze cte_test; +explain (analyze, costs off, summary off, timing off) with cte as (select * from cte_test) select * from cte union all select * from cte; + QUERY PLAN +--------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=20 loops=1) + -> Append (actual rows=10 loops=1) + -> Seq Scan on cte_test (actual rows=5 loops=1) + -> Seq Scan on cte_test cte_test_1 (actual rows=5 loops=1) + Optimizer: Postgres query optimizer +(5 rows) + diff --git a/src/test/regress/expected/gporca_optimizer.out b/src/test/regress/expected/gporca_optimizer.out index f53f05999a..ef9204fdb6 100644 --- a/src/test/regress/expected/gporca_optimizer.out +++ b/src/test/regress/expected/gporca_optimizer.out @@ -14757,3 +14757,22 @@ explain select * from baz where baz.a::bpchar='b' or baz.a='c'; (4 rows) drop table baz; +-- ensure shared scan producer returns 0 rows +create table cte_test(a int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into cte_test select i from generate_series(1,10)i; +analyze cte_test; +explain (analyze, costs off, summary off, timing off) with cte as (select * from cte_test) select * from cte union all select * from cte; + QUERY PLAN +---------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=20 loops=1) + -> Sequence (actual rows=10 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=0 loops=1) + -> Seq Scan on cte_test (actual rows=5 loops=1) + -> Append (actual rows=10 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=5 loops=1) + Optimizer: Pivotal Optimizer (GPORCA) +(8 rows) + diff --git a/src/test/regress/sql/gporca.sql b/src/test/regress/sql/gporca.sql index e02448a605..6f07c35543 100644 --- a/src/test/regress/sql/gporca.sql +++ b/src/test/regress/sql/gporca.sql @@ -3604,6 +3604,11 @@ create table baz ( a varchar); explain select * from baz where baz.a::bpchar='b' or baz.a='c'; drop table baz; +-- ensure shared scan producer returns 0 rows +create table cte_test(a int); +insert into cte_test select i from generate_series(1,10)i; +analyze cte_test; +explain (analyze, costs off, summary off, timing off) with cte as (select * from cte_test) select * from cte union all select * from cte; -- start_ignore DROP SCHEMA orca CASCADE; -- end_ignore --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
