Hi all,

   I recently run benchmark[1] on master, but I found performance problem
as below:

explain analyze select
  subq_0.c0 as c0,
  subq_0.c1 as c1,
  subq_0.c2 as c2
from
  (select
        ref_0.l_shipmode as c0,
        sample_0.l_orderkey as c1,
        sample_0.l_quantity as c2,
        ref_0.l_orderkey as c3,
        sample_0.l_shipmode as c5,
        ref_0.l_shipinstruct as c6
      from
        public.lineitem as ref_0
          left join public.lineitem as sample_0
          on ((select p_partkey from public.part order by p_partkey limit 1)
                 is not NULL)
      where sample_0.l_orderkey is NULL) as subq_0
where subq_0.c5 is NULL
limit 1;
                                                               QUERY PLAN

-----------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=78.00..45267050.75 rows=1 width=27) (actual
time=299695.097..299695.099 rows=0 loops=1)
   InitPlan 1 (returns $0)
     ->  Limit  (cost=78.00..78.00 rows=1 width=8) (actual
time=0.651..0.652 rows=1 loops=1)
           ->  Sort  (cost=78.00..83.00 rows=2000 width=8) (actual
time=0.650..0.651 rows=1 loops=1)
                 Sort Key: part.p_partkey
                 Sort Method: top-N heapsort  Memory: 25kB
                 ->  Seq Scan on part  (cost=0.00..68.00 rows=2000 width=8)
(actual time=0.013..0.428 rows=2000 loops=1)
   ->  Nested Loop Left Join  (cost=0.00..45266972.75 rows=1 width=27)
(actual time=299695.096..299695.096 rows=0 loops=1)
         Join Filter: ($0 IS NOT NULL)
         Filter: ((sample_0.l_orderkey IS NULL) AND (sample_0.l_shipmode IS
NULL))
         Rows Removed by Filter: 3621030625
         ->  Seq Scan on lineitem ref_0  (cost=0.00..1969.75 rows=60175
width=11) (actual time=0.026..6.225 rows=60175 loops=1)
         ->  Materialize  (cost=0.00..2270.62 rows=60175 width=27) (actual
time=0.000..2.554 rows=60175 loops=60175)
               ->  Seq Scan on lineitem sample_0  (cost=0.00..1969.75
rows=60175 width=27) (actual time=0.004..8.169 rows=60175 loops=1)
 Planning Time: 0.172 ms
 Execution Time: 299695.501 ms
(16 rows)

After I set enable_material to off, the same query run faster, as below:
set enable_material = off;
explain analyze  select
  subq_0.c0 as c0,
  subq_0.c1 as c1,
  subq_0.c2 as c2
from
  (select
        ref_0.l_shipmode as c0,
        sample_0.l_orderkey as c1,
        sample_0.l_quantity as c2,
        ref_0.l_orderkey as c3,
        sample_0.l_shipmode as c5,
        ref_0.l_shipinstruct as c6
      from
        public.lineitem as ref_0
          left join public.lineitem as sample_0
          on ((select p_partkey from public.part order by p_partkey limit 1)
                 is not NULL)
      where sample_0.l_orderkey is NULL) as subq_0
where subq_0.c5 is NULL
limit 1;
                                                                  QUERY
PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=1078.00..91026185.57 rows=1 width=27) (actual
time=192669.605..192670.425 rows=0 loops=1)
   InitPlan 1 (returns $0)
     ->  Limit  (cost=78.00..78.00 rows=1 width=8) (actual
time=0.662..0.663 rows=1 loops=1)
           ->  Sort  (cost=78.00..83.00 rows=2000 width=8) (actual
time=0.661..0.662 rows=1 loops=1)
                 Sort Key: part.p_partkey
                 Sort Method: top-N heapsort  Memory: 25kB
                 ->  Seq Scan on part  (cost=0.00..68.00 rows=2000 width=8)
(actual time=0.017..0.430 rows=2000 loops=1)
   ->  Gather  (cost=1000.00..91026107.57 rows=1 width=27) (actual
time=192669.604..192670.422 rows=0 loops=1)
         Workers Planned: 1
         Params Evaluated: $0
         Workers Launched: 1
         ->  Nested Loop Left Join  (cost=0.00..91025107.47 rows=1
width=27) (actual time=192588.143..192588.144 rows=0 loops=2)
               Join Filter: ($0 IS NOT NULL)
               Filter: ((sample_0.l_orderkey IS NULL) AND
(sample_0.l_shipmode IS NULL))
               Rows Removed by Filter: 1810515312
               ->  Parallel Seq Scan on lineitem ref_0  (cost=0.00..1721.97
rows=35397 width=11) (actual time=0.007..3.797 rows=30088 loops=2)
               ->  Seq Scan on lineitem sample_0  (cost=0.00..1969.75
rows=60175 width=27) (actual time=0.000..2.637 rows=60175 loops=60175)
 Planning Time: 0.174 ms
 Execution Time: 192670.458 ms
(19 rows)

I debug the code and find consider_parallel_nestloop() doesn't consider
materialized form of the cheapest inner path.
When enable_material = true,  we can see Material path won in first plan,
but Parallel Seq Scan node doesn't add as outer path, which because
in try_partial_nestloop_path() , the cost of nestloop wat computed using
seq scan path not material path.

[1] include test table schema and data, you can repeat above problem.

I try fix this problem in attached patch, and I found pg12.12 also had this
issue. Please review my patch, thanks!

[1] https://github.com/tenderwg/tpch_test
From 7b652382928cc80ab6fbe782feada9f0eab4c5a8 Mon Sep 17 00:00:00 2001
From: "tender.wang" <tender.w...@openpie.com>
Date: Tue, 5 Sep 2023 14:33:24 +0800
Subject: [PATCH] Parallel seq scan should consider materila inner path in
 nestloop case.

---
 src/backend/optimizer/path/joinpath.c         | 16 +++++++++++
 src/test/regress/expected/select_parallel.out | 27 +++++++++++++++++++
 src/test/regress/sql/select_parallel.sql      | 10 +++++++
 3 files changed, 53 insertions(+)

diff --git a/src/backend/optimizer/path/joinpath.c 
b/src/backend/optimizer/path/joinpath.c
index 821d282497..5a10fb7f4b 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -2004,10 +2004,22 @@ consider_parallel_nestloop(PlannerInfo *root,
 {
        JoinType        save_jointype = jointype;
        ListCell   *lc1;
+       Path *matpath = NULL;
+       Path *inner_cheapest_total = innerrel->cheapest_total_path;
 
        if (jointype == JOIN_UNIQUE_INNER)
                jointype = JOIN_INNER;
 
+       /*
+        * Consider materializing the cheapest inner path, unless
+        * enable_material is off or the path in question materializes its
+        * output anyway.
+        */
+       if (enable_material && inner_cheapest_total != NULL &&
+               !ExecMaterializesOutput(inner_cheapest_total->pathtype))
+               matpath = (Path *)
+                       create_material_path(innerrel, inner_cheapest_total);
+
        foreach(lc1, outerrel->partial_pathlist)
        {
                Path       *outerpath = (Path *) lfirst(lc1);
@@ -2064,6 +2076,10 @@ consider_parallel_nestloop(PlannerInfo *root,
                                try_partial_nestloop_path(root, joinrel, 
outerpath, mpath,
                                                                                
  pathkeys, jointype, extra);
                }
+               /* Also consider materialized form of the cheapest inner path */
+               if (matpath != NULL && matpath->parallel_safe)
+                       try_partial_nestloop_path(root, joinrel, outerpath, 
matpath,
+                                                                         
pathkeys, jointype, extra);
        }
 }
 
diff --git a/src/test/regress/expected/select_parallel.out 
b/src/test/regress/expected/select_parallel.out
index d88353d496..452a3aed07 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -844,6 +844,33 @@ select * from
 (12 rows)
 
 reset enable_material;
+-- test materialized form of the cheapest inner path
+set min_parallel_table_scan_size = '512kB';
+explain(costs off) select suq.two, suq.four
+from
+ (select t1.two, t2.four, t1.ten,t2.twenty
+ from tenk1 t1 left join tenk2 t2
+ on ((select q1 from int8_tbl order by q1 limit 1)is not null) where t2.ten is 
null) as suq;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Nested Loop Left Join
+   Join Filter: ($0 IS NOT NULL)
+   Filter: (t2.ten IS NULL)
+   InitPlan 1 (returns $0)
+     ->  Limit
+           ->  Sort
+                 Sort Key: int8_tbl.q1
+                 ->  Seq Scan on int8_tbl
+   ->  Gather
+         Workers Planned: 4
+         ->  Parallel Seq Scan on tenk1 t1
+   ->  Materialize
+         ->  Gather
+               Workers Planned: 2
+               ->  Parallel Seq Scan on tenk2 t2
+(15 rows)
+
+set min_parallel_table_scan_size = 0;
 reset enable_hashagg;
 -- check parallelized int8 aggregate (bug #14897)
 explain (costs off)
diff --git a/src/test/regress/sql/select_parallel.sql 
b/src/test/regress/sql/select_parallel.sql
index 80c914dc02..ded44a0c09 100644
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -312,6 +312,16 @@ select * from
 
 reset enable_material;
 
+-- test materialized form of the cheapest inner path
+set min_parallel_table_scan_size = '512kB';
+
+explain(costs off) select suq.two, suq.four
+from
+ (select t1.two, t2.four, t1.ten,t2.twenty
+ from tenk1 t1 left join tenk2 t2
+ on ((select q1 from int8_tbl order by q1 limit 1)is not null) where t2.ten is 
null) as suq;
+
+set min_parallel_table_scan_size = 0;
 reset enable_hashagg;
 
 -- check parallelized int8 aggregate (bug #14897)
-- 
2.25.1

Reply via email to