Re: Backends stalled in 'startup' state

2023-01-17 Thread Ashwin Agrawal
On Tue, Jan 17, 2023 at 4:52 PM Ashwin Agrawal  wrote:

>
> We recently saw many backends (close to max_connection limit) get stalled
> in 'startup' in one of the production environments for Greenplum (fork of
> PostgreSQL). Tracing the reason, it was found all the tuples created by
> bootstrap (xmin=1) in pg_attribute were at super high block numbers (for
> example beyond 30,000). Tracing the reason for the backend startup stall
> exactly matched Tom's reasoning in [1]. Stalls became much longer in
> presence of sub-transaction overflow or presence of long running
> transactions as tuple visibility took longer. The thread ruled out the
> possibility of system catalog rows to be present in higher block numbers
> instead of in front for pg_attribute.
>
> This thread provides simple reproduction on the latest version of
> PostgreSQL and RCA for how bootstrap catalog entries can move to higher
> blocks and as a result cause stalls for backend starts. Simple fix to avoid
> the issue provided at the end.
>
> The cause is syncscan triggering during VACUUM FULL. VACUUM FULL rewrites
> the table by performing the seqscan as well. And
> heapam_relation_copy_for_cluster() conveys feel free to use syncscan. Hence
> logic to not start from block 0 instead some other block already in cache
> is possible and opens the possibility to move the bootstrap tuples to
> anywhere else in the table.
>
> --
> Repro
> --
> -- create database to play
> drop database if exists test;
> create database test;
> \c test
>
> -- function just to create many tables to increase pg_attribute size
> -- (ideally many column table might do the job more easily)
> CREATE OR REPLACE FUNCTION public.f(id integer)
>  RETURNS void
>  LANGUAGE plpgsql
>  STRICT
> AS $function$
> declare
>   sql text;
>   i int;
> begin
>   for i in id..id+ loop
> sql='create table if not exists tbl'||i||' (id int)';
> execute sql;
>   end loop;
> end;
> $function$;
>
> select f(1);
> select f(2);
> select f(3);
> select f(4);
>
> -- validate pg_attribute size is greater than 1/4 of shared_buffers
> -- for syncscan to triggger
> show shared_buffers;
> select pg_size_pretty(pg_relation_size('pg_attribute'));
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>
> -- perform seq scan of pg_attribute to page past bootstrapped tuples
> copy (select * from pg_attribute limit 2000) to '/tmp/p';
>
> -- this will internally use syncscan starting with block after bootstrap
> tuples
> -- and hence move bootstrap tuples last to higher block numbers
> vacuum full pg_attribute;
>
> --
> Sample run
> --
> show shared_buffers;
>  shared_buffers
> 
>  128MB
> (1 row)
>
> select pg_size_pretty(pg_relation_size('pg_attribute'));
>  pg_size_pretty
> 
>  40 MB
> (1 row)
>
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>  ctid  | xmin | attrelid |   attname
> ---+--+--+--
>  (0,1) |1 | 1255 | oid
>  (0,2) |1 | 1255 | proname
>  (0,3) |1 | 1255 | pronamespace
>  (0,4) |1 | 1255 | proowner
>  (0,5) |1 | 1255 | prolang
> (5 rows)
>
> copy (select * from pg_attribute limit 2000) to '/tmp/p';
> COPY 2000
> vacuum full pg_attribute;
> VACUUM
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>ctid| xmin | attrelid |   attname
> ---+--+--+--
>  (5115,14) |1 | 1255 | oid
>  (5115,15) |1 | 1255 | proname
>  (5115,16) |1 | 1255 | pronamespace
>  (5115,17) |1 | 1255 | proowner
>  (5115,18) |1 | 1255 | prolang
> (5 rows)
>
>
> Note:
> -- used logic causing the problem to fix it as well on the system :-)
> -- scan till block where bootstrap tuples are located
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
> -- now due to syncscan triggering it will pick the blocks with bootstrap
> tuples first and help to bring them back to front
> vacuum full pg_attribute;
>
> --
> Patch to avoid the problem:
> --
> diff --git a/src/backend/access/heap/heapam_handler.c
> b/src/backend/access/heap/heapam_handler.c
> index a3414a76e8..4c031914a3 100644
> --- a/src/backend/access/heap/heapam_handler.c
> +++ b/src/backend/access/heap/heapam_handler.c
> @@ -757,7 +757,17 @@ heapam_relation_copy_for_cluster(Relation OldHeap,
> Relation NewHeap,
> pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
>
>  PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
>
> -   tableScan = table_beginscan(OldHea

Re: Backends stalled in 'startup' state

2022-09-27 Thread Ashwin Agrawal
On Thu, Sep 15, 2022 at 4:42 PM Ashwin Agrawal  wrote:

>
> We recently saw many backends (close to max_connection limit) get stalled
> in 'startup' in one of the production environments for Greenplum (fork of
> PostgreSQL). Tracing the reason, it was found all the tuples created by
> bootstrap (xmin=1) in pg_attribute were at super high block numbers (for
> example beyond 30,000). Tracing the reason for the backend startup stall
> exactly matched Tom's reasoning in [1]. Stalls became much longer in
> presence of sub-transaction overflow or presence of long running
> transactions as tuple visibility took longer. The thread ruled out the
> possibility of system catalog rows to be present in higher block numbers
> instead of in front for pg_attribute.
>
> This thread provides simple reproduction on the latest version of
> PostgreSQL and RCA for how bootstrap catalog entries can move to higher
> blocks and as a result cause stalls for backend starts. Simple fix to avoid
> the issue provided at the end.
>
> The cause is syncscan triggering during VACUUM FULL. VACUUM FULL rewrites
> the table by performing the seqscan as well. And
> heapam_relation_copy_for_cluster() conveys feel free to use syncscan. Hence
> logic to not start from block 0 instead some other block already in cache
> is possible and opens the possibility to move the bootstrap tuples to
> anywhere else in the table.
>
> --
> Repro
> --
> -- create database to play
> drop database if exists test;
> create database test;
> \c test
>
> -- function just to create many tables to increase pg_attribute size
> -- (ideally many column table might do the job more easily)
> CREATE OR REPLACE FUNCTION public.f(id integer)
>  RETURNS void
>  LANGUAGE plpgsql
>  STRICT
> AS $function$
> declare
>   sql text;
>   i int;
> begin
>   for i in id..id+ loop
> sql='create table if not exists tbl'||i||' (id int)';
> execute sql;
>   end loop;
> end;
> $function$;
>
> select f(1);
> select f(2);
> select f(3);
> select f(4);
>
> -- validate pg_attribute size is greater than 1/4 of shared_buffers
> -- for syncscan to triggger
> show shared_buffers;
> select pg_size_pretty(pg_relation_size('pg_attribute'));
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>
> -- perform seq scan of pg_attribute to page past bootstrapped tuples
> copy (select * from pg_attribute limit 2000) to '/tmp/p';
>
> -- this will internally use syncscan starting with block after bootstrap
> tuples
> -- and hence move bootstrap tuples last to higher block numbers
> vacuum full pg_attribute;
>
> --
> Sample run
> --
> show shared_buffers;
>  shared_buffers
> 
>  128MB
> (1 row)
>
> select pg_size_pretty(pg_relation_size('pg_attribute'));
>  pg_size_pretty
> 
>  40 MB
> (1 row)
>
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>  ctid  | xmin | attrelid |   attname
> ---+--+--+--
>  (0,1) |1 | 1255 | oid
>  (0,2) |1 | 1255 | proname
>  (0,3) |1 | 1255 | pronamespace
>  (0,4) |1 | 1255 | proowner
>  (0,5) |1 | 1255 | prolang
> (5 rows)
>
> copy (select * from pg_attribute limit 2000) to '/tmp/p';
> COPY 2000
> vacuum full pg_attribute;
> VACUUM
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
>ctid| xmin | attrelid |   attname
> ---+--+--+--
>  (5115,14) |1 | 1255 | oid
>  (5115,15) |1 | 1255 | proname
>  (5115,16) |1 | 1255 | pronamespace
>  (5115,17) |1 | 1255 | proowner
>  (5115,18) |1 | 1255 | prolang
> (5 rows)
>
>
> Note:
> -- used logic causing the problem to fix it as well on the system :-)
> -- scan till block where bootstrap tuples are located
> select ctid, xmin, attrelid, attname from pg_attribute where xmin = 1
> limit 5;
> -- now due to syncscan triggering it will pick the blocks with bootstrap
> tuples first and help to bring them back to front
> vacuum full pg_attribute;
>
> --
> Patch to avoid the problem:
> --
> diff --git a/src/backend/access/heap/heapam_handler.c
> b/src/backend/access/heap/heapam_handler.c
> index a3414a76e8..4c031914a3 100644
> --- a/src/backend/access/heap/heapam_handler.c
> +++ b/src/backend/access/heap/heapam_handler.c
> @@ -757,7 +757,17 @@ heapam_relation_copy_for_cluster(Relation OldHeap,
> Relation NewHeap,
> pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
>
>  PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
>
> -   tableScan = table_beginscan(OldHea