This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 841aacdb43c7e23ea445d336bf5ed1937eafc6cd
Author: Ashwin Agrawal <[email protected]>
AuthorDate: Wed Jun 8 18:05:08 2022 -0700

    Avoid large replication lag due to FPI WAL records from hintbits
    
    SELECT or other read only operations can result in setting hintbits on
    pages, which in turn results in generating Full Page Images (FPI) WAL
    records. These operations are not throttled currently and hence can
    generate huge wal traffic on primary without waiting for mirror. This
    causes concurrent transactions to suffer from replication lag.
    
    One of scenario this was seen in production was during SELECT post
    restore of large heap table from backup. Ideally, these situations
    should be avoided by performing vacuum on such tables or via some
    other mechanisms. Though there is still possibility the best practice
    guidance is not followed and hence to still avoid the situation better
    to have throttling in writing the FPI code as well.
    
    Co-authored-by: Divyesh Vanjare <[email protected]>
---
 src/backend/access/transam/xloginsert.c            |   2 +
 src/backend/utils/misc/guc_gp.c                    |   8 +-
 .../expected/segwalrep/select_throttle.out         | 100 +++++++++++++++++++++
 src/test/isolation2/expected/setup.out             |   3 +
 src/test/isolation2/isolation2_schedule            |   1 +
 .../isolation2/sql/segwalrep/select_throttle.sql   |  58 ++++++++++++
 src/test/isolation2/sql/setup.sql                  |  12 +++
 7 files changed, 183 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xloginsert.c 
b/src/backend/access/transam/xloginsert.c
index 73b93fda91..98505ea675 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -1020,6 +1020,8 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
                XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, 
flags);
 
                recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
+
+               wait_to_avoid_large_repl_lag();
        }
 
        return recptr;
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index 99c75aba25..d68c9b291f 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -1505,10 +1505,16 @@ struct config_bool ConfigureNamesBool_gp[] =
 
        {
                {"gp_disable_tuple_hints", PGC_USERSET, DEVELOPER_OPTIONS,
-                       gettext_noop("Specify if reader should set hint bits on 
tuples."),
+                       gettext_noop("Specify if hint bits on tuples should be 
deferred."),
                        NULL,
                        GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
                },
+               /*
+                * If gp_disable_tuple_hints is off, always mark buffer dirty.
+                * If gp_disable_tuple_hints is on, defer marking the buffer 
dirty
+                * until after transaction is identified as old.
+                * (unless it is a catalog table) See: markDirty
+                */
                &gp_disable_tuple_hints,
                true,
                NULL, NULL, NULL
diff --git a/src/test/isolation2/expected/segwalrep/select_throttle.out 
b/src/test/isolation2/expected/segwalrep/select_throttle.out
new file mode 100644
index 0000000000..7e5ad17201
--- /dev/null
+++ b/src/test/isolation2/expected/segwalrep/select_throttle.out
@@ -0,0 +1,100 @@
+-- Test: SELECT or other read-only operations which set hint bits on pages,
+-- and in turn generate Full Page Images (FPI) WAL records should be throttled.
+-- We will only throttle when our transaction wal exceeds
+-- wait_for_replication_threshold
+-- For this test we will:
+-- 1. Set wait_for_replication_threshold to 1kB for quicker test
+-- 2. create two tables (one small and one large)
+-- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked 
dirty on hint bit change
+-- 4. Suspend walsender
+-- 5. Perform a read-only operation (SELECT) which would now set the hint bits
+--  For the small table this operation should finish,
+--  but for large table the SELECT should be throttled
+--  since it would generate a lot of WAL greater than 
wait_for_replication_threshold
+-- 6. Confirm that the query is waiting on Syncrep
+-- 7. Reset the walsender and the transaction should complete
+--
+
+-- set wait_for_replication_threshold to 1kB for quicker test
+ALTER SYSTEM SET wait_for_replication_threshold = 1;
+ALTER
+SELECT pg_reload_conf();
+ pg_reload_conf 
+----------------
+ t              
+(1 row)
+
+CREATE TABLE select_no_throttle(a int) DISTRIBUTED BY (a);
+CREATE
+INSERT INTO select_no_throttle SELECT generate_series (1, 10);
+INSERT 10
+CREATE TABLE select_throttle(a int) DISTRIBUTED BY (a);
+CREATE
+INSERT INTO select_throttle SELECT generate_series (1, 100000);
+INSERT 100000
+
+-- Enable tuple hints so that buffer will be marked dirty upon a hint bit 
change
+-- (so that we don't have to wait for the tuple to age. See logic in markDirty)
+1U: SET gp_disable_tuple_hints=off;
+SET
+
+-- flush the data to disk
+checkpoint;
+CHECKPOINT
+
+-- Suspend walsender
+SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM 
gp_segment_configuration WHERE role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+-- the following SELECTS will set the hint bit on (the buffer will be marked 
dirty)
+-- This query should not wait
+1U: SELECT count(*) FROM select_no_throttle;
+ count 
+-------
+ 1     
+(1 row)
+checkpoint;
+CHECKPOINT
+-- This query should wait for Syncrep since its WAL size for hint bits is 
greater than wait_for_replication_threshold
+1U&: SELECT count(*) FROM select_throttle;  <waiting ...>
+
+-- check if the above query is waiting on SyncRep in pg_stat_activity
+SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM 
select_throttle;');
+ is_query_waiting_for_syncrep 
+------------------------------
+ t                            
+(1 row)
+
+-- reset walsender
+SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM 
gp_segment_configuration WHERE role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+-- after this, system continue to proceed
+
+1U<:  <... completed>
+ count 
+-------
+ 33327 
+(1 row)
+
+SELECT wait_until_all_segments_synchronized();
+ wait_until_all_segments_synchronized 
+--------------------------------------
+ OK                                   
+(1 row)
+1U: RESET gp_disable_tuple_hints;
+RESET
+1Uq: ... <quitting>
+
+ALTER SYSTEM RESET wait_for_replication_threshold;
+ALTER
+SELECT pg_reload_conf();
+ pg_reload_conf 
+----------------
+ t              
+(1 row)
diff --git a/src/test/isolation2/expected/setup.out 
b/src/test/isolation2/expected/setup.out
index 5184e6d7cb..280470d408 100644
--- a/src/test/isolation2/expected/setup.out
+++ b/src/test/isolation2/expected/setup.out
@@ -89,6 +89,9 @@ CREATE
 create or replace function wait_until_all_segments_synchronized() returns text 
as $$ begin for i in 1..6000 loop if (select count(*) = 0 from 
gp_segment_configuration where content != -1 and mode != 's') then return 'OK'; 
/* in func */ end if; /* in func */ perform pg_sleep(0.1); /* in func */ 
perform gp_request_fts_probe_scan(); /* in func */ end loop; /* in func */ 
return 'Fail'; /* in func */ end; /* in func */ $$ language plpgsql;
 CREATE
 
+CREATE OR REPLACE FUNCTION is_query_waiting_for_syncrep(iterations int, 
check_query text) RETURNS bool AS $$ for i in range(iterations): results = 
plpy.execute("SELECT gp_execution_segment() AS content, query, wait_event\ FROM 
gp_dist_random('pg_stat_activity')\ WHERE gp_execution_segment() = 1 AND\ query 
= '%s' AND\ wait_event = 'SyncRep'" % check_query ) if results: return True 
return False $$ LANGUAGE plpython3u VOLATILE;
+CREATE
+
 create or replace function wait_for_replication_replay (segid int, retries 
int) returns bool as $$ declare i int; /* in func */ result bool; /* in func */ 
begin i := 0; /* in func */ -- Wait until the mirror/standby has replayed up to 
flush location loop SELECT flush_lsn = replay_lsn INTO result from 
gp_stat_replication where gp_segment_id = segid; /* in func */ if result then 
return true; /* in func */ end if; /* in func */ 
 if i >= retries then return false; /* in func */ end if; /* in func */ perform 
pg_sleep(0.1); /* in func */ perform pg_stat_clear_snapshot(); /* in func */ i 
:= i + 1; /* in func */ end loop; /* in func */ end; /* in func */ $$ language 
plpgsql;
 CREATE
diff --git a/src/test/isolation2/isolation2_schedule 
b/src/test/isolation2/isolation2_schedule
index d6847bd2f1..87f8f2aff7 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -234,6 +234,7 @@ test: segwalrep/twophase_tolerance_with_mirror_promotion
 test: segwalrep/dtm_recovery_on_standby
 test: segwalrep/commit_blocking_on_standby
 test: segwalrep/dtx_recovery_wait_lsn
+test: segwalrep/select_throttle
 test: fts_manual_probe
 test: fts_session_reset
 # unstable FTS test in different arch
diff --git a/src/test/isolation2/sql/segwalrep/select_throttle.sql 
b/src/test/isolation2/sql/segwalrep/select_throttle.sql
new file mode 100644
index 0000000000..6c613bb241
--- /dev/null
+++ b/src/test/isolation2/sql/segwalrep/select_throttle.sql
@@ -0,0 +1,58 @@
+-- Test: SELECT or other read-only operations which set hint bits on pages,
+-- and in turn generate Full Page Images (FPI) WAL records should be throttled.
+-- We will only throttle when our transaction wal exceeds
+-- wait_for_replication_threshold
+-- For this test we will:
+-- 1. Set wait_for_replication_threshold to 1kB for quicker test
+-- 2. create two tables (one small and one large)
+-- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked 
dirty on hint bit change
+-- 4. Suspend walsender
+-- 5. Perform a read-only operation (SELECT) which would now set the hint bits
+--  For the small table this operation should finish,
+--  but for large table the SELECT should be throttled
+--  since it would generate a lot of WAL greater than 
wait_for_replication_threshold
+-- 6. Confirm that the query is waiting on Syncrep
+-- 7. Reset the walsender and the transaction should complete
+--
+
+-- set wait_for_replication_threshold to 1kB for quicker test
+ALTER SYSTEM SET wait_for_replication_threshold = 1;
+SELECT pg_reload_conf();
+
+CREATE TABLE select_no_throttle(a int) DISTRIBUTED BY (a);
+INSERT INTO select_no_throttle SELECT generate_series (1, 10);
+CREATE TABLE select_throttle(a int) DISTRIBUTED BY (a);
+INSERT INTO select_throttle SELECT generate_series (1, 100000);
+
+-- Enable tuple hints so that buffer will be marked dirty upon a hint bit 
change
+-- (so that we don't have to wait for the tuple to age. See logic in markDirty)
+1U: SET gp_disable_tuple_hints=off;
+
+-- flush the data to disk
+checkpoint;
+
+-- Suspend walsender
+SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM 
gp_segment_configuration WHERE role = 'p' and content = 1;
+
+-- the following SELECTS will set the hint bit on (the buffer will be marked 
dirty)
+-- This query should not wait
+1U: SELECT count(*) FROM select_no_throttle;
+checkpoint;
+-- This query should wait for Syncrep since its WAL size for hint bits is 
greater than wait_for_replication_threshold
+1U&: SELECT count(*) FROM select_throttle;
+
+-- check if the above query is waiting on SyncRep in pg_stat_activity
+SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM 
select_throttle;');
+
+-- reset walsender
+SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM 
gp_segment_configuration WHERE role = 'p' and content = 1;
+-- after this, system continue to proceed
+
+1U<:
+
+SELECT wait_until_all_segments_synchronized();
+1U: RESET gp_disable_tuple_hints;
+1Uq:
+
+ALTER SYSTEM RESET wait_for_replication_threshold;
+SELECT pg_reload_conf();
diff --git a/src/test/isolation2/sql/setup.sql 
b/src/test/isolation2/sql/setup.sql
index fa81e3685c..5fcd90fc7f 100644
--- a/src/test/isolation2/sql/setup.sql
+++ b/src/test/isolation2/sql/setup.sql
@@ -256,6 +256,18 @@ begin
 end; /* in func */
 $$ language plpgsql;
 
+CREATE OR REPLACE FUNCTION is_query_waiting_for_syncrep(iterations int, 
check_query text) RETURNS bool AS $$
+    for i in range(iterations):
+        results = plpy.execute("SELECT gp_execution_segment() AS content, 
query, wait_event\
+                                FROM gp_dist_random('pg_stat_activity')\
+                                WHERE gp_execution_segment() = 1 AND\
+                                query = '%s' AND\
+                                wait_event = 'SyncRep'" % check_query )
+        if results:
+            return True
+    return False
+$$ LANGUAGE plpython3u VOLATILE;
+
 create or replace function wait_for_replication_replay (segid int, retries 
int) returns bool as
 $$
 declare


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to