This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 841aacdb43c7e23ea445d336bf5ed1937eafc6cd Author: Ashwin Agrawal <[email protected]> AuthorDate: Wed Jun 8 18:05:08 2022 -0700 Avoid large replication lag due to FPI WAL records from hintbits SELECT or other read only operations can result in setting hintbits on pages, which in turn results in generating Full Page Images (FPI) WAL records. These operations are not throttled currently and hence can generate huge wal traffic on primary without waiting for mirror. This causes concurrent transactions to suffer from replication lag. One of scenario this was seen in production was during SELECT post restore of large heap table from backup. Ideally, these situations should be avoided by performing vacuum on such tables or via some other mechanisms. Though there is still possibility the best practice guidance is not followed and hence to still avoid the situation better to have throttling in writing the FPI code as well. Co-authored-by: Divyesh Vanjare <[email protected]> --- src/backend/access/transam/xloginsert.c | 2 + src/backend/utils/misc/guc_gp.c | 8 +- .../expected/segwalrep/select_throttle.out | 100 +++++++++++++++++++++ src/test/isolation2/expected/setup.out | 3 + src/test/isolation2/isolation2_schedule | 1 + .../isolation2/sql/segwalrep/select_throttle.sql | 58 ++++++++++++ src/test/isolation2/sql/setup.sql | 12 +++ 7 files changed, 183 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 73b93fda91..98505ea675 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1020,6 +1020,8 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); + + wait_to_avoid_large_repl_lag(); } return recptr; diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 99c75aba25..d68c9b291f 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -1505,10 +1505,16 @@ struct config_bool ConfigureNamesBool_gp[] = { {"gp_disable_tuple_hints", PGC_USERSET, DEVELOPER_OPTIONS, - gettext_noop("Specify if reader should set hint bits on tuples."), + gettext_noop("Specify if hint bits on tuples should be deferred."), NULL, GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE }, + /* + * If gp_disable_tuple_hints is off, always mark buffer dirty. + * If gp_disable_tuple_hints is on, defer marking the buffer dirty + * until after transaction is identified as old. + * (unless it is a catalog table) See: markDirty + */ &gp_disable_tuple_hints, true, NULL, NULL, NULL diff --git a/src/test/isolation2/expected/segwalrep/select_throttle.out b/src/test/isolation2/expected/segwalrep/select_throttle.out new file mode 100644 index 0000000000..7e5ad17201 --- /dev/null +++ b/src/test/isolation2/expected/segwalrep/select_throttle.out @@ -0,0 +1,100 @@ +-- Test: SELECT or other read-only operations which set hint bits on pages, +-- and in turn generate Full Page Images (FPI) WAL records should be throttled. +-- We will only throttle when our transaction wal exceeds +-- wait_for_replication_threshold +-- For this test we will: +-- 1. Set wait_for_replication_threshold to 1kB for quicker test +-- 2. create two tables (one small and one large) +-- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked dirty on hint bit change +-- 4. Suspend walsender +-- 5. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the small table this operation should finish, +-- but for large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 6. Confirm that the query is waiting on Syncrep +-- 7. Reset the walsender and the transaction should complete +-- + +-- set wait_for_replication_threshold to 1kB for quicker test +ALTER SYSTEM SET wait_for_replication_threshold = 1; +ALTER +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +CREATE TABLE select_no_throttle(a int) DISTRIBUTED BY (a); +CREATE +INSERT INTO select_no_throttle SELECT generate_series (1, 10); +INSERT 10 +CREATE TABLE select_throttle(a int) DISTRIBUTED BY (a); +CREATE +INSERT INTO select_throttle SELECT generate_series (1, 100000); +INSERT 100000 + +-- Enable tuple hints so that buffer will be marked dirty upon a hint bit change +-- (so that we don't have to wait for the tuple to age. See logic in markDirty) +1U: SET gp_disable_tuple_hints=off; +SET + +-- flush the data to disk +checkpoint; +CHECKPOINT + +-- Suspend walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- the following SELECTS will set the hint bit on (the buffer will be marked dirty) +-- This query should not wait +1U: SELECT count(*) FROM select_no_throttle; + count +------- + 1 +(1 row) +checkpoint; +CHECKPOINT +-- This query should wait for Syncrep since its WAL size for hint bits is greater than wait_for_replication_threshold +1U&: SELECT count(*) FROM select_throttle; <waiting ...> + +-- check if the above query is waiting on SyncRep in pg_stat_activity +SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM select_throttle;'); + is_query_waiting_for_syncrep +------------------------------ + t +(1 row) + +-- reset walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +-- after this, system continue to proceed + +1U<: <... completed> + count +------- + 33327 +(1 row) + +SELECT wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) +1U: RESET gp_disable_tuple_hints; +RESET +1Uq: ... <quitting> + +ALTER SYSTEM RESET wait_for_replication_threshold; +ALTER +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) diff --git a/src/test/isolation2/expected/setup.out b/src/test/isolation2/expected/setup.out index 5184e6d7cb..280470d408 100644 --- a/src/test/isolation2/expected/setup.out +++ b/src/test/isolation2/expected/setup.out @@ -89,6 +89,9 @@ CREATE create or replace function wait_until_all_segments_synchronized() returns text as $$ begin for i in 1..6000 loop if (select count(*) = 0 from gp_segment_configuration where content != -1 and mode != 's') then return 'OK'; /* in func */ end if; /* in func */ perform pg_sleep(0.1); /* in func */ perform gp_request_fts_probe_scan(); /* in func */ end loop; /* in func */ return 'Fail'; /* in func */ end; /* in func */ $$ language plpgsql; CREATE +CREATE OR REPLACE FUNCTION is_query_waiting_for_syncrep(iterations int, check_query text) RETURNS bool AS $$ for i in range(iterations): results = plpy.execute("SELECT gp_execution_segment() AS content, query, wait_event\ FROM gp_dist_random('pg_stat_activity')\ WHERE gp_execution_segment() = 1 AND\ query = '%s' AND\ wait_event = 'SyncRep'" % check_query ) if results: return True return False $$ LANGUAGE plpython3u VOLATILE; +CREATE + create or replace function wait_for_replication_replay (segid int, retries int) returns bool as $$ declare i int; /* in func */ result bool; /* in func */ begin i := 0; /* in func */ -- Wait until the mirror/standby has replayed up to flush location loop SELECT flush_lsn = replay_lsn INTO result from gp_stat_replication where gp_segment_id = segid; /* in func */ if result then return true; /* in func */ end if; /* in func */ if i >= retries then return false; /* in func */ end if; /* in func */ perform pg_sleep(0.1); /* in func */ perform pg_stat_clear_snapshot(); /* in func */ i := i + 1; /* in func */ end loop; /* in func */ end; /* in func */ $$ language plpgsql; CREATE diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule index d6847bd2f1..87f8f2aff7 100644 --- a/src/test/isolation2/isolation2_schedule +++ b/src/test/isolation2/isolation2_schedule @@ -234,6 +234,7 @@ test: segwalrep/twophase_tolerance_with_mirror_promotion test: segwalrep/dtm_recovery_on_standby test: segwalrep/commit_blocking_on_standby test: segwalrep/dtx_recovery_wait_lsn +test: segwalrep/select_throttle test: fts_manual_probe test: fts_session_reset # unstable FTS test in different arch diff --git a/src/test/isolation2/sql/segwalrep/select_throttle.sql b/src/test/isolation2/sql/segwalrep/select_throttle.sql new file mode 100644 index 0000000000..6c613bb241 --- /dev/null +++ b/src/test/isolation2/sql/segwalrep/select_throttle.sql @@ -0,0 +1,58 @@ +-- Test: SELECT or other read-only operations which set hint bits on pages, +-- and in turn generate Full Page Images (FPI) WAL records should be throttled. +-- We will only throttle when our transaction wal exceeds +-- wait_for_replication_threshold +-- For this test we will: +-- 1. Set wait_for_replication_threshold to 1kB for quicker test +-- 2. create two tables (one small and one large) +-- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked dirty on hint bit change +-- 4. Suspend walsender +-- 5. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the small table this operation should finish, +-- but for large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 6. Confirm that the query is waiting on Syncrep +-- 7. Reset the walsender and the transaction should complete +-- + +-- set wait_for_replication_threshold to 1kB for quicker test +ALTER SYSTEM SET wait_for_replication_threshold = 1; +SELECT pg_reload_conf(); + +CREATE TABLE select_no_throttle(a int) DISTRIBUTED BY (a); +INSERT INTO select_no_throttle SELECT generate_series (1, 10); +CREATE TABLE select_throttle(a int) DISTRIBUTED BY (a); +INSERT INTO select_throttle SELECT generate_series (1, 100000); + +-- Enable tuple hints so that buffer will be marked dirty upon a hint bit change +-- (so that we don't have to wait for the tuple to age. See logic in markDirty) +1U: SET gp_disable_tuple_hints=off; + +-- flush the data to disk +checkpoint; + +-- Suspend walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + +-- the following SELECTS will set the hint bit on (the buffer will be marked dirty) +-- This query should not wait +1U: SELECT count(*) FROM select_no_throttle; +checkpoint; +-- This query should wait for Syncrep since its WAL size for hint bits is greater than wait_for_replication_threshold +1U&: SELECT count(*) FROM select_throttle; + +-- check if the above query is waiting on SyncRep in pg_stat_activity +SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM select_throttle;'); + +-- reset walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; +-- after this, system continue to proceed + +1U<: + +SELECT wait_until_all_segments_synchronized(); +1U: RESET gp_disable_tuple_hints; +1Uq: + +ALTER SYSTEM RESET wait_for_replication_threshold; +SELECT pg_reload_conf(); diff --git a/src/test/isolation2/sql/setup.sql b/src/test/isolation2/sql/setup.sql index fa81e3685c..5fcd90fc7f 100644 --- a/src/test/isolation2/sql/setup.sql +++ b/src/test/isolation2/sql/setup.sql @@ -256,6 +256,18 @@ begin end; /* in func */ $$ language plpgsql; +CREATE OR REPLACE FUNCTION is_query_waiting_for_syncrep(iterations int, check_query text) RETURNS bool AS $$ + for i in range(iterations): + results = plpy.execute("SELECT gp_execution_segment() AS content, query, wait_event\ + FROM gp_dist_random('pg_stat_activity')\ + WHERE gp_execution_segment() = 1 AND\ + query = '%s' AND\ + wait_event = 'SyncRep'" % check_query ) + if results: + return True + return False +$$ LANGUAGE plpython3u VOLATILE; + create or replace function wait_for_replication_replay (segid int, retries int) returns bool as $$ declare --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
