Hello.

> I have tried such an approach but looks like it is not effective,
> probably because of CPU caching issues.
It was a mistake on my part. I have repeated the approach and got good
results with a small, non-invasive patch. The main idea is a simple
optimistic optimization: store the offset to the next valid entry, so
that in most cases we can skip over all the gaps in a single step. Of
course, it adds some overhead for workloads without long (multi-second)
transactions, but that overhead is almost undetectable (thanks to CPU
caches). A minimal standalone sketch of the idea follows this message.

* TEST

The following test setup was used:

max_connections=5000 (taken from a real RDS installation)

pgbench -i -s 10 -U postgres -d postgres

# emulate a typical OLTP load on the primary
pgbench -b simple-update -j 1 -c 50 -n -P 1 -T 18000 -U postgres postgres

# emulate a typical cache load on the replica
pgbench -b select-only -p 6543 -j 1 -c 50 -n -P 1 -T 18000 -U postgres postgres

# emulate some typical long transactions of up to 60 seconds on the primary
echo "\set t random(0, 60)
BEGIN;
select txid_current();
select pg_sleep(:t);
COMMIT;" > slow_transactions.bench

pgbench -f /home/nkey/pg/slow_transactions.bench -p 5432 -j 1 -c 10 -n -P 1 -T 18000 -U postgres postgres

* RESULTS

*REL_13_STABLE* - 23.02% vs 0.76%

non-patched:
    23.02%  postgres  [.] KnownAssignedXidsGetAndSetXmin
     2.56%  postgres  [.] base_yyparse
     2.15%  postgres  [.] AllocSetAlloc
     1.68%  postgres  [.] MemoryContextAllocZeroAligned
     1.51%  postgres  [.] hash_search_with_hash_value
     1.26%  postgres  [.] SearchCatCacheInternal
     1.03%  postgres  [.] hash_bytes
     0.92%  postgres  [.] pg_checksum_block
     0.89%  postgres  [.] expression_tree_walker
     0.81%  postgres  [.] core_yylex
     0.69%  postgres  [.] palloc
     0.68%  [kernel]  [k] do_syscall_64
     0.59%  postgres  [.] _bt_compare
     0.54%  postgres  [.] new_list

patched:
     3.09%  postgres  [.] base_yyparse
     3.00%  postgres  [.] AllocSetAlloc
     2.01%  postgres  [.] MemoryContextAllocZeroAligned
     1.89%  postgres  [.] SearchCatCacheInternal
     1.80%  postgres  [.] hash_search_with_hash_value
     1.27%  postgres  [.] expression_tree_walker
     1.27%  postgres  [.] pg_checksum_block
     1.18%  postgres  [.] hash_bytes
     1.10%  postgres  [.] core_yylex
     0.96%  [kernel]  [k] do_syscall_64
     0.86%  postgres  [.] palloc
     0.84%  postgres  [.] _bt_compare
     0.79%  postgres  [.] new_list
     0.76%  postgres  [.] KnownAssignedXidsGetAndSetXmin

*MASTER* - 6.16% vs ~0% (includes the snapshot scalability optimization by Andres Freund (1))

non-patched:
     6.16%  postgres  [.] KnownAssignedXidsGetAndSetXmin
     3.05%  postgres  [.] AllocSetAlloc
     2.59%  postgres  [.] base_yyparse
     1.95%  postgres  [.] hash_search_with_hash_value
     1.87%  postgres  [.] MemoryContextAllocZeroAligned
     1.85%  postgres  [.] SearchCatCacheInternal
     1.27%  postgres  [.] hash_bytes
     1.16%  postgres  [.] expression_tree_walker
     1.06%  postgres  [.] core_yylex
     0.94%  [kernel]  [k] do_syscall_64

patched:
     3.35%  postgres  [.] base_yyparse
     2.84%  postgres  [.] AllocSetAlloc
     1.89%  postgres  [.] hash_search_with_hash_value
     1.82%  postgres  [.] MemoryContextAllocZeroAligned
     1.79%  postgres  [.] SearchCatCacheInternal
     1.49%  postgres  [.] pg_checksum_block
     1.26%  postgres  [.] hash_bytes
     1.26%  postgres  [.] expression_tree_walker
     1.08%  postgres  [.] core_yylex
     1.04%  [kernel]  [k] do_syscall_64
     0.81%  postgres  [.] palloc

It looks like it is possible to get a significant TPS increase on a very
typical standby workload. Currently, I have no environment to measure
TPS accurately - could you please try it on yours?

I have attached two versions of the patch - one for master and one for
REL_13_STABLE. I am also going to add the patch to the commitfest (2).

Thanks,
Michail.

(1): https://commitfest.postgresql.org/29/2500/
(2): https://commitfest.postgresql.org/34/3271/
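Before the patches themselves, here is a minimal single-threaded sketch of
the skip-hint idea, in case it helps review. It is illustrative only: the
names (xids, valid, next_offset, scan_and_update_hints) are invented for
this example, while the real patch keeps the hints in shared memory as
pg_atomic_uint32 so that concurrent readers can refresh them.

/*
 * Illustrative, single-threaded model of the "offset to next valid entry"
 * optimization. All names here are invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define N 16

static uint32_t xids[N];        /* sorted entries, like KnownAssignedXids */
static bool     valid[N];       /* gap markers, like KnownAssignedXidsValid */
static uint32_t next_offset[N]; /* hint: distance to next possibly-valid slot */

/* Scan [tail, head), collecting valid entries and refreshing the hints. */
static int
scan_and_update_hints(int tail, int head, uint32_t *out)
{
    int count = 0;
    int prev = tail;
    int i;

    for (i = tail; i < head; i += next_offset[i])
    {
        if (!valid[i])
            continue;           /* stale hint: step through the gap */

        /*
         * Record the distance we had to walk from the previous valid
         * entry, so the next scan can jump the whole gap in one step.
         */
        if (prev != i)
        {
            uint32_t offset = (uint32_t) (i - prev);

            /* Write only when the hint changes, to avoid dirtying the
             * cache line (the patch does the same). */
            if (next_offset[prev] != offset)
                next_offset[prev] = offset;
            prev = i;
        }

        out[count++] = xids[i];
    }
    return count;
}

int
main(void)
{
    uint32_t out[N];
    int      i, n;

    /* Start with a full array: every hint points at the next slot. */
    for (i = 0; i < N; i++)
    {
        xids[i] = 100 + i;
        valid[i] = true;
        next_offset[i] = 1;
    }

    /* Invalidate a run of entries, e.g. completed subtransactions. */
    for (i = 2; i < 10; i++)
        valid[i] = false;

    /* The first scan steps through the gap and repairs the hints. */
    n = scan_and_update_hints(0, N, out);
    printf("first scan: %d valid, hint at slot 1 = %u\n", n, next_offset[1]);

    /* The second scan jumps from slot 1 straight to slot 10. */
    n = scan_and_update_hints(0, N, out);
    printf("second scan: %d valid\n", n);
    return 0;
}

In this simplified model a hint is only ever set to the observed distance
between two entries that were valid at scan time, so a stale hint can only
make the scan visit slots already marked invalid; it never skips a valid
one, since reused slots have their hints reset to 1 (as the patch does in
KnownAssignedXidsCompress and KnownAssignedXidsAdd).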
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index c7816fcfb3..27486c096b 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -267,6 +267,7 @@ static PGPROC *allProcs;
  */
 static TransactionId *KnownAssignedXids;
 static bool *KnownAssignedXidsValid;
+static pg_atomic_uint32 *KnownAssignedXidsNext;
 static TransactionId latestObservedXid = InvalidTransactionId;
 
 /*
@@ -405,6 +406,7 @@ void
 CreateSharedProcArray(void)
 {
     bool        found;
+    int         i;
 
     /* Create or attach to the ProcArray shared structure */
     procArray = (ProcArrayStruct *)
@@ -446,6 +448,12 @@ CreateSharedProcArray(void)
             ShmemInitStruct("KnownAssignedXidsValid",
                             mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
                             &found);
+        KnownAssignedXidsNext = (pg_atomic_uint32 *)
+            ShmemInitStruct("KnownAssignedXidsNext",
+                            mul_size(sizeof(pg_atomic_uint32), TOTAL_MAX_CACHED_SUBXIDS),
+                            &found);
+        for (i = 0; i < TOTAL_MAX_CACHED_SUBXIDS; i++)
+            pg_atomic_init_u32(&KnownAssignedXidsNext[i], 1);
     }
 }
 
@@ -4598,12 +4606,13 @@ KnownAssignedXidsCompress(bool force)
      * re-aligning data to 0th element.
      */
     compress_index = 0;
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {
             KnownAssignedXids[compress_index] = KnownAssignedXids[i];
             KnownAssignedXidsValid[compress_index] = true;
+            pg_atomic_write_u32(&KnownAssignedXidsNext[compress_index], 1);
             compress_index++;
         }
     }
@@ -4706,6 +4715,7 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
     {
         KnownAssignedXids[head] = next_xid;
         KnownAssignedXidsValid[head] = true;
+        pg_atomic_write_u32(&KnownAssignedXidsNext[head], 1);
         TransactionIdAdvance(next_xid);
         head++;
     }
@@ -4921,7 +4931,7 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid)
     tail = pArray->tailKnownAssignedXids;
     head = pArray->headKnownAssignedXids;
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {
@@ -4944,7 +4954,7 @@
     /*
      * Advance the tail pointer if we've marked the tail item invalid.
      */
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
             break;
@@ -4994,7 +5004,8 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
     int         count = 0;
     int         head,
                 tail;
-    int         i;
+    int         i,
+                prev;
 
     /*
      * Fetch head just once, since it may change while we loop.  We can stop
@@ -5008,9 +5019,10 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
     SpinLockAcquire(&procArray->known_assigned_xids_lck);
     tail = procArray->tailKnownAssignedXids;
     head = procArray->headKnownAssignedXids;
+    prev = tail;
     SpinLockRelease(&procArray->known_assigned_xids_lck);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         /* Skip any gaps in the array */
         if (KnownAssignedXidsValid[i])
@@ -5033,6 +5045,14 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
                 TransactionIdFollowsOrEquals(knownXid, xmax))
                 break;
 
+            if (prev != i)
+            {
+                uint32 offset = i - prev;
+                if (pg_atomic_read_u32(&KnownAssignedXidsNext[prev]) != offset)
+                    pg_atomic_write_u32(&KnownAssignedXidsNext[prev], offset);
+                prev = i;
+            }
+
             /* Add knownXid into output array */
             xarray[count++] = knownXid;
         }
@@ -5060,7 +5080,7 @@ KnownAssignedXidsGetOldestXmin(void)
     head = procArray->headKnownAssignedXids;
     SpinLockRelease(&procArray->known_assigned_xids_lck);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         /* Skip any gaps in the array */
         if (KnownAssignedXidsValid[i])
@@ -5095,7 +5115,7 @@ KnownAssignedXidsDisplay(int trace_level)
 
     initStringInfo(&buf);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 3447481b16..fd600944b6 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -109,6 +109,7 @@ static PGXACT *allPgXact;
  */
 static TransactionId *KnownAssignedXids;
 static bool *KnownAssignedXidsValid;
+static pg_atomic_uint32 *KnownAssignedXidsNext;
 static TransactionId latestObservedXid = InvalidTransactionId;
 
 /*
@@ -225,6 +226,7 @@ void
 CreateSharedProcArray(void)
 {
     bool        found;
+    int         i;
 
     /* Create or attach to the ProcArray shared structure */
     procArray = (ProcArrayStruct *)
@@ -266,6 +268,12 @@ CreateSharedProcArray(void)
             ShmemInitStruct("KnownAssignedXidsValid",
                             mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
                             &found);
+        KnownAssignedXidsNext = (pg_atomic_uint32 *)
+            ShmemInitStruct("KnownAssignedXidsNext",
+                            mul_size(sizeof(pg_atomic_uint32), TOTAL_MAX_CACHED_SUBXIDS),
+                            &found);
+        for (i = 0; i < TOTAL_MAX_CACHED_SUBXIDS; i++)
+            pg_atomic_init_u32(&KnownAssignedXidsNext[i], 1);
     }
 }
 
@@ -3558,12 +3566,13 @@ KnownAssignedXidsCompress(bool force)
      * re-aligning data to 0th element.
      */
     compress_index = 0;
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {
             KnownAssignedXids[compress_index] = KnownAssignedXids[i];
             KnownAssignedXidsValid[compress_index] = true;
+            pg_atomic_write_u32(&KnownAssignedXidsNext[compress_index], 1);
             compress_index++;
         }
     }
@@ -3666,6 +3675,7 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
     {
         KnownAssignedXids[head] = next_xid;
         KnownAssignedXidsValid[head] = true;
+        pg_atomic_write_u32(&KnownAssignedXidsNext[head], 1);
         TransactionIdAdvance(next_xid);
         head++;
     }
@@ -3881,7 +3891,7 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid)
     tail = pArray->tailKnownAssignedXids;
     head = pArray->headKnownAssignedXids;
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {
@@ -3904,7 +3914,7 @@
     /*
      * Advance the tail pointer if we've marked the tail item invalid.
      */
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
             break;
@@ -3954,7 +3964,8 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
     int         count = 0;
     int         head,
                 tail;
-    int         i;
+    int         i,
+                prev;
 
     /*
      * Fetch head just once, since it may change while we loop.  We can stop
@@ -3968,9 +3979,10 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
     SpinLockAcquire(&procArray->known_assigned_xids_lck);
     tail = procArray->tailKnownAssignedXids;
     head = procArray->headKnownAssignedXids;
+    prev = tail;
     SpinLockRelease(&procArray->known_assigned_xids_lck);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         /* Skip any gaps in the array */
         if (KnownAssignedXidsValid[i])
@@ -3993,6 +4005,14 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
                 TransactionIdFollowsOrEquals(knownXid, xmax))
                 break;
 
+            if (prev != i)
+            {
+                uint32 offset = i - prev;
+                if (pg_atomic_read_u32(&KnownAssignedXidsNext[prev]) != offset)
+                    pg_atomic_write_u32(&KnownAssignedXidsNext[prev], offset);
+                prev = i;
+            }
+
             /* Add knownXid into output array */
             xarray[count++] = knownXid;
         }
@@ -4020,7 +4040,7 @@ KnownAssignedXidsGetOldestXmin(void)
     head = procArray->headKnownAssignedXids;
     SpinLockRelease(&procArray->known_assigned_xids_lck);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         /* Skip any gaps in the array */
         if (KnownAssignedXidsValid[i])
@@ -4055,7 +4075,7 @@ KnownAssignedXidsDisplay(int trace_level)
 
     initStringInfo(&buf);
 
-    for (i = tail; i < head; i++)
+    for (i = tail; i < head; i += pg_atomic_read_u32(&KnownAssignedXidsNext[i]))
     {
         if (KnownAssignedXidsValid[i])
         {