On Thu, 3 Nov 2022 at 06:25, Andres Freund <and...@anarazel.de> wrote: > Attached is an experimental patch/hack for that. It ended up being more > beneficial to make the access ordering more optimal than prefetching the tuple > contents, but I'm not at all sure that's the be-all-end-all.
Thanks for writing that patch. I've been experimenting with it. I tried unrolling the loop (patch 0003) as you mentioned in: + * FIXME: Worth unrolling so that we don't fetch the same cacheline + * over and over, due to line items being smaller than a cacheline? but didn't see any gains from doing that. I also adjusted your patch a little so that instead of doing: - OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ + OffsetNumber *rs_vistuples; + OffsetNumber rs_vistuples_d[MaxHeapTuplesPerPage]; /* their offsets */ to work around the issue of having to populate rs_vistuples_d in reverse, I added a new field called rs_startindex to mark where the first element in the rs_vistuples array is. The way you wrote it seems to require fewer code changes, but per the FIXME comment you left, I get the idea you just did it the way you did to make it work enough for testing. I'm quite keen to move forward in committing the 0001 patch to add the pg_prefetch_mem macro. What I'm a little undecided about is what the best patch is to commit first to make use of the new macro. 
I did some tests on the attached set of patches: alter system set max_parallel_workers_per_gather = 0; select pg_reload_conf(); create table t as select a from generate_series(1,10000000)a; alter table t set (autovacuum_enabled=false); $ cat bench.sql select * from t where a = 0; psql -c "select pg_prewarm('t');" postgres -- Test 1 no frozen tuples in "t" Master (@9c6ad5eaa): $ pgbench -n -f bench.sql -M prepared -T 10 postgres | grep -E "^latency" latency average = 383.332 ms latency average = 375.747 ms latency average = 376.090 ms Master + 0001 + 0002: $ pgbench -n -f bench.sql -M prepared -T 10 postgres | grep -E "^latency" latency average = 370.133 ms latency average = 370.149 ms latency average = 370.157 ms Master + 0001 + 0005: $ pgbench -n -f bench.sql -M prepared -T 10 postgres | grep -E "^latency" latency average = 372.662 ms latency average = 371.034 ms latency average = 372.709 ms -- Test 2 "select count(*) from t" with all tuples frozen $ cat bench1.sql select count(*) from t; psql -c "vacuum freeze t;" postgres psql -c "select pg_prewarm('t');" postgres Master (@9c6ad5eaa): $ pgbench -n -f bench1.sql -M prepared -T 10 postgres | grep -E "^latency" latency average = 406.238 ms latency average = 407.029 ms latency average = 406.962 ms Master + 0001 + 0005: $ pgbench -n -f bench1.sql -M prepared -T 10 postgres | grep -E "^latency" latency average = 345.470 ms latency average = 345.775 ms latency average = 345.354 ms My current thoughts are that it might be best to go with 0005 to start with. I know Melanie is working on making some changes in this area, so perhaps it's best to leave 0002 until that work is complete. David
From 491df9d6ab87a54bbc76b876484733d02d6c94ea Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Wed, 19 Oct 2022 08:54:01 +1300 Subject: [PATCH v2 1/5] Add pg_prefetch_mem() macro to load cache lines. Initially mapping to GCC, Clang and MSVC builtins. Discussion: https://postgr.es/m/CAEepm%3D2y9HM9QP%2BHhRZdQ3pU6FShSMyu%3DV1uHXhQ5gG-dketHg%40mail.gmail.com --- config/c-compiler.m4 | 17 ++++++++++++++++ configure | 40 ++++++++++++++++++++++++++++++++++++++ configure.ac | 3 +++ meson.build | 3 ++- src/include/c.h | 8 ++++++++ src/include/pg_config.h.in | 3 +++ src/tools/msvc/Solution.pm | 1 + 7 files changed, 74 insertions(+), 1 deletion(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 000b075312..582a47501c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -355,6 +355,23 @@ AC_DEFINE_UNQUOTED(AS_TR_CPP([HAVE$1]), 1, [Define to 1 if your compiler understands $1.]) fi])# PGAC_CHECK_BUILTIN_FUNC +# PGAC_CHECK_BUILTIN_VOID_FUNC +# ----------------------- +# Variant for void functions. +AC_DEFUN([PGAC_CHECK_BUILTIN_VOID_FUNC], +[AC_CACHE_CHECK(for $1, pgac_cv$1, +[AC_LINK_IFELSE([AC_LANG_PROGRAM([ +void +call$1($2) +{ + $1(x); +}], [])], +[pgac_cv$1=yes], +[pgac_cv$1=no])]) +if test x"${pgac_cv$1}" = xyes ; then +AC_DEFINE_UNQUOTED(AS_TR_CPP([HAVE$1]), 1, + [Define to 1 if your compiler understands $1.]) +fi])# PGAC_CHECK_BUILTIN_VOID_FUNC # PGAC_CHECK_BUILTIN_FUNC_PTR diff --git a/configure b/configure index 3966368b8d..c4685b8a1e 100755 --- a/configure +++ b/configure @@ -15988,6 +15988,46 @@ _ACEOF fi +# Can we use a built-in to prefetch memory? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_prefetch" >&5 +$as_echo_n "checking for __builtin_prefetch... " >&6; } +if ${pgac_cv__builtin_prefetch+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +void +call__builtin_prefetch(void *x) +{ + __builtin_prefetch(x); +} +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__builtin_prefetch=yes +else + pgac_cv__builtin_prefetch=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_prefetch" >&5 +$as_echo "$pgac_cv__builtin_prefetch" >&6; } +if test x"${pgac_cv__builtin_prefetch}" = xyes ; then + +cat >>confdefs.h <<_ACEOF +#define HAVE__BUILTIN_PREFETCH 1 +_ACEOF + +fi + # We require 64-bit fseeko() to be available, but run this check anyway # in case it finds that _LARGEFILE_SOURCE has to be #define'd for that. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGEFILE_SOURCE value needed for large files" >&5 diff --git a/configure.ac b/configure.ac index f76b7ee31f..2d4938d43d 100644 --- a/configure.ac +++ b/configure.ac @@ -1802,6 +1802,9 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x]) # so it needs a different test function. PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0]) +# Can we use a built-in to prefetch memory? +PGAC_CHECK_BUILTIN_VOID_FUNC([__builtin_prefetch], [void *x]) + # We require 64-bit fseeko() to be available, but run this check anyway # in case it finds that _LARGEFILE_SOURCE has to be #define'd for that. AC_FUNC_FSEEKO diff --git a/meson.build b/meson.build index 058382046e..096643703c 100644 --- a/meson.build +++ b/meson.build @@ -1587,10 +1587,11 @@ builtins = [ 'bswap32', 'bswap64', 'clz', - 'ctz', 'constant_p', + 'ctz', 'frame_address', 'popcount', + 'prefetch', 'unreachable', ] diff --git a/src/include/c.h b/src/include/c.h index 98cdd285dd..a7f7531450 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -361,6 +361,14 @@ typedef void (*pg_funcptr_t) (void); */ #define FLEXIBLE_ARRAY_MEMBER /* empty */ +/* Do we have support for prefetching memory? 
*/ +#if defined(HAVE__BUILTIN_PREFETCH) +#define pg_prefetch_mem(a) __builtin_prefetch(a) +#elif defined(_MSC_VER) +#define pg_prefetch_mem(a) _m_prefetch(a) +#else +#define pg_prefetch_mem(a) +#endif /* ---------------------------------------------------------------- * Section 2: bool, true, false diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c5a80b829e..07a661e288 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -559,6 +559,9 @@ /* Define to 1 if your compiler understands __builtin_popcount. */ #undef HAVE__BUILTIN_POPCOUNT +/* Define to 1 if your compiler understands __builtin_prefetch. */ +#undef HAVE__BUILTIN_PREFETCH + /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index c2acb58df0..95de91890e 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -227,6 +227,7 @@ sub GenerateFiles HAVE_BACKTRACE_SYMBOLS => undef, HAVE_BIO_GET_DATA => undef, HAVE_BIO_METH_NEW => undef, + HAVE__BUILTIN_PREFETCH => undef, HAVE_COMPUTED_GOTO => undef, HAVE_COPYFILE => undef, HAVE_COPYFILE_H => undef, -- 2.35.1.windows.2
From c5ec896a2df02041f08d1e41a982223781137d5f Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Thu, 17 Nov 2022 13:04:49 +1300 Subject: [PATCH v2 2/5] Perform memory prefetching in heapgetpage --- src/backend/access/heap/heapam.c | 87 +++++++++++++++++++----- src/backend/access/heap/heapam_handler.c | 7 +- src/include/access/heapam.h | 1 + 3 files changed, 74 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index d18c5ca6f5..81c7f69644 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -451,31 +451,82 @@ heapgetpage(TableScanDesc sscan, BlockNumber block) */ all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); - lineoff <= lines; - lineoff++, lpp++) + if (all_visible) { - if (ItemIdIsNormal(lpp)) + HeapTupleData loctup; + + loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); + + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, lpp++) { - HeapTupleData loctup; - bool valid; + if (!ItemIdIsNormal(lpp)) + continue; - loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); loctup.t_len = ItemIdGetLength(lpp); ItemPointerSet(&(loctup.t_self), block, lineoff); - if (all_visible) - valid = true; - else - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + HeapCheckForSerializableConflictOut(true, scan->rs_base.rs_rd, + &loctup, buffer, snapshot); + scan->rs_vistuples[ntup++] = lineoff; + } + scan->rs_startindex = 0; + } + else + { + HeapTupleData loctup; + int normcount = 0; + int startindex = MaxHeapTuplesPerPage; + OffsetNumber normoffsets[MaxHeapTuplesPerPage]; + + loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); + + /* + * Iterate forward over line items, they're laid out in increasing + * order in memory. 
Doing this separately allows to benefit from + * out-of-order capabilities of the CPU and simplifies the next loop. + */ + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, lpp++) + { + pg_prefetch_mem(PageGetItemId(page, lineoff+5)); + if (ItemIdIsNormal(lpp)) + normoffsets[normcount++] = lineoff; + } + + /* + * Process tuples in reverse order. That'll most often lead to memory + * accesses in increasing order, which typically is more efficient for + * the CPUs prefetcher. To avoid affecting sort order, we populate the + * rs_vistuples[] array backwards and use rs_startindex to mark the + * first used element in the array. + */ + for (int i = normcount - 1; i >= 0; i--) + { + bool valid; + + lineoff = normoffsets[i]; + lpp = PageGetItemId(page, lineoff); + + loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); + loctup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(loctup.t_self), block, lineoff); + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, &loctup, buffer, snapshot); if (valid) - scan->rs_vistuples[ntup++] = lineoff; + { + scan->rs_vistuples[--startindex] = lineoff; + ntup++; + } } + /* record the first used element in rs_vistuples[] */ + scan->rs_startindex = startindex; } LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -902,7 +953,7 @@ heapgettup_pagemode(HeapScanDesc scan, else block = scan->rs_startblock; /* first page */ heapgetpage((TableScanDesc) scan, block); - lineindex = 0; + lineindex = scan->rs_startindex; scan->rs_inited = true; } else @@ -917,7 +968,7 @@ heapgettup_pagemode(HeapScanDesc scan, lines = scan->rs_ntuples; /* block and lineindex now reference the next visible tid */ - linesleft = lines - lineindex; + linesleft = lines - lineindex + scan->rs_startindex; } else if (backward) { @@ -968,7 +1019,7 @@ heapgettup_pagemode(HeapScanDesc scan, if (!scan->rs_inited) { - lineindex = lines - 1; + lineindex = 
scan->rs_startindex + lines - 1; scan->rs_inited = true; } else @@ -977,7 +1028,7 @@ heapgettup_pagemode(HeapScanDesc scan, } /* block and lineindex now reference the previous visible tid */ - linesleft = lineindex + 1; + linesleft = lineindex + 1 - scan->rs_startindex; } else { @@ -1127,9 +1178,9 @@ heapgettup_pagemode(HeapScanDesc scan, lines = scan->rs_ntuples; linesleft = lines; if (backward) - lineindex = lines - 1; + lineindex = MaxHeapTuplesPerPage - 1; else - lineindex = 0; + lineindex = scan->rs_startindex; } } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ab1bcf3522..d39284465b 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2490,15 +2490,16 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, { /* * In pageatatime mode, heapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. + * so just look at the info it left in rs_vistuples[] starting at + * rs_startindex. * * We use a binary search over the known-sorted array. Note: we could * save some effort if we insisted that NextSampleTuple select tuples * in increasing order, but it's not clear that there would be enough * gain to justify the restriction. 
*/ - int start = 0, - end = hscan->rs_ntuples - 1; + int start = hscan->rs_startindex, + end = hscan->rs_startindex + hscan->rs_ntuples - 1; while (start <= end) { diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 810baaf9d0..aba0795fc6 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -74,6 +74,7 @@ typedef struct HeapScanDescData /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ + int rs_startindex; /* first used element in rs_vistuples[] */ OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ } HeapScanDescData; typedef struct HeapScanDescData *HeapScanDesc; -- 2.35.1.windows.2
From f0b8920b23b200132a4e92942c4bb281426e4f9c Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Mon, 31 Oct 2022 10:05:12 +1300 Subject: [PATCH v2 5/5] Prefetch tuple memory during forward seqscans --- src/backend/access/heap/heapam.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8fe4f4c837..6dc53effb4 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1115,6 +1115,17 @@ heapgettup_pagemode(HeapScanDesc scan, tuple->t_len = ItemIdGetLength(lpp); ItemPointerSet(&(tuple->t_self), block, lineoff); + /* + * Prefetching the memory for the next tuple has shown to improve + * performance on certain hardware. + */ + if (!backward && linesleft > 1) + { + lineoff = scan->rs_vistuples[lineindex + 1]; + lpp = PageGetItemId(page, lineoff); + pg_prefetch_mem(PageGetItem(page, lpp)); + } + /* * if current tuple qualifies, return it. */ -- 2.35.1.windows.2
From 8f47b1e0163028ec8928aaf02ce49d34bb89c647 Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Wed, 19 Oct 2022 08:55:45 +1300 Subject: [PATCH v2 4/5] heapam: WIP: cacheline prefetching for hot pruning. Author: Andres Freund, with contributions from Dmitry Dolgov --- src/backend/access/heap/pruneheap.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 91c5f5e9ef..a094ac18f5 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -302,6 +302,30 @@ heap_page_prune(Relation relation, Buffer buffer, maxoff = PageGetMaxOffsetNumber(page); tup.t_tableOid = RelationGetRelid(prstate.rel); +#if 1 + for (char *p = (char *) PageGetItemId(page, FirstOffsetNumber); + p < (char *) PageGetItemId(page, maxoff); + p += 64) + { + pg_prefetch_mem((ItemId)p); + } + + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid) || !ItemIdHasStorage(itemid)) + continue; + + pg_prefetch_mem((HeapTupleHeader) PageGetItem(page, itemid)); + pg_prefetch_mem(PageGetItem(page, itemid) + sizeof(HeapTupleHeaderData) - 1); + } +#endif + /* * Determine HTSV for all tuples. * -- 2.35.1.windows.2
From f1b183123befc1a3f096ba36b1c3834b4c67d3ec Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Thu, 17 Nov 2022 13:24:40 +1300 Subject: [PATCH v2 3/5] Unroll loop in heapgetpage --- src/backend/access/heap/heapam.c | 48 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 81c7f69644..8fe4f4c837 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -484,15 +484,51 @@ heapgetpage(TableScanDesc sscan, BlockNumber block) loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); /* - * Iterate forward over line items, they're laid out in increasing - * order in memory. Doing this separately allows to benefit from - * out-of-order capabilities of the CPU and simplifies the next loop. + * Iterate forward over line items, processing 16 at a time (this + * assumes there will be 16 ItemIds per CPU cacheline). */ for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); - lineoff <= lines; - lineoff++, lpp++) + lineoff <= lines - 15; + lineoff += 16, lpp += 16) + { + pg_prefetch_mem(PageGetItemId(page, lineoff + 16)); + if (ItemIdIsNormal(lpp)) + normoffsets[normcount++] = lineoff; + if (ItemIdIsNormal(lpp + 1)) + normoffsets[normcount++] = lineoff + 1; + if (ItemIdIsNormal(lpp + 2)) + normoffsets[normcount++] = lineoff + 2; + if (ItemIdIsNormal(lpp + 3)) + normoffsets[normcount++] = lineoff + 3; + if (ItemIdIsNormal(lpp + 4)) + normoffsets[normcount++] = lineoff + 4; + if (ItemIdIsNormal(lpp + 5)) + normoffsets[normcount++] = lineoff + 5; + if (ItemIdIsNormal(lpp + 6)) + normoffsets[normcount++] = lineoff + 6; + if (ItemIdIsNormal(lpp + 7)) + normoffsets[normcount++] = lineoff + 7; + if (ItemIdIsNormal(lpp + 8)) + normoffsets[normcount++] = lineoff + 8; + if (ItemIdIsNormal(lpp + 9)) + normoffsets[normcount++] = lineoff + 9; + if (ItemIdIsNormal(lpp + 10)) + normoffsets[normcount++] = lineoff + 
10; + if (ItemIdIsNormal(lpp + 11)) + normoffsets[normcount++] = lineoff + 11; + if (ItemIdIsNormal(lpp + 12)) + normoffsets[normcount++] = lineoff + 12; + if (ItemIdIsNormal(lpp + 13)) + normoffsets[normcount++] = lineoff + 13; + if (ItemIdIsNormal(lpp + 14)) + normoffsets[normcount++] = lineoff + 14; + if (ItemIdIsNormal(lpp + 15)) + normoffsets[normcount++] = lineoff + 15; + } + + /* get remainder */ + for (; lineoff <= lines; lineoff++, lpp++) { - pg_prefetch_mem(PageGetItemId(page, lineoff+5)); if (ItemIdIsNormal(lpp)) normoffsets[normcount++] = lineoff; } -- 2.35.1.windows.2