Hi All,

I have added microvacuum support for the hash index access method; the v1 patch is attached. The patch takes care of the following things:
1. Firstly, it changes the marking of dead tuples from 'tuple-at-a-time' to 'page-at-a-time' during a hash index scan. We accumulate the heap TID and offset of every hash index tuple flagged by kill_prior_tuple during the scan, and then mark all of the accumulated TIDs as LP_DEAD either while stepping from one page to the next (for scans in both the forward and backward direction), at the end of the hash index scan, or during a rescan. (A condensed sketch of this flow is included after the test results below.)

2. Secondly, when inserting a tuple into the hash index, if not enough space is found on the current page, we first remove any dead tuples found on that page before moving to the next page in the bucket chain or going for a bucket split. This increases page reusability and reduces the number of page splits, thereby reducing the overall size of the hash index. (A condensed sketch of this check follows the main patch below.)

I have compared the hash index size with and without my patch (microvacuum_hash_index_v1.patch, attached) on a high-end machine at various scale factors; the results are shown below. For this test I created a hash index pgbench_accounts_aid on the aid column of pgbench_accounts instead of the primary key, and the numbers in the table below are for that setup. The patch (pgbench.patch) containing these pgbench changes is also attached. In addition, I used my own script file (file_hash_kill_prior_tuple, also attached) to update the indexed column with a pgbench read-write workload.

Here are some initial test results showing the benefit of this patch.

postgresql.conf and pgbench settings:
autovacuum = off
client count = 64
run duration = 15 mins
./pgbench -c $threads -j $threads -T 900 postgres -f ~/file_hash_kill_prior_tuple

Scale Factor    hash index size @ start    HEAD        HEAD + Patch
10              32 MB                      579 MB      158 MB
50              128 MB                     630 MB      350 MB
100             256 MB                     1255 MB     635 MB
300             1024 MB                    2233 MB     1093 MB

As shown above, at scale factor 10 the hash index size is reduced almost 4x with the patch, whereas at scale factors 50 through 300 it is reduced by roughly half. This shows that the patch can reduce the hash index size to a good extent.

System specifications:
---------------------------------
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                128
On-line CPU(s) list:   0-127
Thread(s) per core:    2
Core(s) per socket:    8
Socket(s):             8
NUMA node(s):          8
Vendor ID:             GenuineIntel

Note: The patch (microvacuum_hash_index_v1.patch) is prepared on top of concurrent_hash_index_v8.patch [1] and wal_hash_index_v5.1.patch [2] for hash index.

[1] - https://www.postgresql.org/message-id/CAA4eK1%2BX%3D8sUd1UCZDZnE3D9CGi9kw%2Bkjxp2Tnw7SX5w8pLBNw%40mail.gmail.com
[2] - https://www.postgresql.org/message-id/CAA4eK1KE%3D%2BkkowyYD0vmch%3Dph4ND3H1tViAB%2B0cWTHqjZDDfqg%40mail.gmail.com
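To make the diff below easier to follow, here is a condensed sketch of the scan-side flow from point 1 (accumulate killed items, then mark them page-at-a-time). This is only an illustration simplified from the attached patch; the authoritative code is in hashgettuple(), _hash_step(), hashrescan(), hashendscan() and the new hashkillitems() below.

    /* In hashgettuple(): instead of calling ItemIdMarkDead() and
     * MarkBufferDirtyHint() for each tuple, just remember it for later. */
    if (scan->kill_prior_tuple)
    {
        if (so->killedItems == NULL)
            so->killedItems = palloc(MaxIndexTuplesPerPage *
                                     sizeof(HashScanPosItem));

        if (so->numKilled < MaxIndexTuplesPerPage)
        {
            so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
            so->killedItems[so->numKilled].indexOffset =
                ItemPointerGetOffsetNumber(&(so->hashso_curpos));
            so->numKilled++;
        }
    }

    /* Before leaving the current page in _hash_step(), and in hashrescan()
     * and hashendscan(), flush the accumulated items in one go.
     * hashkillitems() re-matches each item by heap TID, sets LP_DEAD on the
     * matching line pointers, sets the page's LH_PAGE_HAS_DEAD_TUPLES flag
     * and marks the buffer dirty as a hint. */
    if (so->numKilled > 0)
        hashkillitems(scan);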
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index db73f05..a0720ef 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -325,14 +325,21 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) if (scan->kill_prior_tuple) { /* - * Yes, so mark it by setting the LP_DEAD state in the item flags. + * Yes, so remember it for later. (We'll deal with all such + * tuples at once right after leaving the index page or at + * end of scan.) */ - ItemIdMarkDead(PageGetItemId(page, offnum)); + if (so->killedItems == NULL) + so->killedItems = palloc(MaxIndexTuplesPerPage * + sizeof(HashScanPosItem)); - /* - * Since this can be redone later if needed, mark as a hint. - */ - MarkBufferDirtyHint(buf, true); + if (so->numKilled < MaxIndexTuplesPerPage) + { + so->killedItems[so->numKilled].heapTid = so->hashso_heappos; + so->killedItems[so->numKilled].indexOffset = + ItemPointerGetOffsetNumber(&(so->hashso_curpos)); + so->numKilled++; + } } /* @@ -439,6 +446,9 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) so->hashso_skip_moved_tuples = false; + so->killedItems = NULL; + so->numKilled = 0; + scan->opaque = so; return scan; @@ -454,6 +464,10 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + hashkillitems(scan); + _hash_dropscanbuf(rel, so); /* set position invalid (this will cause _hash_first call) */ @@ -480,6 +494,10 @@ hashendscan(IndexScanDesc scan) HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + hashkillitems(scan); + _hash_dropscanbuf(rel, so); pfree(so); @@ -809,6 +827,15 @@ hashbucketcleanup(Relation rel, Buffer bucket_buf, PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; + /* + * Let us mark the page as clean if vacuum removes the DEAD tuples + * from an index page. We do this by clearing LH_PAGE_HAS_DEAD_TUPLES + * flag. + */ + if (tuples_removed && *tuples_removed > 0 && + opaque->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirty(buf); /* XLOG stuff */ diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index d030a8d..5f2bc7c 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -921,6 +921,82 @@ hash_xlog_update_meta_page(XLogReaderState *record) UnlockReleaseBuffer(metabuf); } +/* + * replay delete operation in hash index to remove + * tuples marked as DEAD during index tuple insertion. 
+ */ +static void +hash_xlog_vacuum_one_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_vacuum *xldata = (xl_hash_vacuum *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer buffer; + Buffer metabuf; + Page page; + XLogRedoAction action; + + if (xldata->is_primary_bucket_page) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &buffer); + else + { + RelFileNode rnode; + BlockNumber blkno; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + bucketbuf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, + RBM_NORMAL); + + if (BufferIsValid(bucketbuf)) + LockBufferForCleanup(bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &buffer); + } + + if (action == BLK_NEEDS_REDO) + { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + page = (Page) BufferGetPage(buffer); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) + { + Page metapage; + HashMetaPage metap; + + metapage = BufferGetPage(metabuf); + metap = HashPageGetMeta(metapage); + + metap->hashm_ntuples -= xldata->ntuples; + + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + void hash_redo(XLogReaderState *record) { @@ -964,6 +1040,9 @@ hash_redo(XLogReaderState *record) case XLOG_HASH_UPDATE_META_PAGE: hash_xlog_update_meta_page(record); break; + case XLOG_HASH_VACUUM_ONE_PAGE: + hash_xlog_vacuum_one_page(record); + break; default: elog(PANIC, "hash_redo: unknown op code %u", info); } diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 3514138..1bcb214 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -19,7 +19,11 @@ #include "access/hash_xlog.h" #include "miscadmin.h" #include "utils/rel.h" +#include "storage/lwlock.h" +#include "storage/buf_internals.h" +static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, + Buffer bucket_buf, bool is_primary_bucket_page); /* * _hash_doinsert() -- Handle insertion of a single index tuple. @@ -206,6 +210,28 @@ _hash_doinsert(Relation rel, IndexTuple itup) while (PageGetFreeSpace(page) < itemsz) { /* + * Check if current page has any DEAD tuples. If yes, + * delete these tuples and see if we can get a space for + * the new item to be inserted before moving to the next + * page in the bucket chain. + */ + if (H_HAS_DEAD_TUPLES(pageopaque) && CheckBufferForCleanup(bucket_buf)) + { + /* + * Write-lock the meta page so that we can decrement + * tuple count. + */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + + _hash_vacuum_one_page(rel, metabuf, buf, bucket_buf, + (buf == bucket_buf) ? 
true : false); + + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + + /* * no space on this page; check for an overflow page */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; @@ -247,7 +273,8 @@ _hash_doinsert(Relation rel, IndexTuple itup) Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); + Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE || + pageopaque->hasho_flag == (LH_OVERFLOW_PAGE | LH_PAGE_HAS_DEAD_TUPLES)); Assert(pageopaque->hasho_bucket == bucket); } @@ -390,3 +417,89 @@ _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, RelationGetRelationName(rel)); } } + +/* + * _hash_vacuum_one_page - vacuum just one index page. + * Try to remove LP_DEAD items from the given page. We + * must acquire cleanup lock on the primary bucket page + * before calling this function. + */ + +static void +_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, + Buffer bucket_buf, bool is_primary_bucket_page) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + Page page = BufferGetPage(buf); + HashPageOpaque pageopaque; + HashMetaPage metap; + double tuples_removed = 0; + + /* Scan each tuple in page to see if it is marked as LP_DEAD */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + { + deletable[ndeletable++] = offnum; + tuples_removed += 1; + } + } + + if (ndeletable > 0) + { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + metap = HashPageGetMeta(BufferGetPage(metabuf)); + metap->hashm_ntuples -= tuples_removed; + + MarkBufferDirty(buf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_vacuum xlrec; + XLogRecPtr recptr; + + xlrec.is_primary_bucket_page = is_primary_bucket_page; + xlrec.ntuples = tuples_removed; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashVacuum); + + /* + * primary bucket buffer needs to be registered to ensure + * that we acquire cleanup lock during replay. 
+ */ + if (!xlrec.is_primary_bucket_page) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + } +} diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 0df64a8..574998e 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -455,6 +455,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum <= maxoff) { Assert(offnum >= FirstOffsetNumber); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); /* @@ -473,6 +474,10 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) break; /* yes, so exit for-loop */ } + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + hashkillitems(scan); + /* * ran off the end of this page, try the next */ @@ -544,6 +549,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum >= FirstOffsetNumber) { Assert(offnum <= maxoff); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); /* @@ -562,6 +568,10 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) break; /* yes, so exit for-loop */ } + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + hashkillitems(scan); + /* * ran off the end of this page, try the next */ diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index b5164d7..4350e32 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -19,6 +19,7 @@ #include "access/relscan.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "storage/buf_internals.h" /* @@ -489,3 +490,72 @@ _hash_get_newbucket(Relation rel, Bucket curr_bucket, return new_bucket; } + +/* + * hashkillitems - set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * We match items by heap TID before assuming they are the right ones to + * delete. If an item has moved off the current page due to a split, we'll + * fail to find it and do nothing (this is not an error case --- we assume + * the item will eventually get marked in a future indexscan). + */ +void +hashkillitems(IndexScanDesc scan) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum, maxoff; + int numKilled = so->numKilled; + int i; + bool killedsomething = false; + + Assert(so->numKilled > 0); + Assert(so->killedItems != NULL); + + /* + * Always reset the scan state, so we don't look for same + * items on other pages. 
+ */ + so->numKilled = 0; + + page = BufferGetPage(so->hashso_curbuf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + offnum = so->killedItems[i].indexOffset; + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) + { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. + * Whenever we mark anything LP_DEAD, we also set the page's + * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. + */ + if (killedsomething) + { + opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirtyHint(so->hashso_curbuf, true); + } +} diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index 245ce97..7fc5721 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -155,6 +155,8 @@ hash_identify(uint8 info) case XLOG_HASH_UPDATE_META_PAGE: id = "UPDATE_META_PAGE"; break; + case XLOG_HASH_VACUUM_ONE_PAGE: + id = "VACUUM_ONE_PAGE"; } return id; diff --git a/src/include/access/hash.h b/src/include/access/hash.h index c0434f5..185d1e8 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -57,6 +57,7 @@ typedef uint32 Bucket; #define LH_BUCKET_NEW_PAGE_SPLIT (1 << 4) #define LH_BUCKET_OLD_PAGE_SPLIT (1 << 5) #define LH_BUCKET_PAGE_HAS_GARBAGE (1 << 6) +#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) typedef struct HashPageOpaqueData { @@ -74,6 +75,7 @@ typedef HashPageOpaqueData *HashPageOpaque; #define H_NEW_INCOMPLETE_SPLIT(opaque) ((opaque)->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) #define H_INCOMPLETE_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) || \ ((opaque)->hasho_flag & LH_BUCKET_OLD_PAGE_SPLIT)) +#define H_HAS_DEAD_TUPLES(opaque) ((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) /* * The page ID is for the convenience of pg_filedump and similar utilities, @@ -83,6 +85,13 @@ typedef HashPageOpaqueData *HashPageOpaque; */ #define HASHO_PAGE_ID 0xFF80 +typedef struct HashScanPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ +} HashScanPosItem; + + /* * HashScanOpaqueData is private state for a hash index scan. 
*/ @@ -116,6 +125,10 @@ typedef struct HashScanOpaqueData /* Whether scan needs to skip tuples that are moved by split */ bool hashso_skip_moved_tuples; + + /* info about killed items if any (killedItems is NULL if never used) */ + HashScanPosItem *killedItems; /* tids and offset numbers of killed items */ + int numKilled; /* number of currently stored items */ } HashScanOpaqueData; typedef HashScanOpaqueData *HashScanOpaque; @@ -177,6 +190,7 @@ typedef struct HashMetaPageData typedef HashMetaPageData *HashMetaPage; + /* * Maximum size of a hash index item (it's okay to have only one per page) */ @@ -381,6 +395,7 @@ extern BlockNumber _hash_get_oldblk(Relation rel, HashPageOpaque opaque); extern BlockNumber _hash_get_newblk(Relation rel, HashPageOpaque opaque); extern Bucket _hash_get_newbucket(Relation rel, Bucket curr_bucket, uint32 lowmask, uint32 maxbucket); +extern void hashkillitems(IndexScanDesc scan); /* hash.c */ extern void hashbucketcleanup(Relation rel, Buffer bucket_buf, diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index 30e16c0..e9946d1 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -43,6 +43,7 @@ #define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after * vacuum */ +#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */ /* * xl_hash_split_allocpage flag values, 8 bits are available. @@ -257,6 +258,24 @@ typedef struct xl_hash_init_bitmap_page #define SizeOfHashInitBitmapPage \ (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16)) +/* + * This is what we need for index tuple deletion and to + * update the meta page. + * + * This data record is used for XLOG_HASH_VACUUM_ONE_PAGE + * + * Backup Blk 0/1: bucket page + * Backup Blk 2: meta page + */ +typedef struct xl_hash_vacuum +{ + double ntuples; + bool is_primary_bucket_page; +} xl_hash_vacuum; + +#define SizeOfHashVacuum \ + (offsetof(xl_hash_vacuum, is_primary_bucket_page) + sizeof(bool)) + extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char *hash_identify(uint8 info);
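For readability, the insertion-path change from point 2 boils down to roughly the following (condensed from the _hash_doinsert() and _hash_vacuum_one_page() hunks above; illustrative only, not the literal patch text):

    /* In _hash_doinsert(), while the current page lacks room for the tuple: */
    while (PageGetFreeSpace(page) < itemsz)
    {
        if (H_HAS_DEAD_TUPLES(pageopaque) && CheckBufferForCleanup(bucket_buf))
        {
            /* Write-lock the meta page so the tuple count can be decremented. */
            _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

            /* Remove LP_DEAD items from this page, clear
             * LH_PAGE_HAS_DEAD_TUPLES and WAL-log the deletion. */
            _hash_vacuum_one_page(rel, metabuf, buf, bucket_buf,
                                  (buf == bucket_buf) ? true : false);

            _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

            if (PageGetFreeSpace(page) >= itemsz)
                break;          /* enough space was freed; insert here */
        }

        /* ... otherwise fall through to the existing logic: follow the
         * bucket chain to the next overflow page or add a new one ... */
    }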
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 87fb006..9fda82d 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -2381,9 +2381,9 @@ init(bool is_no_vacuum)
 		}
 	};
 	static const char *const DDLINDEXes[] = {
-		"alter table pgbench_branches add primary key (bid)",
-		"alter table pgbench_tellers add primary key (tid)",
-		"alter table pgbench_accounts add primary key (aid)"
+		"create index pgbench_branches_bid on pgbench_branches using hash (bid)",
+		"create index pgbench_tellers_tid on pgbench_tellers using hash (tid)",
+		"create index pgbench_accounts_aid on pgbench_accounts using hash (aid)"
 	};
 	static const char *const DDLKEYs[] = {
 		"alter table pgbench_tellers add foreign key (bid) references pgbench_branches",