On Wed, Nov 19, 2014 at 2:09 PM, Peter Geoghegan <p...@heroku.com> wrote: > Attached is a revision of what I previously called btreecheck, which > is now renamed to amcheck.
Whoops. I left in modifications to pg_config_manual.h to build with Valgrind support. Here is a version without that. -- Peter Geoghegan
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile new file mode 100644 index ...07b18d8 *** a/contrib/amcheck/Makefile --- b/contrib/amcheck/Makefile *************** *** 0 **** --- 1,18 ---- + # contrib/amcheck/Makefile + + MODULE_big = amcheck + OBJS = amcheck.o + + EXTENSION = amcheck + DATA = amcheck--1.0.sql + + ifdef USE_PGXS + PG_CONFIG = pg_config + PGXS := $(shell $(PG_CONFIG) --pgxs) + include $(PGXS) + else + subdir = contrib/amcheck + top_builddir = ../.. + include $(top_builddir)/src/Makefile.global + include $(top_srcdir)/contrib/contrib-global.mk + endif diff --git a/contrib/amcheck/amcheck--1.0.sql b/contrib/amcheck/amcheck--1.0.sql new file mode 100644 index ...e25fad1 *** a/contrib/amcheck/amcheck--1.0.sql --- b/contrib/amcheck/amcheck--1.0.sql *************** *** 0 **** --- 1,44 ---- + /* contrib/amcheck/amcheck--1.0.sql */ + + -- complain if script is sourced in psql, rather than via CREATE EXTENSION + \echo Use "CREATE EXTENSION amcheck" to load this file. 
\quit + + -- + -- bt_page_verify() + -- + CREATE FUNCTION bt_page_verify(relname text, blkno int4) + RETURNS VOID + AS 'MODULE_PATHNAME', 'bt_page_verify' + LANGUAGE C STRICT; + + -- + -- bt_parent_page_verify() + -- + CREATE FUNCTION bt_parent_page_verify(relname text, blkno int4) + RETURNS VOID + AS 'MODULE_PATHNAME', 'bt_parent_page_verify' + LANGUAGE C STRICT; + + -- + -- bt_index_verify() + -- + CREATE FUNCTION bt_index_verify(relname text) + RETURNS VOID + AS 'MODULE_PATHNAME', 'bt_index_verify' + LANGUAGE C STRICT; + + -- + -- bt_parent_index_verify() + -- + CREATE FUNCTION bt_parent_index_verify(relname text) + RETURNS VOID + AS 'MODULE_PATHNAME', 'bt_parent_index_verify' + LANGUAGE C STRICT; + + -- + -- bt_leftright_verify() + -- + CREATE FUNCTION bt_leftright_verify(relname text) + RETURNS VOID + AS 'MODULE_PATHNAME', 'bt_leftright_verify' + LANGUAGE C STRICT; diff --git a/contrib/amcheck/amcheck.c b/contrib/amcheck/amcheck.c new file mode 100644 index ...8776fbe *** a/contrib/amcheck/amcheck.c --- b/contrib/amcheck/amcheck.c *************** *** 0 **** --- 1,1009 ---- + /*------------------------------------------------------------------------- + * + * amcheck.c + * Verifies the integrity of access methods based on invariants. + * + * Currently, only checks for the nbtree AM are provided. Provides + * SQL-callable functions for verifying that various invariants in the + * structure of nbtree indexes are respected. This includes for example the + * invariant that each page with a high key has data items bound by the high + * key. Some functions also check invariant conditions that must hold across + * multiple pages, such as the requirement each left link within a page (if + * any) should point to a page that has as its right link a pointer to the + * page. 
+ * + * + * Copyright (c) 2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/amcheck.c + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "access/nbtree.h" + #include "catalog/index.h" + #include "catalog/namespace.h" + #include "commands/tablecmds.h" + #include "funcapi.h" + #include "miscadmin.h" + #include "storage/lmgr.h" + #include "utils/builtins.h" + #include "utils/lsyscache.h" + #include "utils/rel.h" + + + PG_MODULE_MAGIC; + + #define CHECK_SUPERUSER() { \ + if (!superuser()) \ + ereport(ERROR, \ + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), \ + (errmsg("must be superuser to use verification functions")))); } + #define CHECK_RELATION_BTREE(r) { \ + if ((r)->rd_rel->relkind != RELKIND_INDEX || (r)->rd_rel->relam != BTREE_AM_OID) \ + elog(ERROR, "relation \"%s\" is not a btree index", \ + RelationGetRelationName(r)); } + /* reject attempts to read non-local temporary relations */ + #define CHECK_RELATION_IS_NOT_OTHER_TEMP(rel) { \ + if (RELATION_IS_OTHER_TEMP(rel)) \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("cannot access temporary tables of other sessions"))); } + /* note: BlockNumber is unsigned, hence can't be negative */ + #define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \ + if (blkno == 0) \ + elog(ERROR, "block 0 is a meta page"); \ + if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno) ) \ + elog(ERROR, "block number out of range"); } + + /* + * Callers to verification functions can reasonably expect to never receive a + * warning. Therefore, when using amcheck functions for stress testing it + * may be useful to temporarily change CHCKELEVEL to PANIC, to immediately halt + * the server in the event of detecting an invariant condition violation. This + * may preserve more information about the nature of the underlying problem. 
+ */ + #define CHCKELEVEL WARNING + + PG_FUNCTION_INFO_V1(bt_page_verify); + PG_FUNCTION_INFO_V1(bt_parent_page_verify); + PG_FUNCTION_INFO_V1(bt_index_verify); + PG_FUNCTION_INFO_V1(bt_parent_index_verify); + PG_FUNCTION_INFO_V1(bt_leftright_verify); + + static void rangevar_callback_for_childcheck(const RangeVar *relation, + Oid relId, Oid oldRelId, + void *arg); + static void bt_do_page_verify(Relation rel, BlockNumber blockNum, + bool childCheck, bool singlepage); + static ScanKey first_item_right_page(Relation rel, BlockNumber rightblockNum, + Page *rpage); + static ScanKey verify_btree_items_le(Relation rel, Page childPage, + ScanKey downLinkParent, IndexTuple pItup); + static BlockNumber verify_leftright_level(Relation rel, BlockNumber leftMost); + static char *form_btree_data(IndexTuple itup); + + /* + * Verify specified B-Tree block/page. + * + * Only acquires AccessShareLock on index relation. + */ + Datum + bt_page_verify(PG_FUNCTION_ARGS) + { + text *relname = PG_GETARG_TEXT_P(0); + uint32 blkno = PG_GETARG_UINT32(1); + Relation rel; + RangeVar *relrv; + + CHECK_SUPERUSER(); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + CHECK_RELATION_BTREE(rel); + + CHECK_RELATION_IS_NOT_OTHER_TEMP(rel); + + CHECK_RELATION_BLOCK_RANGE(rel, blkno); + + bt_do_page_verify(rel, blkno, false, true); + + relation_close(rel, AccessShareLock); + + PG_RETURN_VOID(); + } + + /* + * Verify specified B-Tree parent block/page. Make sure that each child page + * has as its right link the page that the parent page has as the next + * down-link, and other checks between parent and child pages when examining a + * parent page. + * + * Acquires AccessExclusiveLock on index relation, and ShareLock on the + * associated heap relation, much like REINDEX. 
+ */ + Datum + bt_parent_page_verify(PG_FUNCTION_ARGS) + { + text *relname = PG_GETARG_TEXT_P(0); + uint32 blkno = PG_GETARG_UINT32(1); + Relation iRel, heapRel; + RangeVar *relrv; + Oid indexId, heapId = InvalidOid; + + CHECK_SUPERUSER(); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + + /* + * Open the target index relation and get an exclusive lock on it, to + * ensure that no one else is touching this particular index. + */ + indexId = RangeVarGetRelidExtended(relrv, AccessExclusiveLock, + false, false, + rangevar_callback_for_childcheck, + (void *) &heapId); + + /* + * Open the target index relations separately (like relation_openrv() but + * with heap relation locked first to prevent deadlocking) + */ + heapRel = heap_open(heapId, NoLock); + iRel = index_open(indexId, NoLock); + + /* check for active uses of the index in the current transaction */ + CheckTableNotInUse(iRel, "bt_parent_page_verify"); + + CHECK_RELATION_BTREE(iRel); + + CHECK_RELATION_IS_NOT_OTHER_TEMP(iRel); + + CHECK_RELATION_BLOCK_RANGE(iRel, blkno); + + bt_do_page_verify(iRel, blkno, true, true); + + index_close(iRel, AccessExclusiveLock); + heap_close(heapRel, ShareLock); + + PG_RETURN_VOID(); + } + + /* + * Verify entire index, without looking at cross-page invariant conditions. + * + * Only acquires AccessShareLock on index relation. 
+ */ + Datum + bt_index_verify(PG_FUNCTION_ARGS) + { + text *relname = PG_GETARG_TEXT_P(0); + Relation rel; + RangeVar *relrv; + BlockNumber i; + BlockNumber nBlocks; + + CHECK_SUPERUSER(); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + CHECK_RELATION_BTREE(rel); + + CHECK_RELATION_IS_NOT_OTHER_TEMP(rel); + + nBlocks = RelationGetNumberOfBlocks(rel); + + /* Verify every page */ + for (i = BTREE_METAPAGE + 1; i < nBlocks; i++) + { + bt_do_page_verify(rel, i, false, false); + } + + relation_close(rel, AccessShareLock); + + PG_RETURN_VOID(); + } + + /* + * Verify entire index, considering cross-page invariant conditions between + * parent and child pages. + * + * Acquires AccessExclusiveLock on index relation, and ShareLock on the + * associated heap relation, much like REINDEX. + */ + Datum + bt_parent_index_verify(PG_FUNCTION_ARGS) + { + text *relname = PG_GETARG_TEXT_P(0); + RangeVar *relrv; + BlockNumber i; + BlockNumber nBlocks; + Relation iRel, heapRel; + Oid indexId, heapId = InvalidOid; + + CHECK_SUPERUSER(); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + + /* + * Open the target index relation and get an exclusive lock on it, to + * ensure that no one else is touching this particular index. 
+ */ + indexId = RangeVarGetRelidExtended(relrv, AccessExclusiveLock, + false, false, + rangevar_callback_for_childcheck, + (void *) &heapId); + + /* + * Open the target index relations separately (like relation_openrv() but + * with heap relation locked first to prevent deadlocking) + */ + heapRel = heap_open(heapId, NoLock); + iRel = index_open(indexId, NoLock); + + /* check for active uses of the index in the current transaction */ + CheckTableNotInUse(iRel, "bt_parent_index_verify"); + + CHECK_RELATION_BTREE(iRel); + + CHECK_RELATION_IS_NOT_OTHER_TEMP(iRel); + + nBlocks = RelationGetNumberOfBlocks(iRel); + + /* Verify every page, and parent/child relationships */ + for (i = BTREE_METAPAGE + 1; i < nBlocks; i++) + { + bt_do_page_verify(iRel, i, true, false); + } + + index_close(iRel, AccessExclusiveLock); + heap_close(heapRel, ShareLock); + + PG_RETURN_VOID(); + } + + /* + * Verify left-right link consistency at each level, starting from the true + * root. + * + * Only acquires AccessShareLock on index relation. + */ + Datum + bt_leftright_verify(PG_FUNCTION_ARGS) + { + text *relname = PG_GETARG_TEXT_P(0); + Relation rel; + RangeVar *relrv; + Buffer buffer; + Page page; + BlockNumber leftMost; + BTMetaPageData *metad; + + CHECK_SUPERUSER(); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + CHECK_RELATION_BTREE(rel); + + CHECK_RELATION_IS_NOT_OTHER_TEMP(rel); + + /* Get true root block from meta-page */ + buffer = ReadBuffer(rel, BTREE_METAPAGE); + LockBuffer(buffer, BT_READ); + + page = BufferGetPage(buffer); + metad = BTPageGetMeta(page); + + UnlockReleaseBuffer(buffer); + + /* + * Verify every level, starting from true root's level. Move left to right. + * Process one level at a time. 
+ */ + leftMost = metad->btm_root; + + while (leftMost != P_NONE) + { + leftMost = verify_leftright_level(rel, leftMost); + } + + relation_close(rel, AccessShareLock); + + PG_RETURN_VOID(); + } + + /* + * Worker for bt_page_verify(), bt_index_verify() and "parent" variants of + * both. + * + * It is the caller's responsibility to handle heavyweight locking. Checking + * children from down-links has race conditions with only an AccessShareLock, + * since there is never an attempt to get a consistent view of multiple pages + * using multiple concurrent buffer locks (in general, we prefer to only lock + * one buffer at a time, and to only manipulate it in local memory). + */ + static void + bt_do_page_verify(Relation rel, BlockNumber blockNum, bool childCheck, + bool singlepage) + { + Buffer buffer; + Page page; + OffsetNumber offset, maxoffset; + BTPageOpaque opaque; + int16 relnatts; + + buffer = ReadBuffer(rel, blockNum); + LockBuffer(buffer, BT_READ); + + /* + * We copy the page into local storage to avoid holding pin on the + * buffer longer than we must, and possibly failing to release it at + * all if the calling query doesn't fetch all rows. + */ + page = palloc(BLCKSZ); + memcpy(page, BufferGetPage(buffer), BLCKSZ); + + UnlockReleaseBuffer(buffer); + + relnatts = rel->rd_rel->relnatts; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + maxoffset = PageGetMaxOffsetNumber(page); + + for (offset = P_FIRSTDATAKEY(opaque); offset <= maxoffset; offset++) + { + ItemId itemId; + IndexTuple itup; + ScanKey skey; + int32 eqs_hikey, eqs_item; + + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + + if (P_IGNORE(opaque)) + { + elog(NOTICE, "page %u of index \"%s\" is ignored", blockNum, + RelationGetRelationName(rel)); + break; + } + + /* + * As noted in comments above _bt_compare(), there is special handling + * of the first data item on a non-leaf page. 
There is clearly no + * point in building a ScanKey of whatever data happens to be in this + * first data IndexTuple, since its contents are undefined. (This is + * because _bt_compare() is hard-coded to specially treat it as "minus + * infinity"). + */ + if (!P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque)) + continue; + + itemId = PageGetItemId(page, offset); + + if (!ItemIdIsValid(itemId)) + elog(Max(CHCKELEVEL, ERROR), "invalid itemId"); + + itup = (IndexTuple) PageGetItem(page, itemId); + + /* Build ScanKey that relates to the current item/offset's value */ + skey = _bt_mkscankey(rel, itup); + + eqs_hikey = eqs_item = 0; + + /* If there is a high key, compare this item to it */ + if (!P_RIGHTMOST(opaque)) + eqs_hikey = _bt_compare(rel, relnatts, skey, page, P_HIKEY); + + /* + * Check that items are stored on page in logical order. + * + * In each iteration, check that next item is equal to or greater than + * current item, until there is no next item (i.e. don't do this on the + * last iteration). + */ + if (offset < maxoffset) + { + /* Iteration before last case */ + eqs_item = _bt_compare(rel, relnatts, skey, page, offset + 1); + } + else if (!P_RIGHTMOST(opaque)) + { + ScanKey nLow; + Page rpage; + + /* Last iteration (page item) */ + Assert(offset == maxoffset); + + /* + * There is no "next" item available to compare this final item to + * on this same page (there is only the page highkey). Look for + * one in right-link page, since one should be available. + * + * Get first item on right page by following right link, and verify + * that that item is less than or equal to this last item. Note + * that we always get the first real data item -- not the + * right-link's high key, and not the garbage first "real" item + * that internal pages have. 
+ * + * It might be that there has been a page split and the right link + * is now not the same as the right link during the time that we + * took a local copy of the current page, but (even without + * heavyweight locking by caller) this shouldn't matter. It should + * be possible to treat whatever we find to the right (if anything) + * as an invariant that must hold. (Since pages split right, it + * might be the same page in a sense, which won't in itself have + * changed the first item, or it might be deleted.) + * + * XXX: In general there is a pretty good chance that this right + * page will be visited at some later juncture (if we're in the + * process of verifying the entire index, for example). It seems + * wasteful that there will be considerable redundant buffer + * locking and page copying. Revisit this. + * + * In general, a lot of checking performed by amcheck will probably + * turn out to be superfluous when the code is tightened up in the + * future. + */ + nLow = first_item_right_page(rel, opaque->btpo_next, &rpage); + + if (nLow) + { + int32 eqs_nextpage; + + /* + * This is a bit different to what earlier iterations do. They + * use their item's scankey to compare to next item (so in + * this final iteration, we have just verified that second last + * item is less than or equal to this item -- we're now + * checking this item against items on next page). + * + * Since ScanKey comes from next page, and is compared to + * current offset, next-page-generated scankey should be + * greater than (or equal to) this current offset/item. 
+ */ + eqs_nextpage = _bt_compare(rel, relnatts, nLow, page, offset); + + if (eqs_nextpage < 0) + elog(CHCKELEVEL, "page order invariant violated for index \"%s\" block %u across block and next page", + RelationGetRelationName(rel), offset); + + pfree(nLow); + pfree(rpage); + } + } + + /* + * It is expected that current iteration's ScanKey is always less than + * the high key on this page, and always less than the next + * item/offset. + */ + if (eqs_hikey > 0 || eqs_item > 0) + { + char *hblk, *iblk, *niblk, *dump; + + hblk = psprintf("(%u,%u)", + BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)), + itup->t_tid.ip_posid); + + iblk = psprintf("(%u,%u)", + blockNum, + offset); + + niblk = psprintf("(%u,%u)", + blockNum, + offset + 1); + + dump = form_btree_data(itup); + + if (eqs_hikey > 0) + elog(CHCKELEVEL, "high key invariant violated for index \"%s\" %s with data %s (points to: %s)", + RelationGetRelationName(rel), iblk, dump, hblk); + if (eqs_item > 0) + elog(CHCKELEVEL, "page order invariant violated for index \"%s\" (%s and %s) with data %s (points to: %s)", + RelationGetRelationName(rel), iblk, niblk, dump, hblk); + } + + /* + * For internal pages, verify that child pages comport with the + * down-link in the parent page. This is safe against concurrent page + * splits because caller must have taken appropriate heavyweight locks + * for this case. If there is an old incomplete page split, detect + * that and handle it appropriately. + * + * This seems unlikely to be too wasteful in any particular scenario + * (other than the fact that it necessitates a heavy lmgr lock). After + * all, this only needs to occur with internal pages, that are almost + * always less than 5% of the total, and often less than 1%. + * + * However, we should think about doing this in a separate pass with + * that heavier lock, or perhaps coming up with a smarter buffer + * locking protocol that obviates the need for a hwlock. 
For now, it + * seems far preferable to not be overly clever with buffer locks. + */ + if (childCheck && !P_ISLEAF(opaque)) + { + Page childPage; + BlockNumber childBlockNum; + Buffer childBuffer; + ScanKey childHkey; + + childBlockNum = ItemPointerGetBlockNumber(&(itup->t_tid)); + + childBuffer = ReadBuffer(rel, childBlockNum); + LockBuffer(childBuffer, BT_READ); + + /* Copy this page into local storage too */ + childPage = palloc(BLCKSZ); + memcpy(childPage, BufferGetPage(childBuffer), BLCKSZ); + + UnlockReleaseBuffer(childBuffer); + + /* + * Verify child page that this down-link points to has the down-link + * key as a lower bound. + */ + childHkey = verify_btree_items_le(rel, childPage, skey, itup); + + if (childHkey && offset < maxoffset) + { + /* state for child page, and next data item/offset */ + BlockNumber childRightLinkBlockNum, + parentNextOffsetBlockNum; + IndexTuple nIndTup; + ItemId nItemId; + BTPageOpaque childOpaque; + int32 res; + + /* + * Since this isn't the last iteration, and there can't have + * been an incomplete page split on child page just inspected + * (since a child high key was returned), further check that + * next item on this page (the parent) is equal to child's high + * key. + */ + childOpaque = (BTPageOpaque) PageGetSpecialPointer(childPage); + Assert(!P_RIGHTMOST(childOpaque)); + Assert(!P_INCOMPLETE_SPLIT(childOpaque)); + + res = _bt_compare(rel, relnatts, childHkey, page, offset + 1); + + if (res != 0) + { + char *iblk, *piblk; + + /* + * Principally blame next item, not this item for problem, + * since it was the down-link that completed the split, + * which seems most plausible as a proximate cause. + */ + iblk = psprintf("(%u,%u)", + blockNum, + offset + 1); + piblk = psprintf("(%u,%u)", + blockNum, + offset); + + elog(CHCKELEVEL, "down-link in parent page in index \"%s\" %s is %s (not equal to) immediately prior item's %s child's (block %u) high key", + RelationGetRelationName(rel), iblk, + res < 0 ? 
"greater than":"less than", + piblk, childBlockNum); + } + + /* + * Match block number of child's right link to next parent + * offset item's block number, too. This is similar to but + * slightly distinct from the child high key matching + * verification. + */ + nItemId = PageGetItemId(page, offset + 1); + + if (!ItemIdIsValid(nItemId)) + elog(Max(CHCKELEVEL, ERROR), "invalid nItemId for next item"); + + childRightLinkBlockNum = childOpaque->btpo_next; + nIndTup = (IndexTuple) PageGetItem(page, nItemId); + parentNextOffsetBlockNum = + BlockIdGetBlockNumber(&(nIndTup ->t_tid.ip_blkid)); + + if (childRightLinkBlockNum != parentNextOffsetBlockNum) + elog(CHCKELEVEL, "next item/offset down-link (%u) in parent page does not match child (block %u) right link block number", + parentNextOffsetBlockNum, childBlockNum); + + pfree(childHkey); + } + pfree(childPage); + } + else if (childCheck && singlepage && P_ISLEAF(opaque)) + { + elog(CHCKELEVEL, "single leaf page was specified as target for checking against child"); + } + pfree(skey); + } + pfree(page); + } + + /* + * Lock the heap before the RangeVarGetRelidExtended takes the index lock, to + * avoid deadlocks. + * + * If there was ever a requirement to check permissions, this would be the + * place to do it. Currently, we just insist upon superuser. + */ + static void + rangevar_callback_for_childcheck(const RangeVar *relation, + Oid relId, Oid oldRelId, void *arg) + { + Oid *heapOid = (Oid *) arg; + + /* + * If we previously locked some other index's heap, and the name we're + * looking up no longer refers to that relation, release the now-useless + * lock. + */ + if (relId != oldRelId && OidIsValid(oldRelId)) + { + /* lock level here should match reindex_index() heap lock */ + UnlockRelationOid(*heapOid, ShareLock); + *heapOid = InvalidOid; + } + + /* If the relation does not exist, there's nothing more to do. */ + if (!OidIsValid(relId)) + return; + + /* Lock heap before index to avoid deadlock. 
*/ + if (relId != oldRelId) + { + /* + * Lock level here should match elsewhere. If the OID isn't valid, it + * means the index was concurrently dropped, which is not a problem for + * us; just return normally. + */ + *heapOid = IndexGetRelation(relId, true); + if (OidIsValid(*heapOid)) + LockRelationOid(*heapOid, ShareLock); + } + } + + /* + * Worker for bt_do_page_verify() + * + * Requires only that AccessShareLock has been acquired by caller on target + * btree index relation. + * + * Given an original non-rightmost page's right link block (that points to a + * page that may or may not itself be right-most), return a scanKey for the + * first real data item on the right page sufficient to check ordering + * invariant on last item in original page. Also, pass back right-link page, + * which caller manages memory of (since the ScanKey points into it). + * + * Handles concurrent page splits and deletions. + */ + static ScanKey + first_item_right_page(Relation rel, BlockNumber rightblockNum, Page *rpage) + { + Buffer buffer; + BTPageOpaque opaque; + ItemId firstDataId; + IndexTuple firstDataTup; + + /* copy the page into local storage */ + *rpage = palloc(BLCKSZ); + + for (;;) + { + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBuffer(rel, rightblockNum); + LockBuffer(buffer, BT_READ); + + memcpy(*rpage, BufferGetPage(buffer), BLCKSZ); + + UnlockReleaseBuffer(buffer); + + opaque = (BTPageOpaque) PageGetSpecialPointer(*rpage); + + /* Ordinarily, we expect to not ignore the right linked page */ + if (!P_IGNORE(opaque)) + break; + + /* + * Handle race -- find first non-ignore page until one is found, or at + * rightmost. + * + * Before freeing memory, check if this is rightmost (which means we're + * done, and couldn't generate a useful scankey for caller's target). 
+ */ + if (P_RIGHTMOST(opaque)) + goto abort; + + rightblockNum = opaque->btpo_next; + elog(NOTICE, "left-most page was found deleted or half dead"); + } + + firstDataId = PageGetItemId(*rpage, P_FIRSTDATAKEY(opaque)); + + if (!P_ISLEAF(opaque)) + { + /* + * Since this is an internal page, the first "real" data item is + * actually a garbage "minus infinity" item. Don't return garbage item + * on caller's target's right page (our target page), though. This + * isn't a real data key for caller's purposes. (This is separately + * avoided by caller, when it iterates through its target page). + */ + if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(*rpage)) + firstDataId = PageGetItemId(*rpage, P_FIRSTDATAKEY(opaque) + 1); + else + goto abort; + } + + if (!ItemIdIsValid(firstDataId)) + elog(Max(CHCKELEVEL, ERROR), "invalid hKeyItemId"); + + /* + * Return scankey, and leave it to caller to free all memory (Page storage + * local memory). + */ + firstDataTup = (IndexTuple) PageGetItem(*rpage, firstDataId); + + return _bt_mkscankey(rel, firstDataTup); + + abort: + pfree(*rpage); + rpage = NULL; + return NULL; + } + + /* + * Verify that each item on page is greater than or equal to the down-link key + * in the parent. Don't assume that the items in the child page are in logical + * order as generally expected -- exhaustively check each and every data item, + * and not just the first. + * + * XXX: Presently, the checking of every item here is usually somewhat + * redundant. Revisit the question of how necessary this is in light of how + * the code is actually called from a higher level. + * + * Returns ScanKey that is initialized with high key value, that can be checked + * within parent against next offset/down-link. However, if there was an + * incomplete page split (and therefore no down-link in parent can reasonably + * be expected yet) or this is the right-most page (and therefore does not have + * a high key), just return NULL. 
+ */ + static ScanKey + verify_btree_items_le(Relation rel, Page childPage, ScanKey downLinkParent, + IndexTuple pItup) + { + OffsetNumber offset, maxoffset; + BTPageOpaque opaque; + int16 relnatts = rel->rd_rel->relnatts; + + opaque = (BTPageOpaque) PageGetSpecialPointer(childPage); + maxoffset = PageGetMaxOffsetNumber(childPage); + + for (offset = P_FIRSTDATAKEY(opaque); offset <= maxoffset; offset++) + { + int32 res; + + /* + * No point in checking first data key on non-leaf page, since it's + * logically "minus infinity". All items on child page should be + * greater than or equal to supplied down-link ScanKey (so the first + * data item on a non-leaf page is only "minus infinity" in a way that + * relies on the very invariant now being verified -- it's not "minus + * infinity" in any general sense. It's "minus infinity" with respect + * to values less than the first "real" data item, but greater than or + * equal to the page's left-link's high key, iff there is a left link). + * + * Leaving aside discussion of invariant conditions, clearly there is + * no point in doing anything with the first data key on a non-leaf + * page because its data is undefined, as noted in comments above + * _bt_compare(). + */ + if (!P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque)) + continue; + + res = _bt_compare(rel, relnatts, downLinkParent, childPage, offset); + + /* + * Parent down-link is lower bound. Report when this invariant + * condition has been violated. 
+ */ + if (res > 0) + { + char *dump; + + dump = form_btree_data(pItup); + + elog(CHCKELEVEL, "down-link low key invariant violated for index \"%s\" with data %s (points to: %u)", + RelationGetRelationName(rel), dump, + BlockIdGetBlockNumber(&(pItup->t_tid.ip_blkid))); + } + } + + if (P_INCOMPLETE_SPLIT(opaque)) + { + elog(NOTICE, "incomplete split in child page (block %u) found when following down-link", + BlockIdGetBlockNumber(&(pItup->t_tid.ip_blkid))); + } + else if (!P_RIGHTMOST(opaque)) + { + /* + * Useful ScanKey for child page high key exists -- return it to caller + * for further checks on parent page + */ + ItemId hKeyItemId; + IndexTuple hKeyItup; + + hKeyItemId = PageGetItemId(childPage, P_HIKEY); + + if (!ItemIdIsValid(hKeyItemId)) + elog(Max(CHCKELEVEL, ERROR), "invalid hKeyItemId"); + + hKeyItup = (IndexTuple) PageGetItem(childPage, hKeyItemId); + + return _bt_mkscankey(rel, hKeyItup); + } + + return NULL; + } + + /* + * Given a left-most block at some level, move right, verifying that left/right + * sibling links match as pages are passed over. It is the caller's + * responsibility to acquire appropriate heavyweight locks on the index + * relation. + * + * Moves from left to right, so there is no need to worry about concurrent page + * splits, as after a page split the original/left page has the same left-link + * as before. (And, for what it's worth, the page just under consideration has + * the same right link as before, so logically nothing is skipped. Everything + * in the index moves right, and so is denied the opportunity to be skipped + * over). Page deletion receives no special consideration either, because the + * second stage of page deletion (where a half-dead leaf page is unlinked from + * its siblings) involves acquiring a lock on the left-sibling, on the deletion + * target itself, and on the right sibling, all concurrently and in that order. + * That's the only one of the two stages of deletion where side-links are + * updated. 
+ * + * Returns left-most block number one level lower that should be passed on next + * level/call, or P_NONE when done. Caller should call with the true root page + * initially, and work their way down to level 0 (leaf page level). + * + * The return value may become stale (from, say, concurrent page deletion). + * When passed back here, this is handled in a similar though less invasive + * manner to _bt_moveright(). It may prove necessary to "move right" one or + * more times if the left-most page passed here is deleted or half dead, by + * telling caller about a new left-most page on the same level. + */ + static BlockNumber + verify_leftright_level(Relation rel, BlockNumber leftMost) + { + Buffer buffer; + BTPageOpaque opaque; + Page page; + BlockNumber current, currentsLeft, next; + + buffer = ReadBuffer(rel, leftMost); + LockBuffer(buffer, BT_READ); + + /* as always, copy the page into local storage */ + page = palloc(BLCKSZ); + memcpy(page, BufferGetPage(buffer), BLCKSZ); + + UnlockReleaseBuffer(buffer); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_IGNORE(opaque)) + { + /* + * Handle race -- tell caller to try again with prior left-most's right + * sibling, in a manner not unlike _bt_moveright() + */ + if (P_RIGHTMOST(opaque)) + elog(Max(CHCKELEVEL, ERROR), "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + next = opaque->btpo_next; + pfree(page); + elog(NOTICE, "left-most page was found deleted or half dead"); + return next; + } + + if (!P_LEFTMOST(opaque)) + elog(CHCKELEVEL, "page %u of index \"%s\" is not leftmost", + leftMost, RelationGetRelationName(rel)); + + /* remember down-link block to return */ + if (P_ISLEAF(opaque)) + { + next = P_NONE; + } + else + { + IndexTuple itup; + ItemId itemId; + + itemId = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); + + if (!ItemIdIsValid(itemId)) + elog(Max(CHCKELEVEL, ERROR), "invalid itemId"); + + itup = (IndexTuple) PageGetItem(page, itemId); + + next = 
ItemPointerGetBlockNumber(&(itup->t_tid)); + } + + currentsLeft = leftMost; + current = opaque->btpo_next; + + while (!P_RIGHTMOST(opaque)) + { + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBuffer(rel, current); + LockBuffer(buffer, BT_READ); + + memcpy(page, BufferGetPage(buffer), BLCKSZ); + + UnlockReleaseBuffer(buffer); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISDELETED(opaque)) + elog(NOTICE, "page %u of index \"%s\" is deleted", + current, RelationGetRelationName(rel)); + + if (currentsLeft != opaque->btpo_prev) + { + if (!P_ISDELETED(opaque)) + elog(CHCKELEVEL, "left link/right link pair don't comport at level %u, block %u, last: %u, current left: %u", + opaque->btpo.level, current, currentsLeft, opaque->btpo_prev); + else /* level unavailable */ + elog(CHCKELEVEL, "left link/right link pair don't comport, deleted block %u, last: %u, current left: %u", + current, currentsLeft, opaque->btpo_prev); + } + + currentsLeft = current; + current = opaque->btpo_next; + } + + pfree(page); + + return next; + } + + /* + * Given an IndexTuple, form hex cstring of data for error reporting purposes + */ + static char * + form_btree_data(IndexTuple itup) + { + char *dump, *odump; + char *ptr; + int off; + int dlen; + + dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info); + ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info); + dump = palloc0(dlen * 3 + 1); + odump = dump; + + for (off = 0; off < dlen; off++) + { + if (off > 0) + *dump++ = ' '; + sprintf(dump, "%02x", *(ptr + off) & 0xff); + dump += 2; + } + + return odump; + } diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control new file mode 100644 index ...ab4b1a3 *** a/contrib/amcheck/amcheck.control --- b/contrib/amcheck/amcheck.control *************** *** 0 **** --- 1,5 ---- + # amcheck extension + comment = 'verify the integrity of indexes at a low level' + default_version = 
'1.0' + module_pathname = '$libdir/amcheck' + relocatable = true
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers