Hi,Currently, an index split writes all the data on the split page to WAL. That's a lot of WAL traffic. The tuples that are copied to the right page need to be WAL logged, but the tuples that stay on the original page don't.
Here's a patch to do that. It needs further testing, I have used the attached crude crashtest.sh to test the basics, but we need to test the more obscure cases like splitting non-leaf or root page.
On a test case that inserts 10000 rows in increasing key order with a 100 characters wide text-field as key, the patch reduced the total generated WAL traffic from 45MB to 33MB, or ~ 25%. Your mileage may vary, depending on the tuple and key sizes, and the order of inserts.
Anyone see a problem with this? -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com
Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.146
diff -c -r1.146 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c 11 Nov 2006 01:14:18 -0000 1.146
--- src/backend/access/nbtree/nbtinsert.c 4 Dec 2006 12:31:13 -0000
***************
*** 932,985 ****
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
! XLogRecData rdata[4];
! xlrec.target.node = rel->rd_node;
! ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
if (newitemonleft)
! xlrec.otherblk = BufferGetBlockNumber(rbuf);
else
! xlrec.otherblk = BufferGetBlockNumber(buf);
! xlrec.leftblk = lopaque->btpo_prev;
! xlrec.rightblk = ropaque->btpo_next;
! xlrec.level = lopaque->btpo.level;
! /*
* Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples
* themselves, knowing that the item pointers are in the same order
* and can be reconstructed by scanning the tuples. See comments for
* _bt_restore_page().
*/
! xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
! ((PageHeader) leftpage)->pd_upper;
! rdata[0].data = (char *) &xlrec;
! rdata[0].len = SizeOfBtreeSplit;
! rdata[0].buffer = InvalidBuffer;
! rdata[0].next = &(rdata[1]);
!
! rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
! rdata[1].len = xlrec.leftlen;
! rdata[1].buffer = InvalidBuffer;
! rdata[1].next = &(rdata[2]);
!
! rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
! rdata[2].len = ((PageHeader) rightpage)->pd_special -
((PageHeader) rightpage)->pd_upper;
! rdata[2].buffer = InvalidBuffer;
! rdata[2].next = NULL;
if (!P_RIGHTMOST(ropaque))
{
! rdata[2].next = &(rdata[3]);
! rdata[3].data = NULL;
! rdata[3].len = 0;
! rdata[3].buffer = sbuf;
! rdata[3].buffer_std = true;
! rdata[3].next = NULL;
}
if (P_ISROOT(oopaque))
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
else
--- 932,1026 ----
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
! XLogRecData rdata[7];
! XLogRecData *lastrdata;
! xlrec.node = rel->rd_node;
! xlrec.leftsib = BufferGetBlockNumber(buf);
! xlrec.rightsib = BufferGetBlockNumber(rbuf);
! xlrec.firstright = firstright;
! xlrec.rnext = ropaque->btpo_next;
! xlrec.level = lopaque->btpo.level;
!
! rdata[0].data = (char *) &xlrec;
! rdata[0].len = SizeOfBtreeSplit;
! rdata[0].buffer = InvalidBuffer;
!
! lastrdata = &rdata[0];
!
! /* Log downlink on non-leaf pages. */
!
! if (lopaque->btpo.level > 0)
! {
! lastrdata->next = lastrdata + 1;
! lastrdata++;
!
! lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
! lastrdata->len = sizeof(BlockIdData);
! lastrdata->buffer = InvalidBuffer;
! }
!
!
! /* Log the new item, if it was inserted on the left page. If it was
! * put on the right page, we don't need to explicitly WAL log it
! * because it's included with all the other on the right page.
! */
! lastrdata->next = lastrdata + 1;
! lastrdata++;
if (newitemonleft)
! {
! lastrdata->data = (char *) &newitemoff;
! lastrdata->len = sizeof(OffsetNumber);
! lastrdata->buffer = buf;
! lastrdata->buffer_std = true;
!
! lastrdata->next = lastrdata + 1;
! lastrdata++;
! lastrdata->data = (char *)newitem;
! lastrdata->len = newitemsz;
! lastrdata->buffer = buf;
! lastrdata->buffer_std = true;
! }
else
! {
! lastrdata->data = NULL;
! lastrdata->len = 0;
! lastrdata->buffer = buf;
! lastrdata->buffer_std = true;
! }
! /* Log the contents of the right page in the format understood by
! * _bt_restore_page(). We don't set the buffer-field, because we're
! * going to recreate the whole page anyway.
! *
* Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples
* themselves, knowing that the item pointers are in the same order
* and can be reconstructed by scanning the tuples. See comments for
* _bt_restore_page().
*/
! lastrdata->next = lastrdata + 1;
! lastrdata++;
! lastrdata->data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
! lastrdata->len = ((PageHeader) rightpage)->pd_special -
((PageHeader) rightpage)->pd_upper;
! lastrdata->buffer = InvalidBuffer;
+ /* Log the right sibling, because we've changed it's prev-pointer. */
if (!P_RIGHTMOST(ropaque))
{
! lastrdata->next = lastrdata + 1;
! lastrdata++;
!
! lastrdata->data = NULL;
! lastrdata->len = 0;
! lastrdata->buffer = sbuf;
! lastrdata->buffer_std = true;
}
+ lastrdata->next = NULL;
+
if (P_ISROOT(oopaque))
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
else
Index: src/backend/access/nbtree/nbtxlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtxlog.c,v
retrieving revision 1.39
diff -c -r1.39 nbtxlog.c
*** src/backend/access/nbtree/nbtxlog.c 1 Nov 2006 19:43:17 -0000 1.39
--- src/backend/access/nbtree/nbtxlog.c 4 Dec 2006 14:13:39 -0000
***************
*** 264,385 ****
{
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
Relation reln;
- BlockNumber targetblk;
- OffsetNumber targetoff;
- BlockNumber leftsib;
- BlockNumber rightsib;
BlockNumber downlink = 0;
! Buffer buffer;
! Page page;
! BTPageOpaque pageop;
! reln = XLogOpenRelation(xlrec->target.node);
! targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid));
! targetoff = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
! leftsib = (onleft) ? targetblk : xlrec->otherblk;
! rightsib = (onleft) ? xlrec->otherblk : targetblk;
! /* Left (original) sibling */
! buffer = XLogReadBuffer(reln, leftsib, true);
! Assert(BufferIsValid(buffer));
! page = (Page) BufferGetPage(buffer);
! _bt_pageinit(page, BufferGetPageSize(buffer));
! pageop = (BTPageOpaque) PageGetSpecialPointer(page);
- pageop->btpo_prev = xlrec->leftblk;
- pageop->btpo_next = rightsib;
- pageop->btpo.level = xlrec->level;
- pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
- pageop->btpo_cycleid = 0;
! _bt_restore_page(page,
! (char *) xlrec + SizeOfBtreeSplit,
! xlrec->leftlen);
! if (onleft && xlrec->level > 0)
{
! IndexTuple itup;
! /* extract downlink in the target tuple */
! itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff));
! downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
! Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
}
! PageSetLSN(page, lsn);
! PageSetTLI(page, ThisTimeLineID);
! MarkBufferDirty(buffer);
! UnlockReleaseBuffer(buffer);
! /* Right (new) sibling */
! buffer = XLogReadBuffer(reln, rightsib, true);
! Assert(BufferIsValid(buffer));
! page = (Page) BufferGetPage(buffer);
! _bt_pageinit(page, BufferGetPageSize(buffer));
! pageop = (BTPageOpaque) PageGetSpecialPointer(page);
- pageop->btpo_prev = leftsib;
- pageop->btpo_next = xlrec->rightblk;
- pageop->btpo.level = xlrec->level;
- pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
- pageop->btpo_cycleid = 0;
! _bt_restore_page(page,
! (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
! record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
! if (!onleft && xlrec->level > 0)
{
! IndexTuple itup;
- /* extract downlink in the target tuple */
- itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff));
- downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
- Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
}
! PageSetLSN(page, lsn);
! PageSetTLI(page, ThisTimeLineID);
! MarkBufferDirty(buffer);
! UnlockReleaseBuffer(buffer);
/* Fix left-link of right (next) page */
! if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
! if (xlrec->rightblk != P_NONE)
{
! buffer = XLogReadBuffer(reln, xlrec->rightblk, false);
if (BufferIsValid(buffer))
{
! page = (Page) BufferGetPage(buffer);
! if (XLByteLE(lsn, PageGetLSN(page)))
! {
! UnlockReleaseBuffer(buffer);
! }
! else
{
! pageop = (BTPageOpaque) PageGetSpecialPointer(page);
! pageop->btpo_prev = rightsib;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
- UnlockReleaseBuffer(buffer);
}
}
}
}
/* Forget any split this insertion completes */
if (xlrec->level > 0)
! forget_matching_split(xlrec->target.node, downlink, false);
/* The job ain't done till the parent link is inserted... */
! log_incomplete_split(xlrec->target.node,
! leftsib, rightsib, isroot);
}
static void
--- 264,429 ----
{
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
Relation reln;
BlockNumber downlink = 0;
! Buffer lbuf, rbuf;
! Page lpage, rpage;
! BTPageOpaque ropaque, lopaque;
! char *datapos;
! int datalen;
! bool bkp_left = record->xl_info & XLR_BKP_BLOCK_1;
! bool bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_3;
! OffsetNumber newitemoff;
! IndexTuple newitem = NULL;
! Size newitemsz = 0;
! reln = XLogOpenRelation(xlrec->node);
! datapos = (char *) xlrec + SizeOfBtreeSplit;
! datalen = record->xl_len - SizeOfBtreeSplit;
! /* Extract downlink */
! if (xlrec->level > 0)
! {
! downlink = BlockIdGetBlockNumber((BlockId) datapos);
!
! datapos += sizeof(BlockIdData);
! datalen -= sizeof(BlockIdData);
! }
! /* Extract newitem and newitemoff */
! if (!bkp_left && onleft)
{
! /* Extract the offset of the new tuple and it's contents */
! memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
! datapos += sizeof(OffsetNumber);
! datalen -= sizeof(OffsetNumber);
! newitem = (IndexTuple) datapos;
! newitemsz = IndexTupleDSize(*newitem); /* XXX: alignment? */
! datapos += newitemsz;
! datalen -= newitemsz;
}
! /* Reconstruct right (new) sibling */
! rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
! Assert(BufferIsValid(rbuf));
! rpage = (Page) BufferGetPage(rbuf);
! _bt_pageinit(rpage, BufferGetPageSize(rbuf));
! ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
! ropaque->btpo_prev = xlrec->leftsib;
! ropaque->btpo_next = xlrec->rnext;
! ropaque->btpo.level = xlrec->level;
! ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
! ropaque->btpo_cycleid = 0;
!
! _bt_restore_page(rpage, datapos, datalen);
!
! PageSetLSN(rpage, lsn);
! PageSetTLI(rpage, ThisTimeLineID);
! MarkBufferDirty(rbuf);
!
! /* don't release the buffer yet, because reconstructing the left sibling
! * needs to access the data on the right page
! */
! /* Reconstruct left (original) sibling */
! if(!bkp_left)
{
! lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
!
! if (BufferIsValid(lbuf))
! {
! lpage = (Page) BufferGetPage(lbuf);
! lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
!
! if (!XLByteLE(lsn, PageGetLSN(lpage)))
! {
! /* Remove the items from the left page that were copied to
! * right page, and add the new item if it was inserted to
! * left page.
! */
! OffsetNumber off;
! OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
!
! for(off = maxoff ; off >= xlrec->firstright; off--)
! PageIndexTupleDelete(lpage, off);
!
! if (onleft)
! {
! if (PageAddItem(lpage, (Item) newitem, newitemsz,
! newitemoff, LP_USED) == InvalidOffsetNumber)
! elog(PANIC, "can't add new item to left sibling after split");
! }
! PageSetLSN(lpage, lsn);
! PageSetTLI(lpage, ThisTimeLineID);
! MarkBufferDirty(lbuf);
!
!
! /* Set high key equal to the first key on the next page */
! {
! ItemId hiItemId = PageGetItemId(rpage, FirstOffsetNumber);
! Item hiItem = PageGetItem(rpage, hiItemId);
!
! if(!P_RIGHTMOST(lopaque))
! {
! /* But remove the old high key first */
! PageIndexTupleDelete(lpage, P_HIKEY);
! }
!
! if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
! P_HIKEY, LP_USED) == InvalidOffsetNumber)
! elog(PANIC, "can't add high key after split to left page");
! }
!
! lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
! lopaque->btpo_next = xlrec->rightsib;
! lopaque->btpo_cycleid = 0;
! }
!
! UnlockReleaseBuffer(lbuf);
! }
}
! UnlockReleaseBuffer(rbuf);
/* Fix left-link of right (next) page */
! if (!bkp_nextsib)
{
! if (xlrec->rnext != P_NONE)
{
! Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false);
if (BufferIsValid(buffer))
{
! Page page = (Page) BufferGetPage(buffer);
! if (!XLByteLE(lsn, PageGetLSN(page)))
{
! BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
! pageop->btpo_prev = xlrec->rightsib;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
+ UnlockReleaseBuffer(buffer);
}
}
}
/* Forget any split this insertion completes */
if (xlrec->level > 0)
! forget_matching_split(xlrec->node, downlink, false);
/* The job ain't done till the parent link is inserted... */
! log_incomplete_split(xlrec->node,
! xlrec->leftsib, xlrec->rightsib, isroot);
}
static void
***************
*** 723,728 ****
--- 767,773 ----
out_target(buf, &(xlrec->target));
break;
}
+ #ifdef NOT_USED
case XLOG_BTREE_SPLIT_L:
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
***************
*** 763,768 ****
--- 808,814 ----
xlrec->otherblk, xlrec->rightblk);
break;
}
+ #endif
case XLOG_BTREE_DELETE:
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
Index: src/include/access/nbtree.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v
retrieving revision 1.106
diff -c -r1.106 nbtree.h
*** src/include/access/nbtree.h 1 Nov 2006 19:43:17 -0000 1.106
--- src/include/access/nbtree.h 4 Dec 2006 12:16:31 -0000
***************
*** 254,260 ****
*
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted tuple went into the
! * left or right split page (and thus, whether otherblk is the right or left
* page of the split pair). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or
* split record should follow. Note that a split record never carries a
--- 254,261 ----
*
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted tuple went into the
! * left or right split page (and thus, whether newitemoff and the new item
! * are stored or not.
* page of the split pair). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or
* split record should follow. Note that a split record never carries a
***************
*** 262,278 ****
*/
typedef struct xl_btree_split
{
! xl_btreetid target; /* inserted tuple id */
! BlockNumber otherblk; /* second block participated in split: */
! /* first one is stored in target' tid */
! BlockNumber leftblk; /* prev/left block */
! BlockNumber rightblk; /* next/right block */
! uint32 level; /* tree level of page being split */
! uint16 leftlen; /* len of left page items below */
! /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
} xl_btree_split;
! #define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
/*
* This is what we need to know about delete of individual leaf index tuples.
--- 263,283 ----
*/
typedef struct xl_btree_split
{
! RelFileNode node;
! BlockNumber leftsib; /* orig page / new left page */
! BlockNumber rightsib; /* new right page */
! OffsetNumber firstright; /* first item stored on right page */
! BlockNumber rnext; /* next/right block pointer */
! uint32 level; /* tree level of page being split */
!
! /* BlockIdData downlink follows if level > 0 */
!
! /* OffsetNumber newitemoff follows in the _L variants. */
! /* New item follows in the _L variants */
! /* RIGHT PAGES TUPLES FOLLOW AT THE END */
} xl_btree_split;
! #define SizeOfBtreeSplit (offsetof(xl_btree_split, level) + sizeof(uint32))
/*
* This is what we need to know about delete of individual leaf index tuples.
crashtest.sh
Description: application/shellscript
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at
http://www.postgresql.org/about/donate
