Heikki Linnakangas wrote:
Hi,Currently, an index split writes all the data on the split page to WAL. That's a lot of WAL traffic. The tuples that are copied to the right page need to be WAL logged, but the tuples that stay on the original page don't.Here's a patch to do that. It needs further testing, I have used the attached crude crashtest.sh to test the basics, but we need to test the more obscure cases like splitting non-leaf or root page.On a test case that inserts 10000 rows in increasing key order with a 100 characters wide text-field as key, the patch reduced the total generated WAL traffic from 45MB to 33MB, or ~ 25%. Your mileage may vary, depending on the tuple and key sizes, and the order of inserts.Anyone see a problem with this?
-- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com
crashtest.sh
Description: application/shellscript
Index: src/backend/access/nbtree/nbtinsert.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v retrieving revision 1.146 diff -c -r1.146 nbtinsert.c *** src/backend/access/nbtree/nbtinsert.c 11 Nov 2006 01:14:18 -0000 1.146 --- src/backend/access/nbtree/nbtinsert.c 7 Dec 2006 17:16:38 -0000 *************** *** 932,985 **** xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; ! XLogRecData rdata[4]; ! xlrec.target.node = rel->rd_node; ! ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off); if (newitemonleft) ! xlrec.otherblk = BufferGetBlockNumber(rbuf); else ! xlrec.otherblk = BufferGetBlockNumber(buf); ! xlrec.leftblk = lopaque->btpo_prev; ! xlrec.rightblk = ropaque->btpo_next; ! xlrec.level = lopaque->btpo.level; ! /* * Direct access to page is not good but faster - we should implement * some new func in page API. Note we only store the tuples * themselves, knowing that the item pointers are in the same order * and can be reconstructed by scanning the tuples. See comments for * _bt_restore_page(). */ ! xlrec.leftlen = ((PageHeader) leftpage)->pd_special - ! ((PageHeader) leftpage)->pd_upper; ! rdata[0].data = (char *) &xlrec; ! rdata[0].len = SizeOfBtreeSplit; ! rdata[0].buffer = InvalidBuffer; ! rdata[0].next = &(rdata[1]); ! ! rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper; ! rdata[1].len = xlrec.leftlen; ! rdata[1].buffer = InvalidBuffer; ! rdata[1].next = &(rdata[2]); ! ! rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper; ! rdata[2].len = ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper; ! rdata[2].buffer = InvalidBuffer; ! rdata[2].next = NULL; if (!P_RIGHTMOST(ropaque)) { ! rdata[2].next = &(rdata[3]); ! rdata[3].data = NULL; ! rdata[3].len = 0; ! rdata[3].buffer = sbuf; ! rdata[3].buffer_std = true; ! rdata[3].next = NULL; } if (P_ISROOT(oopaque)) xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; else --- 932,1026 ---- xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; ! XLogRecData rdata[7]; ! XLogRecData *lastrdata; ! xlrec.node = rel->rd_node; ! xlrec.leftsib = BufferGetBlockNumber(buf); ! xlrec.rightsib = BufferGetBlockNumber(rbuf); ! xlrec.firstright = firstright; ! xlrec.rnext = ropaque->btpo_next; ! xlrec.level = lopaque->btpo.level; ! ! rdata[0].data = (char *) &xlrec; ! rdata[0].len = SizeOfBtreeSplit; ! rdata[0].buffer = InvalidBuffer; ! ! lastrdata = &rdata[0]; ! ! /* Log downlink on non-leaf pages. */ ! ! if (lopaque->btpo.level > 0) ! { ! lastrdata->next = lastrdata + 1; ! lastrdata++; ! ! lastrdata->data = (char *) &newitem->t_tid.ip_blkid; ! lastrdata->len = sizeof(BlockIdData); ! lastrdata->buffer = InvalidBuffer; ! } ! ! ! /* Log the new item, if it was inserted on the left page. If it was ! * put on the right page, we don't need to explicitly WAL log it ! * because it's included with all the other on the right page. ! */ ! lastrdata->next = lastrdata + 1; ! lastrdata++; if (newitemonleft) ! { ! lastrdata->data = (char *) &newitemoff; ! lastrdata->len = sizeof(OffsetNumber); ! lastrdata->buffer = buf; ! lastrdata->buffer_std = true; ! ! lastrdata->next = lastrdata + 1; ! lastrdata++; ! lastrdata->data = (char *)newitem; ! lastrdata->len = newitemsz; ! lastrdata->buffer = buf; ! lastrdata->buffer_std = true; ! } else ! { ! lastrdata->data = NULL; ! lastrdata->len = 0; ! lastrdata->buffer = buf; ! lastrdata->buffer_std = true; ! } ! /* Log the contents of the right page in the format understood by ! * _bt_restore_page(). We don't set the buffer-field, because we're ! * going to recreate the whole page anyway. ! * * Direct access to page is not good but faster - we should implement * some new func in page API. Note we only store the tuples * themselves, knowing that the item pointers are in the same order * and can be reconstructed by scanning the tuples. See comments for * _bt_restore_page(). */ ! lastrdata->next = lastrdata + 1; ! lastrdata++; ! lastrdata->data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper; ! lastrdata->len = ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper; ! lastrdata->buffer = InvalidBuffer; + /* Log the right sibling, because we've changed it's prev-pointer. */ if (!P_RIGHTMOST(ropaque)) { ! lastrdata->next = lastrdata + 1; ! lastrdata++; ! ! lastrdata->data = NULL; ! lastrdata->len = 0; ! lastrdata->buffer = sbuf; ! lastrdata->buffer_std = true; } + lastrdata->next = NULL; + if (P_ISROOT(oopaque)) xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; else Index: src/backend/access/nbtree/nbtxlog.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtxlog.c,v retrieving revision 1.39 diff -c -r1.39 nbtxlog.c *** src/backend/access/nbtree/nbtxlog.c 1 Nov 2006 19:43:17 -0000 1.39 --- src/backend/access/nbtree/nbtxlog.c 7 Dec 2006 18:58:48 -0000 *************** *** 264,385 **** { xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); Relation reln; ! BlockNumber targetblk; ! OffsetNumber targetoff; ! BlockNumber leftsib; ! BlockNumber rightsib; ! BlockNumber downlink = 0; ! Buffer buffer; ! Page page; ! BTPageOpaque pageop; ! reln = XLogOpenRelation(xlrec->target.node); ! targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid)); ! targetoff = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); ! leftsib = (onleft) ? targetblk : xlrec->otherblk; ! rightsib = (onleft) ? xlrec->otherblk : targetblk; ! /* Left (original) sibling */ ! buffer = XLogReadBuffer(reln, leftsib, true); ! Assert(BufferIsValid(buffer)); ! page = (Page) BufferGetPage(buffer); ! _bt_pageinit(page, BufferGetPageSize(buffer)); ! pageop = (BTPageOpaque) PageGetSpecialPointer(page); ! pageop->btpo_prev = xlrec->leftblk; ! pageop->btpo_next = rightsib; ! pageop->btpo.level = xlrec->level; ! pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; ! pageop->btpo_cycleid = 0; - _bt_restore_page(page, - (char *) xlrec + SizeOfBtreeSplit, - xlrec->leftlen); ! if (onleft && xlrec->level > 0) { ! IndexTuple itup; ! /* extract downlink in the target tuple */ ! itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff)); ! downlink = ItemPointerGetBlockNumber(&(itup->t_tid)); ! Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); } ! PageSetLSN(page, lsn); ! PageSetTLI(page, ThisTimeLineID); ! MarkBufferDirty(buffer); ! UnlockReleaseBuffer(buffer); ! /* Right (new) sibling */ ! buffer = XLogReadBuffer(reln, rightsib, true); ! Assert(BufferIsValid(buffer)); ! page = (Page) BufferGetPage(buffer); ! _bt_pageinit(page, BufferGetPageSize(buffer)); ! pageop = (BTPageOpaque) PageGetSpecialPointer(page); ! pageop->btpo_prev = leftsib; ! pageop->btpo_next = xlrec->rightblk; ! pageop->btpo.level = xlrec->level; ! pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; ! pageop->btpo_cycleid = 0; - _bt_restore_page(page, - (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen, - record->xl_len - SizeOfBtreeSplit - xlrec->leftlen); ! if (!onleft && xlrec->level > 0) { ! IndexTuple itup; - /* extract downlink in the target tuple */ - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff)); - downlink = ItemPointerGetBlockNumber(&(itup->t_tid)); - Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); } ! PageSetLSN(page, lsn); ! PageSetTLI(page, ThisTimeLineID); ! MarkBufferDirty(buffer); ! UnlockReleaseBuffer(buffer); ! /* Fix left-link of right (next) page */ ! if (!(record->xl_info & XLR_BKP_BLOCK_1)) { ! if (xlrec->rightblk != P_NONE) { ! buffer = XLogReadBuffer(reln, xlrec->rightblk, false); if (BufferIsValid(buffer)) { ! page = (Page) BufferGetPage(buffer); ! if (XLByteLE(lsn, PageGetLSN(page))) { ! UnlockReleaseBuffer(buffer); ! } ! else ! { ! pageop = (BTPageOpaque) PageGetSpecialPointer(page); ! pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); } } } } - /* Forget any split this insertion completes */ - if (xlrec->level > 0) - forget_matching_split(xlrec->target.node, downlink, false); - /* The job ain't done till the parent link is inserted... */ ! log_incomplete_split(xlrec->target.node, ! leftsib, rightsib, isroot); } static void --- 264,426 ---- { xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); Relation reln; ! Buffer lbuf, rbuf; ! Page lpage, rpage; ! BTPageOpaque ropaque, lopaque; ! char *datapos; ! int datalen; ! bool bkp_left = record->xl_info & XLR_BKP_BLOCK_1; ! bool bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_3; ! OffsetNumber newitemoff; ! IndexTuple newitem = NULL; ! Size newitemsz = 0; ! reln = XLogOpenRelation(xlrec->node); ! datapos = (char *) xlrec + SizeOfBtreeSplit; ! datalen = record->xl_len - SizeOfBtreeSplit; ! /* Forget any split this insertion completes */ ! if (xlrec->level > 0) ! { ! BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos); ! datapos += sizeof(BlockIdData); ! datalen -= sizeof(BlockIdData); ! ! forget_matching_split(xlrec->node, downlink, false); ! } ! /* Extract newitem and newitemoff */ ! ! if (!bkp_left && onleft) { ! /* Extract the offset of the new tuple and it's contents */ ! memcpy(&newitemoff, datapos, sizeof(OffsetNumber)); ! datapos += sizeof(OffsetNumber); ! datalen -= sizeof(OffsetNumber); ! newitem = (IndexTuple) datapos; ! newitemsz = IndexTupleDSize(*newitem); /* XXX: alignment? */ ! datapos += newitemsz; ! datalen -= newitemsz; } ! /* Reconstruct right (new) sibling */ ! rbuf = XLogReadBuffer(reln, xlrec->rightsib, true); ! Assert(BufferIsValid(rbuf)); ! rpage = (Page) BufferGetPage(rbuf); ! _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ! ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ! ropaque->btpo_prev = xlrec->leftsib; ! ropaque->btpo_next = xlrec->rnext; ! ropaque->btpo.level = xlrec->level; ! ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; ! ropaque->btpo_cycleid = 0; ! _bt_restore_page(rpage, datapos, datalen); ! ! PageSetLSN(rpage, lsn); ! PageSetTLI(rpage, ThisTimeLineID); ! MarkBufferDirty(rbuf); ! ! /* don't release the buffer yet, because reconstructing the left sibling ! * needs to access the data on the right page ! */ ! /* Reconstruct left (original) sibling */ ! ! if(!bkp_left) { ! lbuf = XLogReadBuffer(reln, xlrec->leftsib, false); ! ! if (BufferIsValid(lbuf)) ! { ! lpage = (Page) BufferGetPage(lbuf); ! lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); ! ! if (!XLByteLE(lsn, PageGetLSN(lpage))) ! { ! /* Remove the items from the left page that were copied to ! * right page, and add the new item if it was inserted to ! * left page. ! */ ! OffsetNumber off; ! OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage); ! ItemId hiItemId; ! Item hiItem; ! ! for(off = maxoff ; off >= xlrec->firstright; off--) ! PageIndexTupleDelete(lpage, off); ! ! if (onleft) ! { ! if (PageAddItem(lpage, (Item) newitem, newitemsz, ! newitemoff, LP_USED) == InvalidOffsetNumber) ! elog(PANIC, "can't add new item to left sibling after split"); ! } ! /* Set high key equal to the first key on the right page */ ! hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); ! hiItem = PageGetItem(rpage, hiItemId); ! ! if(!P_RIGHTMOST(lopaque)) ! { ! /* but remove the old high key first */ ! PageIndexTupleDelete(lpage, P_HIKEY); ! } ! ! if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId), ! P_HIKEY, LP_USED) == InvalidOffsetNumber) ! elog(PANIC, "can't add high key after split to left page"); ! ! /* Fix opaque fields */ ! lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; ! lopaque->btpo_next = xlrec->rightsib; ! lopaque->btpo_cycleid = 0; ! ! PageSetLSN(lpage, lsn); ! PageSetTLI(lpage, ThisTimeLineID); ! MarkBufferDirty(lbuf); ! } ! ! UnlockReleaseBuffer(lbuf); ! } } ! UnlockReleaseBuffer(rbuf); ! /* Fix left-link of the page to the right of the new right sibling */ ! if (!bkp_nextsib) { ! if (xlrec->rnext != P_NONE) { ! Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false); if (BufferIsValid(buffer)) { ! Page page = (Page) BufferGetPage(buffer); ! if (!XLByteLE(lsn, PageGetLSN(page))) { ! BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); ! pageop->btpo_prev = xlrec->rightsib; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } + UnlockReleaseBuffer(buffer); } } } /* The job ain't done till the parent link is inserted... */ ! log_incomplete_split(xlrec->node, ! xlrec->leftsib, xlrec->rightsib, isroot); } static void *************** *** 727,766 **** { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_l: "); ! out_target(buf, &(xlrec->target)); ! appendStringInfo(buf, "; oth %u; rgh %u", ! xlrec->otherblk, xlrec->rightblk); break; } case XLOG_BTREE_SPLIT_R: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_r: "); ! out_target(buf, &(xlrec->target)); ! appendStringInfo(buf, "; oth %u; rgh %u", ! xlrec->otherblk, xlrec->rightblk); break; } case XLOG_BTREE_SPLIT_L_ROOT: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_l_root: "); ! out_target(buf, &(xlrec->target)); ! appendStringInfo(buf, "; oth %u; rgh %u", ! xlrec->otherblk, xlrec->rightblk); break; } case XLOG_BTREE_SPLIT_R_ROOT: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_r_root: "); ! out_target(buf, &(xlrec->target)); ! appendStringInfo(buf, "; oth %u; rgh %u", ! xlrec->otherblk, xlrec->rightblk); break; } case XLOG_BTREE_DELETE: --- 768,815 ---- { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_l: rel %u/%u/%u ", ! xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode); ! appendStringInfo(buf, "left %u, right %u off %u level %u", ! xlrec->leftsib, xlrec->rightsib, ! xlrec->firstright, xlrec->level); break; } case XLOG_BTREE_SPLIT_R: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_r: rel %u/%u/%u ", ! xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode); ! appendStringInfo(buf, "left %u, right %u off %u level %u", ! xlrec->leftsib, xlrec->rightsib, ! xlrec->firstright, xlrec->level); break; } case XLOG_BTREE_SPLIT_L_ROOT: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_l_root: rel %u/%u/%u ", ! xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode); ! appendStringInfo(buf, "left %u, right %u off %u level %u", ! xlrec->leftsib, xlrec->rightsib, ! xlrec->firstright, xlrec->level); break; } case XLOG_BTREE_SPLIT_R_ROOT: { xl_btree_split *xlrec = (xl_btree_split *) rec; ! appendStringInfo(buf, "split_r_root: rel %u/%u/%u ", ! xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode); ! appendStringInfo(buf, "left %u, right %u off %u level %u", ! xlrec->leftsib, xlrec->rightsib, ! xlrec->firstright, xlrec->level); break; } case XLOG_BTREE_DELETE: Index: src/include/access/nbtree.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v retrieving revision 1.106 diff -c -r1.106 nbtree.h *** src/include/access/nbtree.h 1 Nov 2006 19:43:17 -0000 1.106 --- src/include/access/nbtree.h 7 Dec 2006 17:16:38 -0000 *************** *** 254,260 **** * * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. * The _L and _R variants indicate whether the inserted tuple went into the ! * left or right split page (and thus, whether otherblk is the right or left * page of the split pair). The _ROOT variants indicate that we are splitting * the root page, and thus that a newroot record rather than an insert or * split record should follow. Note that a split record never carries a --- 254,261 ---- * * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. * The _L and _R variants indicate whether the inserted tuple went into the ! * left or right split page (and thus, whether newitemoff and the new item ! * are stored or not. * page of the split pair). The _ROOT variants indicate that we are splitting * the root page, and thus that a newroot record rather than an insert or * split record should follow. Note that a split record never carries a *************** *** 262,278 **** */ typedef struct xl_btree_split { ! xl_btreetid target; /* inserted tuple id */ ! BlockNumber otherblk; /* second block participated in split: */ ! /* first one is stored in target' tid */ ! BlockNumber leftblk; /* prev/left block */ ! BlockNumber rightblk; /* next/right block */ ! uint32 level; /* tree level of page being split */ ! uint16 leftlen; /* len of left page items below */ ! /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */ } xl_btree_split; ! #define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) /* * This is what we need to know about delete of individual leaf index tuples. --- 263,283 ---- */ typedef struct xl_btree_split { ! RelFileNode node; ! BlockNumber leftsib; /* orig page / new left page */ ! BlockNumber rightsib; /* new right page */ ! OffsetNumber firstright; /* first item stored on right page */ ! BlockNumber rnext; /* next/right block pointer */ ! uint32 level; /* tree level of page being split */ ! ! /* BlockIdData downlink follows if level > 0 */ ! ! /* OffsetNumber newitemoff follows in the _L variants. */ ! /* New item follows in the _L variants */ ! /* RIGHT PAGES TUPLES FOLLOW AT THE END */ } xl_btree_split; ! #define SizeOfBtreeSplit (offsetof(xl_btree_split, level) + sizeof(uint32)) /* * This is what we need to know about delete of individual leaf index tuples.
---------------------------(end of broadcast)--------------------------- TIP 1: if posting/reading through Usenet, please send an appropriate subscribe-nomail command to [EMAIL PROTECTED] so that your message can get through to the mailing list cleanly