We can make markpos fast, if we make the copy lazily in _bt_steppage, see attached patch.
I did some micro-benchmarking of merge join performance, see attached test. Test results, on my laptop:
8_1_STABLE: 1.77 s HEAD, with patch: 1.65 s HEAD, without patch: 2.46 s The results are pretty stable, within 0.1 s. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com
mergejointest.sh
Description: application/shellscript
Index: src/backend/access/nbtree/nbtree.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtree.c,v retrieving revision 1.149 diff -c -r1.149 nbtree.c *** src/backend/access/nbtree/nbtree.c 10 May 2006 23:18:39 -0000 1.149 --- src/backend/access/nbtree/nbtree.c 23 Aug 2006 11:54:09 -0000 *************** *** 368,373 **** --- 368,374 ---- { so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); so->currPos.buf = so->markPos.buf = InvalidBuffer; + so->markItemIndex = -1; if (scan->numberOfKeys > 0) so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); else *************** *** 392,397 **** --- 393,399 ---- ReleaseBuffer(so->markPos.buf); so->markPos.buf = InvalidBuffer; } + so->markItemIndex = -1; /* * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. *************** *** 430,435 **** --- 432,438 ---- ReleaseBuffer(so->markPos.buf); so->markPos.buf = InvalidBuffer; } + so->markItemIndex = -1; if (so->killedItems != NULL) pfree(so->killedItems); *************** *** 456,469 **** so->markPos.buf = InvalidBuffer; } ! /* bump pin on current buffer for assignment to mark buffer */ if (BTScanPosIsValid(so->currPos)) { ! IncrBufferRefCount(so->currPos.buf); ! memcpy(&so->markPos, &so->currPos, ! offsetof(BTScanPosData, items[1]) + ! so->currPos.lastItem * sizeof(BTScanPosItem)); ! } PG_RETURN_VOID(); } --- 459,472 ---- so->markPos.buf = InvalidBuffer; } ! /* Record the current itemIndex we're on. If we later step to next page ! * before releasing the marked position, _bt_steppage makes a full copy ! * of the currPos-struct. */ if (BTScanPosIsValid(so->currPos)) { ! so->markItemIndex = so->currPos.itemIndex; ! } else ! so->markItemIndex = -1; PG_RETURN_VOID(); } *************** *** 477,500 **** IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); BTScanOpaque so = (BTScanOpaque) scan->opaque; ! /* we aren't holding any read locks, but gotta drop the pin */ ! if (BTScanPosIsValid(so->currPos)) { ! /* Before leaving current page, deal with any killed items */ ! if (so->numKilled > 0 && ! so->currPos.buf != so->markPos.buf) ! _bt_killitems(scan, false); ! ReleaseBuffer(so->currPos.buf); ! so->currPos.buf = InvalidBuffer; ! } ! ! /* bump pin on marked buffer */ ! if (BTScanPosIsValid(so->markPos)) { ! IncrBufferRefCount(so->markPos.buf); ! memcpy(&so->currPos, &so->markPos, ! offsetof(BTScanPosData, items[1]) + ! so->markPos.lastItem * sizeof(BTScanPosItem)); } PG_RETURN_VOID(); --- 480,512 ---- IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); BTScanOpaque so = (BTScanOpaque) scan->opaque; ! if (so->markItemIndex != -1) { ! /* The restore position was on the same page. ! * Just restore the itemIndex */ ! so->currPos.itemIndex = so->markItemIndex; ! } ! else { ! /* we aren't holding any read locks, but gotta drop the pin */ ! if (BTScanPosIsValid(so->currPos)) ! { ! /* Before leaving current page, deal with any killed items */ ! if (so->numKilled > 0 && ! so->currPos.buf != so->markPos.buf) ! _bt_killitems(scan, false); ! ReleaseBuffer(so->currPos.buf); ! so->currPos.buf = InvalidBuffer; ! } ! ! if (BTScanPosIsValid(so->markPos)) ! { ! /* bump pin on marked buffer for assignment to current buffer */ ! IncrBufferRefCount(so->markPos.buf); ! memcpy(&so->currPos, &so->markPos, ! offsetof(BTScanPosData, items[1]) + ! so->markPos.lastItem * sizeof(BTScanPosItem)); ! } } PG_RETURN_VOID(); Index: src/backend/access/nbtree/nbtsearch.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtsearch.c,v retrieving revision 1.105 diff -c -r1.105 nbtsearch.c *** src/backend/access/nbtree/nbtsearch.c 7 May 2006 01:21:30 -0000 1.105 --- src/backend/access/nbtree/nbtsearch.c 23 Aug 2006 11:11:45 -0000 *************** *** 1055,1060 **** --- 1055,1074 ---- rel = scan->indexRelation; + /* Before we modify currPos, make a copy of the items if there + * was a marked position that needs them. */ + if (so->markItemIndex != -1) + { + /* bump pin on current buffer for assignment to marked buffer */ + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + + so->markPos.itemIndex = so->markItemIndex; + so->markItemIndex = -1; + } + if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ Index: src/include/access/nbtree.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v retrieving revision 1.103 diff -c -r1.103 nbtree.h *** src/include/access/nbtree.h 7 Aug 2006 16:57:57 -0000 1.103 --- src/include/access/nbtree.h 23 Aug 2006 11:07:47 -0000 *************** *** 441,446 **** --- 441,450 ---- /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + int markItemIndex; /* if the marked position is on the same + page as current position, we don't use + markPos, but just keep the marked + itemIndex in markItemIndex */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque;
---------------------------(end of broadcast)--------------------------- TIP 4: Have you searched our list archives? http://archives.postgresql.org