Hi hackers! Here is a prototype of a procrastinating GiST. Or a lazy GiST. Or a buffered GiST. Indeed, there is nothing fractal about it.
The concept is that leaf tuples stall on internal pages. From time to time they are pushed down in batches. This code is far from perfect, I've coded this only for research purposes. I have tested the code with insertion of random 3D cubes. On 1 million insertions classic GiST is 8% faster, on 3M performance is equal, on 30M lazy GiST is 12% faster. I've tested it with an SSD disk; maybe with an HDD the difference will be more significant. Test scenario is here https://github.com/x4m/postgres_g/blob/bufdev/test.sql In theory, this is a cache-friendly implementation (but not cache-oblivious) and it must be faster even for small datasets without huge disk work. But there is the extra cost of copying a batch of tuples to a palloc'd array; I couldn't avoid that. This is just a proof-of-concept for performance measures: 1. make check fails for two tests 2. WAL is broken 3. code is a mess in some places I'm not sure 12% of performance is worth it. I'll appreciate any ideas on what to do next. Best regards, Andrey Borodin. 2013-02-13 22:54 GMT+05:00 Simon Riggs <si...@2ndquadrant.com>: > On 13 February 2013 16:48, Heikki Linnakangas <hlinnakan...@vmware.com> > wrote: > > On 13.02.2013 18:20, Tom Lane wrote: > >> > >> Heikki Linnakangas<hlinnakan...@vmware.com> writes: > >>> > >>> The basic idea of a fractal tree index is to attach a buffer to every > >>> non-leaf page. On insertion, instead of descending all the way down to > >>> the correct leaf page, the new tuple is put on the buffer at the root > >>> page. When that buffer fills up, all the tuples in the buffer are > >>> cascaded down to the buffers on the next level pages. And recursively, > >>> whenever a buffer fills up at any level, it's flushed to the next > level. > >> > >> > >> [ scratches head... ] What's "fractal" about that? Or is that just a > >> content-free marketing name for this technique? > > > > > > I'd call it out as a marketing name. 
I guess it's fractal in the sense > that > > all levels of the tree can hold "leaf tuples" in the buffers; the > structure > > looks the same no matter how deep you zoom, like a fractal.. But > "Buffered" > > would be more appropriate IMO. > > I hope for their sake there is more to it than that. It's hard to see > how buffering can be patented. > > -- > Simon Riggs http://www.2ndQuadrant.com/ > PostgreSQL Development, 24x7 Support, Training & Services > >
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index b8aa9bc..29770d2 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -265,6 +265,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, bool is_rootsplit; int npage; + is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* @@ -524,6 +525,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); } + GistPageGetOpaque(page)->nlazy=1; //DO NOT FORGET: remark pages after split + MarkBufferDirty(buffer); if (BufferIsValid(leftchildbuf)) @@ -589,6 +592,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ + void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { @@ -597,7 +601,6 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; - bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; @@ -610,6 +613,21 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; + + gistdoinsertloop(r,itup,freespace, giststate, stack, state, 0); +} + +void +gistdoinsertloop(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate,GISTInsertStack *givenstack, GISTInsertState state, GISTInsertStack *nolazy) +{ + ItemId iid; + IndexTuple idxtuple; + + bool xlocked = false; + GISTInsertStack *stack = givenstack; + + + /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. 
If we crash in the @@ -685,9 +703,86 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) GISTInsertStack *item; OffsetNumber downlinkoffnum; + + + if(stack!=nolazy) + { + if(GistPageCanStoreLazy(stack->page, itup)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (PageGetLSN(stack->page) != stack->lsn) + { + /* the page was changed while we unlocked it, retry */ + continue; + } + } + + //elog(WARNING,"writing lazy tuple"); + + GistTupleSetLazy(itup); + gistinserttuple(&state, stack, giststate, itup, + InvalidOffsetNumber); + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + + break; + } + else if (GistPageGetOpaque(stack->page)->nlazy) + { + OffsetNumber i; + OffsetNumber maxoff = PageGetMaxOffsetNumber(stack->page); + + //elog(WARNING,"starting push"); + if (!xlocked) + { + //elog(WARNING,"xloc for push"); + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (PageGetLSN(stack->page) != stack->lsn) + { + /* the page was changed while we unlocked it, retry */ + continue; + } + } + + int nlen; + IndexTuple* lazys = gistextractlazy(stack->page,&nlen); + + LockBuffer(stack->buffer, GIST_UNLOCK); + + xlocked = false; + + for (i = 0; i < nlen; i++) + { + IndexTuple ltup = lazys[i]; + //elog(WARNING,"push %d",i); + GistTupleUnsetLazy(ltup); + gistdoinsertloop(r,ltup,freespace,giststate,stack,state,stack); + //elog(WARNING,"push out, on page %d tuples", PageGetMaxOffsetNumber(stack->page)); + } + LockBuffer(stack->buffer, GIST_SHARE); + GistPageGetOpaque(stack->page)->nlazy = 0; + } + + } + else + { + //elog(WARNING,"skipped laziness"); + } + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) 
PageGetItem(stack->page, iid); + //elog(WARNING,"picked %d",downlinkoffnum); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* @@ -753,6 +848,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) continue; } } + LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; @@ -825,12 +921,20 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); - /* Release any pins we might still hold before exiting */ - for (; stack; stack = stack->parent) - ReleaseBuffer(stack->buffer); + break; } } + + + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + { + if(nolazy != stack) + ReleaseBuffer(stack->buffer); + if(stack == givenstack) + break; + } } /* @@ -1256,6 +1360,7 @@ gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, GISTPageSplitInfo *left; IndexTuple tuples[2]; + /* A split always contains at least two halves */ Assert(list_length(splitinfo) >= 2); diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 5ba7d0a..c51c842 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -197,7 +197,7 @@ gistindex_keytest(IndexScanDesc scan, gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, - FALSE, isNull); + FALSE, isNull, GistTupleIsLazy(tuple)); /* * Call the Consistent function to evaluate the test. The @@ -258,7 +258,7 @@ gistindex_keytest(IndexScanDesc scan, gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, - FALSE, isNull); + FALSE, isNull, GistTupleIsLazy(tuple)); /* * Call the Distance function to evaluate the distance. 
The @@ -422,7 +422,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, if (!match) continue; - if (tbm && GistPageIsLeaf(page)) + if (tbm && (GistPageIsLeaf(page) || GistTupleIsLazy(it))) { /* * getbitmap scan, so just push heap tuple TIDs into the bitmap @@ -431,7 +431,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, tbm_add_tuples(tbm, &it->t_tid, 1, recheck); (*ntids)++; } - else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + else if (scan->numberOfOrderBys == 0 && (GistPageIsLeaf(page) || GistTupleIsLazy(it))) { /* * Non-ordered scan, so report tuples in so->pageData[] @@ -466,7 +466,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, /* Create new GISTSearchItem for this item */ item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); - if (GistPageIsLeaf(page)) + if ((GistPageIsLeaf(page) || GistTupleIsLazy(it))) { /* Creating heap-tuple GISTSearchItem */ item->blkno = InvalidBlockNumber; diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c index d394969..c9680db 100644 --- a/src/backend/access/gist/gistsplit.c +++ b/src/backend/access/gist/gistsplit.c @@ -643,7 +643,7 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, &IsNull); gistdentryinit(giststate, attno, &(entryvec->vector[i]), datum, r, page, i, - FALSE, IsNull); + FALSE, IsNull, GistTupleIsLazy(itup[i-1])); if (IsNull) offNullTuples[nOffNullTuples++] = i; } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 887c58b..897e7f0 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -105,6 +105,45 @@ gistextractpage(Page page, int *len /* out */ ) return itvec; } + +IndexTuple * +gistextractlazy(Page page, int *len /* out */ ) +{ + OffsetNumber i, + maxoff; + IndexTuple *itvec; + OffsetNumber* itemNos; + //elog(WARNING,"extracting for push"); + + maxoff = 
PageGetMaxOffsetNumber(page); + *len = 0; + itvec = palloc(sizeof(IndexTuple) * maxoff); + itemNos = palloc(sizeof(OffsetNumber) * maxoff); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId tid = PageGetItemId(page, i); + IndexTuple it = (IndexTuple) PageGetItem(page, tid); + if(GistTupleIsLazy(it)) + { + //elog(WARNING,"extract item %d",i); + IndexTuple copy = palloc(tid->lp_len); + //elog(WARNING,"memmove",i); + memmove(copy,it,tid->lp_len); + //elog(WARNING,"offset %d to index %d",i,*len); + itemNos[*len] = i; + //elog(WARNING,"ptr",i); + itvec[*len] = copy; + *len = *len+1; + //elog(WARNING,"done",i); + } + } + + //elog(WARNING,"deleting %d",*len); + PageIndexMultiDelete(page,itemNos,*len); + + return itvec; +} + /* * join two vectors into one */ @@ -178,7 +217,7 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, evec->vector + evec->n, datum, NULL, NULL, (OffsetNumber) 0, - FALSE, IsNull); + FALSE, IsNull, GistTupleIsLazy(itvec[j])); evec->n++; } @@ -302,7 +341,7 @@ gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, datum = index_getattr(tuple, i + 1, giststate->tupdesc, &isnull[i]); gistdentryinit(giststate, i, &attdata[i], datum, r, p, o, - FALSE, isnull[i]); + FALSE, isnull[i], GistTupleIsLazy(tuple)); } } @@ -438,6 +477,9 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ bool zero_penalty; int j; + if(GistTupleIsLazy(itup)) + continue; + zero_penalty = true; /* Loop over index attributes. */ @@ -450,7 +492,7 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ /* Compute penalty for this column. 
*/ datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, j, &entry, datum, r, p, i, - FALSE, IsNull); + FALSE, IsNull, GistTupleIsLazy(itup)); usize = gistpenalty(giststate, j, &entry, IsNull, &identry[j], isnull[j]); if (usize > 0) @@ -542,7 +584,7 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, Datum k, Relation r, Page pg, OffsetNumber o, - bool l, bool isNull) + bool l, bool isNull, bool lazy) { if (!isNull) { @@ -557,9 +599,20 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, if (dep != e) gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, dep->leafkey); + if(lazy) + { + dep->leafpage = false; + } } else + { gistentryinit(*e, (Datum) 0, r, pg, o, l); + + if(lazy) + { + e->leafpage = false; + } + } } IndexTuple diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 4343d6f..ffa9048 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -59,6 +59,7 @@ typedef struct GISTPageOpaqueData { PageGistNSN nsn; /* this value must change on page split */ BlockNumber rightlink; /* next page if any */ + uint16 nlazy; uint16 flags; /* see bit definitions above */ uint16 gist_page_id; /* for identification of GiST indexes */ } GISTPageOpaqueData; @@ -125,12 +126,13 @@ typedef struct GISTENTRY Page page; OffsetNumber offset; bool leafkey; + bool leafpage; } GISTENTRY; #define GistPageGetOpaque(page) ( (GISTPageOpaque) PageGetSpecialPointer(page) ) #define GistPageIsLeaf(page) ( GistPageGetOpaque(page)->flags & F_LEAF) -#define GIST_LEAF(entry) (GistPageIsLeaf((entry)->page)) +#define GIST_LEAF(entry) (((entry)->leafpage)) #define GistPageIsDeleted(page) ( GistPageGetOpaque(page)->flags & F_DELETED) #define GistPageSetDeleted(page) ( GistPageGetOpaque(page)->flags |= F_DELETED) @@ -168,6 +170,6 @@ typedef struct */ #define gistentryinit(e, k, r, pg, o, l) \ do { (e).key = (k); (e).rel 
= (r); (e).page = (pg); \ - (e).offset = (o); (e).leafkey = (l); } while (0) + (e).offset = (o); (e).leafkey = (l); (e).leafpage = (pg) && GistPageIsLeaf(pg);} while (0) #endif /* GIST_H */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 78e87a6..669d697 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -320,6 +320,7 @@ typedef struct #define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID ) #define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID ) +#define GistPageCanStoreLazy(page,itup) ((int) ((PageHeader) page)->pd_upper - (int) ((PageHeader) page)->pd_lower - IndexTupleSize(itup) > 1024) @@ -435,6 +436,13 @@ extern void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *GISTstate); +extern void gistdoinsertloop(Relation r, + IndexTuple itup, + Size freespace, + GISTSTATE *GISTstate, + GISTInsertStack *stack, + GISTInsertState state, + GISTInsertStack *nolazy); /* A List of these is returned from gistplacetopage() in *splitinfo */ typedef struct @@ -498,6 +506,7 @@ extern Buffer gistNewBuffer(Relation r); extern void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off); extern IndexTuple *gistextractpage(Page page, int *len /* out */ ); +extern IndexTuple *gistextractlazy(Page page, int *len /* out */ ); extern IndexTuple *gistjoinvector( IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen); @@ -519,7 +528,7 @@ extern OffsetNumber gistchoose(Relation r, Page p, extern void GISTInitBuffer(Buffer b, uint32 f); extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, Datum k, Relation r, Page pg, OffsetNumber o, - bool l, bool isNull); + bool l, bool isNull, bool lazy); extern float gistpenalty(GISTSTATE *giststate, int attno, GISTENTRY *key1, bool isNull1, diff --git a/src/include/access/itup.h b/src/include/access/itup.h index 
8350fa0..d96dd94 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -72,6 +72,11 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap; #define IndexTupleHasNulls(itup) ((((IndexTuple) (itup))->t_info & INDEX_NULL_MASK)) #define IndexTupleHasVarwidths(itup) ((((IndexTuple) (itup))->t_info & INDEX_VAR_MASK)) +#define TUPLE_IS_LAZY 0x2000 +#define GistTupleIsLazy(itup) (((IndexTuple)itup)->t_info & TUPLE_IS_LAZY) +#define GistTupleSetLazy(itup) do { ((IndexTuple)itup)->t_info = ((IndexTuple)itup)->t_info | TUPLE_IS_LAZY; } while(0) +#define GistTupleUnsetLazy(itup) do { ((IndexTuple)itup)->t_info = ((IndexTuple)itup)->t_info & ~TUPLE_IS_LAZY; } while(0) + /* * Takes an infomask as argument (primarily because this needs to be usable diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..3767958 --- /dev/null +++ b/test.sh @@ -0,0 +1,2 @@ +#!/bin/sh +/home/x4m/project/bin/psql postgres</home/x4m/pgsql/test.sql diff --git a/test.sql b/test.sql new file mode 100644 index 0000000..34a89d9 --- /dev/null +++ b/test.sql @@ -0,0 +1,27 @@ +\timing +SET client_min_messages = 'DEBUG5'; +SET log_min_messages = 'DEBUG5'; +SET wal_level = 'minimal'; + +create extension if not exists cube; + +begin transaction; +SELECT setseed(.43); + + +create table dataTable(c cube); +create index idx on dataTable using gist(c); + +insert into dataTable(c) select cube(array[random(),random(),random()]) from generate_series(1,1e2,1) x,generate_series(1,1e2,1) y,generate_series(1,3e3,1) z; + + +/*create table queries(id int,l1 float,l2 float,l3 float, u1 float,u2 float, u3 float, q cube); +insert into queries(id,l1,l2,l3) select s,random()/1.3,random()/1.3,random()/1.3 from generate_series(1,1e4,1) s; +update queries set q = cube(array[l1,l2,l3],array[l1+0.01001,l2+0.10001,l3+0.10001]); + +select id,(select count(*) from dataTable dt where dt.c<@q) from queries order by id;*/ + +rollback; +--drop index idx; +--drop table dataTable; +--drop table queries;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers