Re: [PATCHES] Hash Index Build Patch

Bruce Momjian Fri, 28 Sep 2007 17:18:17 -0700

This has been saved for the 8.4 release:

        http://momjian.postgresql.org/cgi-bin/pgpatches_hold


---------------------------------------------------------------------------

Tom Raney wrote:
> Hello All,
> 
> We have prepared a patch (against CVS HEAD)for the TODO item:
> * Hash
>    -During index creation, pre-sort the tuples to improve build speed
>       http://archives.postgresql.org/pgsql-hackers/2007-03/msg01199.php
> 
> Details of this patch's performance improvements can be found at 
> http://web.cecs.pdx.edu/~raneyt/pg/.
>  
> For example, in one 12 million tuple example, the 8.2.4 hash index
> build required over 2.8 hours while this patch's build required 80
> seconds. Hash build times are now comparable to BTree build times,
> for example, for a 60 million tuple example the patched hash index
> build time is 8.1 minutes and the BTree build time is 4.6 minutes.
> 
> We used spool functions from the BTree code to sort the index
> tuples. Sorting is done on the hash value of the tuples.  The hash
> value depends on the number of primary bucket pages (henceforth
> just bucket pages) that will be required to fit all the index
> tuples. So, before sorting, the base relation is scanned to get
> the total number of tuples. Then, the fill factor (either default
> or user defined, as applicable) is used to get the estimate of the
> number of bucket pages.
> 
> The 8.2.4 code builds the index by inserting one tuple at a time
> and splitting buckets as needed.  We pre-allocate the estimated
> number of bucket pages all at once. This avoids bucket page splits
> and thus redistribution of index tuples between the bucket page
> that caused the split and the newly created bucket page.
> 
> We changed the values of low_mask, high_mask and max_bucket, used
> by hashkey2bucket () while inserting index tuples to bucket pages.
> They are set as follows:
> Low_mask = (power of 2 value-1) less than the estimate.
> High_mask = (power of 2 value-1) greater than the estimate.
> Max_bucket = estimated number of bucket -1 (because bucket numbers
> start from 0).
> 
> (Please note: hashsort.c is a new file and resides in 
> src/backend/access/hash/)
> 
> We have also attached the simple data generator we used in the
> tests.  Our SQL statements were as follows:
> 
> DROP TABLE IF EXISTS test;
> 
> CREATE TABLE test (
>    value      INTEGER
> );
> 
> COPY test FROM 'data';
> \timing
> CREATE INDEX hash ON test USING HASH(value);
> 
> Regards,
> Shreya Bhargava <[EMAIL PROTECTED]>
> Tom Raney <[EMAIL PROTECTED]>
> 

> /*---------------------------------------------------------------------------
>  * hashsort.c
>  *-------------------------------------------------------------------------*/
> 
> /* 
>  * Functions needed to support initializing and sorting tuples in the spool
>  * in hash.c
>  */
> 
> 
> #include "postgres.h"
> 
> #include "access/hash.h"
> #include "miscadmin.h"
> #include "storage/smgr.h"
> #include "utils/tuplesort.h"
> 
> struct HSpool
> {
>       /* State data for tuplesort.c */
>       Tuplesortstate *sortstate; 
>       Relation index;
> };
> 
> /* create and initialize the spool structure */
> HSpool *h_spoolinit(Relation index)
> {
>       HSpool *hspool;
>       int hKbytes;
>       
>       hspool = (HSpool *) palloc0(sizeof(HSpool));
> 
>       hspool->index = index;
> 
>       /* hKbytes is the amount of memory we are going to use
>        * to sort the spool.  
>        */
> 
>       hKbytes = maintenance_work_mem;
>       hspool->sortstate = tuplesort_begin_index(index,false,hKbytes,false);
> 
>       /* Substitute the default compare call-back function
>        * for a specific hash index build compare function.
>        */
> 
>       tuplesort_set_hashindex(hspool->sortstate);
>       
>       return hspool;
> 
> }
> 
> void h_spool(IndexTuple itup, HSpool *hspool)
> {
>       tuplesort_putindextuple(hspool->sortstate, itup);
> }
> 
> /* 
>  * h_bkt_num() estimates the number of buckets
>  * in the final hash table.  
>  *
>  */
> 
> uint32 h_bkt_num(uint32 tuples, Relation rel)
> {
>       int32 ffactor;
>       int32 data_width;
>       int32 item_width;
>       uint32 bkt_num;
> 
>       /*
>        * Calculate the fill factor.  We could get this from the meta page
>        * but at the point this method is called, the meta page has not been
>        * created.
>        *
>        */
> 
>       data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
>                                       
> RelationGetDescr(rel)->attrs[0]->atttypmod);
>       item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
>                                       sizeof(ItemIdData);
> 
>       ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / 
> item_width;
>       if (ffactor < 10)
>               ffactor = 10;
> 
>         bkt_num = tuples / ffactor;
>         bkt_num = bkt_num +1;
> 
>       return bkt_num;
> }
> 
> /*    In order to define an order for the index tuples, there must be a mask
>  *    applied to the 32 bit hash value of the index key during the sort 
>  *    process.
>  *    For example, if there are 4,555 buckets approximated, the mask (or 
> modulo) 
>  *    would be 8,191 (hex value 1FFF). 
>  *
>  */
> 
> void h_set_bkt_mask(HSpool *spool, uint32 bkts) {
>       uint32 bkt_pwr2, bkt_mask;
> 
>       bkt_pwr2 = _hash_log2(bkts);
>       bkt_mask = (1<<(bkt_pwr2))-1;
> 
>       tuplesort_set_bktmask(spool->sortstate, bkt_mask);      
> }
> 
> void h_do_sort(HSpool *spool)
> {
>       tuplesort_performsort(spool->sortstate);
> }
> 
> void h_spooldestroy(HSpool *hspool)
> {
>       tuplesort_end(hspool->sortstate);
>       pfree(hspool);
> }
> 
> 
> /* 
>  * h_print_spool() reads the itups back from the spool, which are now
>  * in sorted order by hash value. As each itup is read from the spool, it is
>  * inserted into the hash index.
>  *
>  */
> 
> void h_print_spool(HSpool *spool)
> {
>       bool isfirstnull;
>       bool should_free;
>       IndexTuple itup= NULL;
>       Datum attr1;
>       TupleDesc tupd = RelationGetDescr(spool->index);
>       
>       while (itup = tuplesort_getindextuple(spool->sortstate, true, 
> &should_free))
>       {       
>               
>               attr1 = index_getattr(itup,1, tupd, &isfirstnull);
>               if(!isfirstnull)
>               {
>                       _hash_doinsert(spool->index, itup);
>               }
>       }
> }
> 

> #include <stdio.h>
> #include <stdlib.h>
> 
> 
> extern int main(int argc, char **argv) {
>       char *p;
>       long i, tups, seed;
> 
>       if (argc < 3) {
>               printf("Too few args.  <tuples> <seed>\n");
>               return 0;
>       }
> 
>       tups = strtol(argv[1],&p,0);
>       seed = strtol(argv[2], &p, 0);
>       srand(seed);
> 
>       for (i=0; i< tups; i++) {
>               printf("%d\n", rand());
>       }
> 
>       return 0;
> 
> } 
> *** ../../../src/backend/access/hash/hash.c.orig      2007-09-23 
> 19:01:09.000000000 -0700
> --- ../../../src/backend/access/hash/hash.c   2007-09-24 18:01:27.709487000 
> -0700
> ***************
> *** 27,35 ****
>   /* Working state for hashbuild and its callback */
>   typedef struct
>   {
> !     double          indtuples;
>   } HashBuildState;
>   
>   static void hashbuildCallback(Relation index,
>                                 HeapTuple htup,
>                                 Datum *values,
> --- 27,45 ----
>   /* Working state for hashbuild and its callback */
>   typedef struct
>   {
> !     double          indtuples; /* The current number of index tuples */
> !     Relation        heapRel;   /* The index covers this heap relation */
> !     HSpool          *spool;    /* Used to sort the index tuples before 
> insertion into the index */
>   } HashBuildState;
>   
> + 
> + static void countTupleCallBack(Relation indx,
> +                                 HeapTuple htup,
> +                                 Datum *values,
> +                                 bool *isnull,
> +                                 bool tupleIsAlive,
> +                                 void *state);
> + 
>   static void hashbuildCallback(Relation index,
>                                 HeapTuple htup,
>                                 Datum *values,
> ***************
> *** 40,46 ****
> --- 50,87 ----
>   
>   /*
>    *  hashbuild() -- build a new hash index.
> +  *
> +  *
> +  *   The algorithm:
> +  *    (1) Initialize the build state
> +  *    (2) Scan the heap file to determine the number of rows
> +  *    (3) Transform the heap file tuples into index tuples (itups),
> +  *        while inserting them into a spool.  If the spool overflows
> +  *        memory, sort it into runs and spill it to disk
> +  *    (4) Finish sorting the spool
> +  *    (5) Pre-initialize all the buckets of the final index
> +  *    (6) Insert itups from the spool into the index
> +  *
> +  *   Sorting the tuples before inserting them into the index is a classical
> +  * bulk-load technique, also used in the BTree code.  The sort is done in
> +  * hash value order.
> +  *   Pre-allocating the buckets minimizes the number of overflow pages.
> +  *   The reason for step (2) is that in order to sort, in step (3), one must
> +  * know the hash value, which depends on the number of buckets, which in 
> turn
> +  * depends on the number of itups = the number of rows in the heap file.
> +  *   Steps (3),(4) and (6) parallel similar steps in the BTree code.
> +  *
> +  *   Here is an alternative algorithm:
> +  *    (1') Same as (1)
> +  *    (2') Scan the heap file, counting the number of rows, forming index
> +  *         tuples and inserting them into a spool (the spool is not 
> presorted).
> +  *    (3') Sort the spool
> +  *    (4') same as (5)
> +  *    (5') same as (6)
> +  *    Although this algorithm would be somewhat faster, we prefer the 
> existing
> +  * algorithm because it reuses existing BTree code.
>    */
> + 
>   Datum
>   hashbuild(PG_FUNCTION_ARGS)
>   {
> ***************
> *** 50,55 ****
> --- 91,100 ----
>       IndexBuildResult *result;
>       double          reltuples;
>       HashBuildState buildstate;
> +     uint32          tuples;
> +     HashMetaPage    metap;
> +     Buffer          metabuf;
> +     uint32          num_bkt; /* Estimates number of buckets in the final 
> index */
>   
>       /*
>        * We expect to be called exactly once for any index relation. If that's
> ***************
> *** 59,85 ****
>               elog(ERROR, "index \"%s\" already contains data",
>                        RelationGetRelationName(index));
>   
> !     /* initialize the hash index metadata page */
> !     _hash_metapinit(index);
> ! 
> !     /* build the index */
>       buildstate.indtuples = 0;
>   
> !     /* do the heap scan */
> !     reltuples = IndexBuildHeapScan(heap, index, indexInfo,
> !                                                                
> hashbuildCallback, (void *) &buildstate);
>   
> -     /*
> -      * Return statistics
> -      */
> -     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
>   
> !     result->heap_tuples = reltuples;
> !     result->index_tuples = buildstate.indtuples;
>   
>       PG_RETURN_POINTER(result);
>   }
>   
>   /*
>    * Per-tuple callback from IndexBuildHeapScan
>    */
> --- 104,183 ----
>               elog(ERROR, "index \"%s\" already contains data",
>                        RelationGetRelationName(index));
>   
> !     /* initialize the build state */
>       buildstate.indtuples = 0;
> +     buildstate.heapRel = heap;
> +     buildstate.spool = h_spoolinit(index);
>   
> !        /*
> !     * Scan the heap file to determine the number of rows
> !     *
> !         */
> !         tuples=0;
> !         IndexBuildHeapScan(heap, index, indexInfo, countTupleCallBack, 
> (void*) &tuples);
> ! 
> !         num_bkt = h_bkt_num(tuples, index); /* calculate the number of 
> buckets in the final index */
> !         h_set_bkt_mask(buildstate.spool, num_bkt); /* set the bucket mask 
> for the compare function */
> ! 
> !         /*
> !      * Pre-initialize all the buckets of the final index
> !          */
> !         _hash_metapinit(index, num_bkt);
> ! 
> !         /*
> !          * Transform the heap file tuples into index tuples (itups),
> !          * while inserting them into a spool.  If the spool overflows
> !          * memory, sort it into runs and spill it to disk
> !          *
> !          */
> !         reltuples = IndexBuildHeapScan(heap, index, indexInfo,
> !                                                                    
> hashbuildCallback, (void *) &buildstate);
> ! 
> ! 
> !         /*
> !          * Finish sorting the spool
> !          */
> !         h_do_sort(buildstate.spool);
> ! 
> !         /*
> !          * Insert itups from the spool into the index
> !          */
> !         h_print_spool(buildstate.spool);
> ! 
> ! 
> !         /* Read the meta page just initialized */
> !         metabuf = _hash_getbuf(index, HASH_METAPAGE, HASH_READ, 
> LH_META_PAGE);
> !         _hash_checkpage(index, metabuf, LH_META_PAGE);
> !         metap   = (HashMetaPage) BufferGetPage(metabuf);
> ! 
> !         /* Gather result, destroy spool, return */
> ! 
> !         result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
> !         _hash_relbuf(index,metabuf);
> !         result->heap_tuples = reltuples;
> !         result->index_tuples = buildstate.indtuples;
>   
>   
> !         h_spooldestroy(buildstate.spool);
>   
>       PG_RETURN_POINTER(result);
>   }
>   
> + 
> + /* This function is used to count the tuples in the relation we are 
> indexing.
> +  */
> + 
> + static void
> + countTupleCallBack(Relation index, HeapTuple htup, Datum *values,
> +                                 bool *isnull,
> +                                 bool tupleIsAlive,
> +                                 void *ptr)
> + {
> +         uint32 *tuples = (uint32 *) ptr;
> +         *tuples += 1;
> + }
> + 
> + 
>   /*
>    * Per-tuple callback from IndexBuildHeapScan
>    */
> ***************
> *** 104,111 ****
>               pfree(itup);
>               return;
>       }
> ! 
> !     _hash_doinsert(index, itup);
>   
>       buildstate->indtuples += 1;
>   
> --- 202,212 ----
>               pfree(itup);
>               return;
>       }
> !         else
> !         {
> !                 /* Place each itup into the spool for sorting */
> !                 h_spool(itup, buildstate->spool);
> !         }
>   
>       buildstate->indtuples += 1;
>   
> *** ../../../src/backend/access/hash/hashpage.c.orig  2007-09-23 
> 19:31:22.000000000 -0700
> --- ../../../src/backend/access/hash/hashpage.c       2007-09-23 
> 20:46:09.000000000 -0700
> ***************
> *** 313,326 ****
>   /*
>    *  _hash_metapinit() -- Initialize the metadata page of a hash index,
>    *                          the two buckets that we begin with and the 
> initial
> !  *                          bitmap page.
>    *
>    * We are fairly cavalier about locking here, since we know that no one else
>    * could be accessing this index.  In particular the rule about not holding
>    * multiple buffer locks is ignored.
>    */
>   void
> ! _hash_metapinit(Relation rel)
>   {
>       HashMetaPage metap;
>       HashPageOpaque pageopaque;
> --- 313,326 ----
>   /*
>    *  _hash_metapinit() -- Initialize the metadata page of a hash index,
>    *                          the two buckets that we begin with and the 
> initial
> !  *                          bitmap page, plus num_bkt buckets.
>    *
>    * We are fairly cavalier about locking here, since we know that no one else
>    * could be accessing this index.  In particular the rule about not holding
>    * multiple buffer locks is ignored.
>    */
>   void
> ! _hash_metapinit(Relation rel, uint32 num_bkt)
>   {
>       HashMetaPage metap;
>       HashPageOpaque pageopaque;
> ***************
> *** 330,342 ****
>       int32           data_width;
>       int32           item_width;
>       int32           ffactor;
> !     uint16          i;
>   
>       /* safety check */
>       if (RelationGetNumberOfBlocks(rel) != 0)
>               elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
>                        RelationGetRelationName(rel));
>   
>       /*
>        * Determine the target fill factor (in tuples per bucket) for this 
> index.
>        * The idea is to make the fill factor correspond to pages about as full
> --- 330,354 ----
>       int32           data_width;
>       int32           item_width;
>       int32           ffactor;
> !     uint32          i;
> !     uint32          lg2buckets;
> !     uint32          pwr2;
> !     BlockNumber     start_blk;      
>   
>       /* safety check */
>       if (RelationGetNumberOfBlocks(rel) != 0)
>               elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
>                        RelationGetRelationName(rel));
>   
> + 
> +         /* The minimum number of buckets is 2, so if we are below
> +          * that number, let's fix that here
> +          */
> + 
> +         if (num_bkt < 2)
> +                 num_bkt = 2;
> + 
> + 
>       /*
>        * Determine the target fill factor (in tuples per bucket) for this 
> index.
>        * The idea is to make the fill factor correspond to pages about as full
> ***************
> *** 401,431 ****
>        * We initialize the index with two buckets, 0 and 1, occupying physical
>        * blocks 1 and 2.      The first freespace bitmap page is in block 3.
>        */
> !     metap->hashm_maxbucket = metap->hashm_lowmask = 1;      /* nbuckets - 1 
> */
> !     metap->hashm_highmask = 3;      /* (nbuckets << 1) - 1 */
>   
>       MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
>       MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
>   
>       metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
> !     metap->hashm_ovflpoint = 1;
>       metap->hashm_firstfree = 0;
>   
> !     /*
> !      * Initialize the first two buckets
> !      */
> !     for (i = 0; i <= 1; i++)
> !     {
> !             buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
> !             pg = BufferGetPage(buf);
> !             pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
> !             pageopaque->hasho_prevblkno = InvalidBlockNumber;
> !             pageopaque->hasho_nextblkno = InvalidBlockNumber;
> !             pageopaque->hasho_bucket = i;
> !             pageopaque->hasho_flag = LH_BUCKET_PAGE;
>               pageopaque->hasho_page_id = HASHO_PAGE_ID;
> !             _hash_wrtbuf(rel, buf);
> !     }
>   
>       /*
>        * Initialize first bitmap page
> --- 413,488 ----
>        * We initialize the index with two buckets, 0 and 1, occupying physical
>        * blocks 1 and 2.      The first freespace bitmap page is in block 3.
>        */
> ! 
> !         /* We need this calculation to correctly set the splits
> !          * and to set the value of the low/high mask.
> !          */
> !         lg2buckets = _hash_log2_floor(num_bkt);
> ! 
> !         pwr2 = 1 << lg2buckets;
> ! 
> ! 
> !         metap->hashm_maxbucket = num_bkt -1;
> !         /* We want the highmask to mask the next larger
> !          * power of 2 value greated than the number of buckets
> !          * we need.  So, if we need 9 buckets, our high mask
> !          * value would be 15.  Our low mask value will be 7
> !          */
> ! 
> !         metap->hashm_highmask = (pwr2 << 1) -1;
> !         metap->hashm_lowmask = pwr2 - 1;
> ! 
>   
>       MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
>       MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
>   
>       metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
> ! 
> ! 
> !         /*
> !          * No overflows will have occurred during this initialization 
> process
> !          * so we need to just copy the value '1' into each position in the
> !          * hashm_spares array.  This is needed to correctly determine how 
> each
> !          * bucket maps to the logical page on disk.
> !          *
> !          */
> ! 
> !         for (i = 2; i <= _hash_log2(num_bkt); i++)
> !                 metap->hashm_spares[i] = 1;
> ! 
> ! 
> !         /* This value needs to be the ceiling log to properly set the 
> overflow
> !          * beyond our max bucket */
> !         metap->hashm_ovflpoint = _hash_log2(num_bkt);
> ! 
>       metap->hashm_firstfree = 0;
>   
> ! 
> !         /* We need to make sure the file system can handle
> !          * this big block of pages we are going to allocate
> !          * _hash_alloc_buckets() will do that for us
> !          */
> ! 
> !         start_blk = BUCKET_TO_BLKNO(metap, 2);
> !         _hash_alloc_buckets(rel, start_blk, (pwr2<<1)-2);
> ! 
> !         /*
> !          * Initialize the first 'num_bkt' buckets
> !          */
> !         for (i = 0; i < num_bkt; i++)
> !         {
> ! 
> !                 buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
> !                 pg = BufferGetPage(buf);
> !                 _hash_pageinit(pg, BufferGetPageSize(buf));
> !                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
> !                 pageopaque->hasho_prevblkno = InvalidBlockNumber;
> !                 pageopaque->hasho_nextblkno = InvalidBlockNumber;
> !                 pageopaque->hasho_bucket = i;
> !                 pageopaque->hasho_flag = LH_BUCKET_PAGE;
>               pageopaque->hasho_page_id = HASHO_PAGE_ID;
> !                 _hash_wrtbuf(rel, buf);
> !         }
>   
>       /*
>        * Initialize first bitmap page
> *** ../../../src/backend/access/hash/hashutil.c.orig  2007-09-23 
> 19:47:27.000000000 -0700
> --- ../../../src/backend/access/hash/hashutil.c       2007-09-23 
> 19:49:44.000000000 -0700
> ***************
> *** 120,125 ****
> --- 120,144 ----
>       return bucket;
>   }
>   
> + 
> + /* _hash_log2_floor -- returns floor(lg2(num))
> +  *
> +  */
> + uint32
> + _hash_log2_floor(uint32 num)
> + {
> +         uint32          i, limit;
> +         limit = 1;
> +         for (i=0; limit < num; limit <<= 1, i++)
> +                 ;
> +         if (limit == num)
> +                 return i;
> +         else
> +                 return i-1;
> + 
> + }
> + 
> + 
>   /*
>    * _hash_log2 -- returns ceil(lg2(num))
>    */
> *** ../../../src/backend/access/hash/Makefile.orig    2007-09-23 
> 20:30:39.000000000 -0700
> --- ../../../src/backend/access/hash/Makefile 2007-09-23 20:31:12.000000000 
> -0700
> ***************
> *** 13,19 ****
>   include $(top_builddir)/src/Makefile.global
>   
>   OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
> !        hashsearch.o hashutil.o
>   
>   all: SUBSYS.o
>   
> --- 13,19 ----
>   include $(top_builddir)/src/Makefile.global
>   
>   OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
> !        hashsearch.o hashutil.o hashsort.o
>   
>   all: SUBSYS.o
>   
> *** ../../../src/backend/access/hash/README.orig      2007-09-24 
> 00:07:32.000000000 -0700
> --- ../../../src/backend/access/hash/README   2007-09-24 19:18:10.312700000 
> -0700
> ***************
> *** 105,110 ****
> --- 105,135 ----
>   number 3, which is the first bitmap page and is allocated during index
>   creation.
>   
> + Building an index on a full table
> + ----------------------------------
> + 
> + An index can be created on a table already loaded with tuples. Before it
> + is built, an estimate of the number of bucket pages that might be needed to
> + fit all the index tuples, is calculated.This is done by scanning the base
> + relation and counting the number of tuples to be indexed. A Fill factor
> + (either DEFAULT or user-defined as applicable) is then used to get the 
> estimate,
> + which is the number of bucket pages initialized. If the estimate falls 
> below two,
> + then a minimum of two bucket pages is initialized. However, the number of 
> bucket
> + pages allocated is always a power of 2 value greater than the estimate.
> + I.e., if the estimated number of buckets is 3124, then the number of buckets
> + allocated is 4096 (and the number of bucket pages initialized is 3124).
> + If the estimate is 5456, then number allocated is 8192 (and the number 
> initialized
> + is 5456) and so on.
> + 
> + A spool holds all the index tuples before they are inserted into the
> + index pages. Content of the spool is sorted on the hash value order
> + so that there will be less backtracking to the bucket pages we have already
> + visited.
> + 
> + The intention of creating as many as the estimated number of pages is to
> + avoid bucket page splits at splitpoint S and hence, redistribution of 
> tuples.
> + Since we create all the bucket pages we need and insert the tuples based on 
> the
> + bucket number they belong to, there will not be any need for splitting.
>   
>   Lock definitions
>   ----------------
> *** ../../../src/backend/utils/sort/tuplesort.c.orig  2007-09-23 
> 19:51:30.000000000 -0700
> --- ../../../src/backend/utils/sort/tuplesort.c       2007-09-24 
> 00:03:25.000000000 -0700
> ***************
> *** 341,346 ****
> --- 341,347 ----
>       Relation        indexRel;
>       ScanKey         indexScanKey;
>       bool            enforceUnique;  /* complain if we find duplicate tuples 
> */
> +     uint32          bkt_mask;       /* Bucket mask for hash sort */
>   
>       /*
>        * These variables are specific to the Datum case; they are set by
> ***************
> *** 439,444 ****
> --- 440,448 ----
>   static void reversedirection_heap(Tuplesortstate *state);
>   static int comparetup_index(const SortTuple *a, const SortTuple *b,
>                                Tuplesortstate *state);
> + static int comparetup_hashindex(const SortTuple *a, const SortTuple *b,
> +                                Tuplesortstate *state);
> + 
>   static void copytup_index(Tuplesortstate *state, SortTuple *stup, void 
> *tup);
>   static void writetup_index(Tuplesortstate *state, int tapenum,
>                          SortTuple *stup);
> ***************
> *** 2621,2626 ****
> --- 2625,2673 ----
>                                                               &stup->isnull1);
>   }
>   
> + /*
> +  * Set state parameters for sorting hash index tuples
> +  */
> + void tuplesort_set_hashindex(Tuplesortstate *state)
> + {
> +         state->comparetup = comparetup_hashindex;
> +         state->indexScanKey = NULL; /* Scan key is not needed in hash index 
> */
> +         state->enforceUnique = false;   /* no reason to enforce uniqueness 
> with a hash table */
> + 
> + }
> + 
> + 
> + void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask)
> + {
> +         state->bkt_mask = mask;
> + }
> + 
> + 
> + /* Special compare function called for the hash sort algorithm
> +  * We are comparing hash values of the key, which then are masked to
> +  * determine the proper hash table bucket.
> +  */
> + 
> + 
> + static int comparetup_hashindex(const SortTuple *a, const SortTuple *b, 
> Tuplesortstate *state)
> + {
> +         uint32  bkta, bktb;
> + 
> +         /* Allow interupting long sorts */
> +         CHECK_FOR_INTERRUPTS();
> + 
> +         bkta = (_hash_datum2hashkey(state->indexRel, a->datum1)) & 
> (uint32)state->bkt_mask;
> +         bktb = (_hash_datum2hashkey(state->indexRel, b->datum1)) & 
> (uint32)state->bkt_mask;
> + 
> +         if (bkta > bktb)
> +                 return 1;
> +         else if (bkta < bktb)
> +                 return -1;
> +         else
> +                 return 0;
> + 
> + }
> + 
>   static void
>   reversedirection_heap(Tuplesortstate *state)
>   {
> *** ../../../src/include/access/hash.h.orig   2007-09-23 17:50:40.000000000 
> -0700
> --- ../../../src/include/access/hash.h        2007-09-23 17:40:21.000000000 
> -0700
> ***************
> *** 282,287 ****
> --- 282,297 ----
>                                                               Bucket bucket, 
> BlockNumber bucket_blkno,
>                                                               
> BufferAccessStrategy bstrategy);
>   
> + /*hashsort.c*/
> + typedef struct HSpool HSpool;
> + extern HSpool *h_spoolinit(Relation index);
> + extern void h_spool(IndexTuple itup, HSpool *hspool);
> + extern uint32 h_bkt_num(uint32 tuples, Relation rel);
> + extern void h_set_bkt_mask(HSpool *hspool, uint32 bkts);
> + extern void h_do_sort(HSpool *hspool);
> + extern void h_print_spool(HSpool *hspool);
> + extern void h_spooldestroy(HSpool *hspool);
> + 
>   /* hashpage.c */
>   extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
>   extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int 
> access);
> ***************
> *** 298,304 ****
>   extern void _hash_wrtbuf(Relation rel, Buffer buf);
>   extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
>                                  int to_access);
> ! extern void _hash_metapinit(Relation rel);
>   extern void _hash_pageinit(Page page, Size size);
>   extern void _hash_expandtable(Relation rel, Buffer metabuf);
>   
> --- 308,314 ----
>   extern void _hash_wrtbuf(Relation rel, Buffer buf);
>   extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
>                                  int to_access);
> ! extern void _hash_metapinit(Relation rel, uint32 pages);
>   extern void _hash_pageinit(Page page, Size size);
>   extern void _hash_expandtable(Relation rel, Buffer metabuf);
>   
> ***************
> *** 320,325 ****
> --- 330,336 ----
>   extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
>                                        uint32 highmask, uint32 lowmask);
>   extern uint32 _hash_log2(uint32 num);
> + extern uint32 _hash_log2_floor(uint32 num);
>   extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
>   
>   /* hash.c */
> *** ../../../src/include/utils/tuplesort.h.orig       2007-09-23 
> 18:52:07.000000000 -0700
> --- ../../../src/include/utils/tuplesort.h    2007-09-23 18:59:13.000000000 
> -0700
> ***************
> *** 55,60 ****
> --- 55,63 ----
>                                         Oid sortOperator, bool nullsFirstFlag,
>                                         int workMem, bool randomAccess);
>   
> + extern void tuplesort_set_hashindex(Tuplesortstate *state);
> + extern void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask);
> + 
>   extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
>   
>   extern void tuplesort_puttupleslot(Tuplesortstate *state,

> 
> ---------------------------(end of broadcast)---------------------------
> TIP 4: Have you searched our list archives?
> 
>                http://archives.postgresql.org

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>        http://momjian.us
  EnterpriseDB                             http://postgres.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

---------------------------(end of broadcast)---------------------------
TIP 4: Have you searched our list archives?

               http://archives.postgresql.org

Re: [PATCHES] Hash Index Build Patch

Reply via email to