Re: [HACKERS] Fix for seg picksplit function

Alexander Korotkov Sat, 20 Nov 2010 05:16:03 -0800

On Sat, Nov 20, 2010 at 6:46 AM, Robert Haas <robertmh...@gmail.com> wrote:


> Well, the problem with just comparing on < is that it takes very
> little account of the upper bounds.  I think the cases where a simple
> split would hurt you the most are those where examining the upper
> bound is necessary to to get a good split.

Yes, also such asymmetric solution seems not beautiful enough for me :).
It's easy to sort segs by their center, in this case lower and upper bound
will be used equally. New patch is attached. I checked it on various data
distributions.

1) Uniform distribution
test=# insert into seg_test (select (a || ' .. ' || a + 0.00005*b)::seg from
(select random() as a, random() as b from generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 79121,830 ms
test=# create index seg_test_idx on seg_test using gist (a);
CREATE INDEX
Time: 176409,434 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '0.5 ..
0.5'::seg;
                                                        QUERY PLAN


--------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on seg_test  (cost=28.19..2500.32 rows=1000 width=12)
(actual time=0.251..0.886 rows=27 loops=1)
   Recheck Cond: (a @> '0.5'::seg)
   Buffers: shared hit=3 read=27
   ->  Bitmap Index Scan on seg_test_idx  (cost=0.00..27.94 rows=1000
width=0) (actual time=0.193..0.193 rows=27 loops=1)
         Index Cond: (a @> '0.5'::seg)
         Buffers: shared hit=3
 Total runtime: 1.091 ms
(7 rows)

Time: 41,884 ms

2) Natural distribution (Box–Muller transform was used for data generation)
test=# insert into seg_test (select ( a - 0.00005*abs(b) || ' .. ' || a +
0.00005*abs(b))::seg from (select
cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as a,
cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as b from
generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 98614,305 ms
test=# create index seg_test_idx on seg_test using gist(a);
CREATE INDEX
Time: 212513,540 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '0.3 ..
0.3'::seg;
                                                        QUERY PLAN


--------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on seg_test  (cost=28.18..2500.31 rows=1000 width=12)
(actual time=0.132..0.428 rows=27 loops=1)
   Recheck Cond: (a @> '0.3'::seg)
   Buffers: shared hit=3 read=27
   ->  Bitmap Index Scan on seg_test_idx  (cost=0.00..27.93 rows=1000
width=0) (actual time=0.103..0.103 rows=27 loops=1)
         Index Cond: (a @> '0.3'::seg)
         Buffers: shared hit=3
 Total runtime: 0.504 ms
(7 rows)

Time: 0,967 ms

3) Many distinct values
test=# insert into seg_test (select (a||'..'||(a+1))::seg from (select
(random()*13000)::integer as a from generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 90775,952 ms
test=# create index seg_test_idx on seg_test using gist(a);
CREATE INDEX
Time: 200960,758 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '700.0
.. 700.0'::seg;
                                                        QUERY PLAN


---------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on seg_test  (cost=28.19..2500.33 rows=1000 width=12)
(actual time=0.358..3.531 rows=138 loops=1)
   Recheck Cond: (a @> '700.0'::seg)
   Buffers: shared hit=3 read=135
   ->  Bitmap Index Scan on seg_test_idx  (cost=0.00..27.94 rows=1000
width=0) (actual time=0.270..0.270 rows=138 loops=1)
         Index Cond: (a @> '700.0'::seg)
         Buffers: shared hit=3
 Total runtime: 3.882 ms
(7 rows)

Time: 5,271 ms

----
With best regards,
Alexander Korotkov.

*** a/contrib/seg/seg.c
--- b/contrib/seg/seg.c
***************
*** 292,329 **** gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
  	return (result);
  }
  
  
  
  /*
  ** The GiST PickSplit method for segments
! ** We use Guttman's poly time split algorithm
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i,
! 				j;
! 	SEG		   *datum_alpha,
! 			   *datum_beta;
  	SEG		   *datum_l,
! 			   *datum_r;
! 	SEG		   *union_d,
! 			   *union_dl,
! 			   *union_dr;
! 	SEG		   *inter_d;
! 	bool		firsttime;
! 	float		size_alpha,
! 				size_beta,
! 				size_union,
! 				size_inter;
! 	float		size_waste,
! 				waste;
! 	float		size_l,
! 				size_r;
  	int			nbytes;
! 	OffsetNumber seed_1 = 1,
! 				seed_2 = 2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
--- 292,340 ----
  	return (result);
  }
  
+ /*
+  * Auxiliary structure for picksplit method.
+  */
+ typedef struct
+ {
+ 	OffsetNumber index;
+ 	float center;
+ 	SEG *data;
+ } PickSplitSortItem;
  
+ /*
+  * Compare function for PickSplitSortItem based on seg_cmp.
+  */
+ static int
+ sort_item_cmp(const void *a, const void *b)
+ {
+ 	PickSplitSortItem *i1 = (PickSplitSortItem *)a;
+ 	PickSplitSortItem *i2 = (PickSplitSortItem *)b;
+ 	if (i1->center < i2->center)
+ 		return -1;
+ 	else if (i1->center == i2->center)
+ 		return 0;
+ 	else
+ 		return 1;
+ }
  
  /*
  ** The GiST PickSplit method for segments
! ** Algorithm based on sorting. Incoming array of segs is sorting using seg_cmp
! ** function. After that first half of segs goes to the left datum, and the
! ** second half of segs goes to the right datum.
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i;
  	SEG		   *datum_l,
! 			   *datum_r,
! 			   *seg;
! 	PickSplitSortItem	*sortItems;
  	int			nbytes;
! 	OffsetNumber seed_2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
***************
*** 332,443 **** gseg_picksplit(GistEntryVector *entryvec,
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 2;
! 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
! 	firsttime = true;
! 	waste = 0.0;
! 
! 	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
  	{
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
! 		{
! 			datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
! 
! 			/* compute the wasted space by unioning these guys */
! 			/* size_waste = size_union - size_inter; */
! 			union_d = seg_union(datum_alpha, datum_beta);
! 			rt_seg_size(union_d, &size_union);
! 			inter_d = seg_inter(datum_alpha, datum_beta);
! 			rt_seg_size(inter_d, &size_inter);
! 			size_waste = size_union - size_inter;
! 
! 			/*
! 			 * are these a more promising split that what we've already seen?
! 			 */
! 			if (size_waste > waste || firsttime)
! 			{
! 				waste = size_waste;
! 				seed_1 = i;
! 				seed_2 = j;
! 				firsttime = false;
! 			}
! 		}
  	}
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
- 	datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
- 	datum_l = seg_union(datum_alpha, datum_alpha);
- 	rt_seg_size(datum_l, &size_l);
- 	datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
- 	datum_r = seg_union(datum_beta, datum_beta);
- 	rt_seg_size(datum_r, &size_r);
- 
  	/*
! 	 * Now split up the regions between the two seeds.	An important property
! 	 * of this split algorithm is that the split vector v has the indices of
! 	 * items to be split in order in its left and right vectors.  We exploit
! 	 * this property by doing a merge in the code that actually splits the
! 	 * page.
! 	 *
! 	 * For efficiency, we also place the new index tuple in this loop. This is
! 	 * handled at the very end, when we have placed all the existing tuples
! 	 * and i == maxoff + 1.
  	 */
! 
! 	maxoff = OffsetNumberNext(maxoff);
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		/*
! 		 * If we've already decided where to place this item, just put it on
! 		 * the right list.	Otherwise, we need to figure out which page needs
! 		 * the least enlargement in order to store the item.
! 		 */
! 
! 		if (i == seed_1)
! 		{
! 			*left++ = i;
! 			v->spl_nleft++;
! 			continue;
! 		}
! 		else if (i == seed_2)
! 		{
! 			*right++ = i;
! 			v->spl_nright++;
! 			continue;
! 		}
! 
! 		/* okay, which page needs least enlargement? */
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		union_dl = seg_union(datum_l, datum_alpha);
! 		union_dr = seg_union(datum_r, datum_alpha);
! 		rt_seg_size(union_dl, &size_alpha);
! 		rt_seg_size(union_dr, &size_beta);
  
! 		/* pick which page to add it to */
! 		if (size_alpha - size_l < size_beta - size_r)
! 		{
! 			datum_l = union_dl;
! 			size_l = size_alpha;
! 			*left++ = i;
! 			v->spl_nleft++;
! 		}
! 		else
! 		{
! 			datum_r = union_dr;
! 			size_r = size_alpha;
! 			*right++ = i;
! 			v->spl_nright++;
! 		}
  	}
- 	*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
  
  	v->spl_ldatum = PointerGetDatum(datum_l);
  	v->spl_rdatum = PointerGetDatum(datum_r);
--- 343,403 ----
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 1;
! 	nbytes = (maxoff + 1) * sizeof(OffsetNumber);
! 	sortItems = (PickSplitSortItem *)palloc(maxoff * sizeof(PickSplitSortItem));
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
! 	/*
! 	 * Preparing auxiliary array and sorting.
! 	 */
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		seg = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		sortItems[i - FirstOffsetNumber].index = i;
! 		sortItems[i - FirstOffsetNumber].data = seg;
! 		/*
! 		 * In center calculation we're getting half of lower and upper bounds
! 		 * before summation in order to aviod possible overflow.
! 		 */
! 		sortItems[i - FirstOffsetNumber].center = seg->lower*0.5f + seg->upper*0.5f;
  	}
+ 	qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp);
+ 	seed_2 = maxoff / 2;
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
  	/*
! 	 * First half of segs goes to the left datum.
  	 */
! 	datum_l = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_l, sortItems[0].data, sizeof(SEG));
! 	*left++ = sortItems[0].index;
! 	v->spl_nleft++;
! 	for (i = 1; i < seed_2; i++)
  	{
! 		datum_l = seg_union(datum_l, sortItems[i].data);
! 		*left++ = sortItems[i].index;
! 		v->spl_nleft++;
! 	}
  
! 	/*
! 	 * Second half of segs goes to the right datum.
! 	 */
! 	datum_r = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_r, sortItems[seed_2].data, sizeof(SEG));
! 	*right++ = sortItems[seed_2].index;
! 	v->spl_nright++;
! 	for (i = seed_2 + 1; i < maxoff; i++)
! 	{
! 		datum_r = seg_union(datum_r, sortItems[i].data);
! 		*right++ = sortItems[i].index;
! 		v->spl_nright++;
  	}
  
  	v->spl_ldatum = PointerGetDatum(datum_l);
  	v->spl_rdatum = PointerGetDatum(datum_r);

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Fix for seg picksplit function

Reply via email to