On Sat, Nov 20, 2010 at 6:46 AM, Robert Haas <[email protected]> wrote:
> Well, the problem with just comparing on < is that it takes very
> little account of the upper bounds. I think the cases where a simple
> split would hurt you the most are those where examining the upper
> bound is necessary to to get a good split.
Yes, also such asymmetric solution seems not beautiful enough for me :).
It's easy to sort segs by their center, in this case lower and upper bound
will be used equally. New patch is attached. I checked it on various data
distributions.
1) Uniform distribution
test=# insert into seg_test (select (a || ' .. ' || a + 0.00005*b)::seg from
(select random() as a, random() as b from generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 79121,830 ms
test=# create index seg_test_idx on seg_test using gist (a);
CREATE INDEX
Time: 176409,434 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '0.5 ..
0.5'::seg;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on seg_test (cost=28.19..2500.32 rows=1000 width=12)
(actual time=0.251..0.886 rows=27 loops=1)
Recheck Cond: (a @> '0.5'::seg)
Buffers: shared hit=3 read=27
-> Bitmap Index Scan on seg_test_idx (cost=0.00..27.94 rows=1000
width=0) (actual time=0.193..0.193 rows=27 loops=1)
Index Cond: (a @> '0.5'::seg)
Buffers: shared hit=3
Total runtime: 1.091 ms
(7 rows)
Time: 41,884 ms
2) Natural distribution (Box–Muller transform was used for data generation)
test=# insert into seg_test (select ( a - 0.00005*abs(b) || ' .. ' || a +
0.00005*abs(b))::seg from (select
cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as a,
cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as b from
generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 98614,305 ms
test=# create index seg_test_idx on seg_test using gist(a);
CREATE INDEX
Time: 212513,540 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '0.3 ..
0.3'::seg;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on seg_test (cost=28.18..2500.31 rows=1000 width=12)
(actual time=0.132..0.428 rows=27 loops=1)
Recheck Cond: (a @> '0.3'::seg)
Buffers: shared hit=3 read=27
-> Bitmap Index Scan on seg_test_idx (cost=0.00..27.93 rows=1000
width=0) (actual time=0.103..0.103 rows=27 loops=1)
Index Cond: (a @> '0.3'::seg)
Buffers: shared hit=3
Total runtime: 0.504 ms
(7 rows)
Time: 0,967 ms
3) Many distinct values
test=# insert into seg_test (select (a||'..'||(a+1))::seg from (select
(random()*13000)::integer as a from generate_series(1,1000000)) x);
INSERT 0 1000000
Time: 90775,952 ms
test=# create index seg_test_idx on seg_test using gist(a);
CREATE INDEX
Time: 200960,758 ms
test=# explain (buffers, analyze) select * from seg_test where a @> '700.0
.. 700.0'::seg;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on seg_test (cost=28.19..2500.33 rows=1000 width=12)
(actual time=0.358..3.531 rows=138 loops=1)
Recheck Cond: (a @> '700.0'::seg)
Buffers: shared hit=3 read=135
-> Bitmap Index Scan on seg_test_idx (cost=0.00..27.94 rows=1000
width=0) (actual time=0.270..0.270 rows=138 loops=1)
Index Cond: (a @> '700.0'::seg)
Buffers: shared hit=3
Total runtime: 3.882 ms
(7 rows)
Time: 5,271 ms
----
With best regards,
Alexander Korotkov.
*** a/contrib/seg/seg.c
--- b/contrib/seg/seg.c
***************
*** 292,329 **** gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
return (result);
}
/*
** The GiST PickSplit method for segments
! ** We use Guttman's poly time split algorithm
*/
GIST_SPLITVEC *
gseg_picksplit(GistEntryVector *entryvec,
GIST_SPLITVEC *v)
{
! OffsetNumber i,
! j;
! SEG *datum_alpha,
! *datum_beta;
SEG *datum_l,
! *datum_r;
! SEG *union_d,
! *union_dl,
! *union_dr;
! SEG *inter_d;
! bool firsttime;
! float size_alpha,
! size_beta,
! size_union,
! size_inter;
! float size_waste,
! waste;
! float size_l,
! size_r;
int nbytes;
! OffsetNumber seed_1 = 1,
! seed_2 = 2;
OffsetNumber *left,
*right;
OffsetNumber maxoff;
--- 292,340 ----
return (result);
}
+ /*
+ * Auxiliary structure for picksplit method.
+ */
+ typedef struct
+ {
+ OffsetNumber index;
+ float center;
+ SEG *data;
+ } PickSplitSortItem;
+ /*
+ * Compare function for PickSplitSortItem based on seg_cmp.
+ */
+ static int
+ sort_item_cmp(const void *a, const void *b)
+ {
+ PickSplitSortItem *i1 = (PickSplitSortItem *)a;
+ PickSplitSortItem *i2 = (PickSplitSortItem *)b;
+ if (i1->center < i2->center)
+ return -1;
+ else if (i1->center == i2->center)
+ return 0;
+ else
+ return 1;
+ }
/*
** The GiST PickSplit method for segments
! ** Algorithm based on sorting. Incoming array of segs is sorting using seg_cmp
! ** function. After that first half of segs goes to the left datum, and the
! ** second half of segs goes to the right datum.
*/
GIST_SPLITVEC *
gseg_picksplit(GistEntryVector *entryvec,
GIST_SPLITVEC *v)
{
! OffsetNumber i;
SEG *datum_l,
! *datum_r,
! *seg;
! PickSplitSortItem *sortItems;
int nbytes;
! OffsetNumber seed_2;
OffsetNumber *left,
*right;
OffsetNumber maxoff;
***************
*** 332,443 **** gseg_picksplit(GistEntryVector *entryvec,
fprintf(stderr, "picksplit\n");
#endif
! maxoff = entryvec->n - 2;
! nbytes = (maxoff + 2) * sizeof(OffsetNumber);
v->spl_left = (OffsetNumber *) palloc(nbytes);
v->spl_right = (OffsetNumber *) palloc(nbytes);
! firsttime = true;
! waste = 0.0;
!
! for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
{
! datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
! {
! datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
!
! /* compute the wasted space by unioning these guys */
! /* size_waste = size_union - size_inter; */
! union_d = seg_union(datum_alpha, datum_beta);
! rt_seg_size(union_d, &size_union);
! inter_d = seg_inter(datum_alpha, datum_beta);
! rt_seg_size(inter_d, &size_inter);
! size_waste = size_union - size_inter;
!
! /*
! * are these a more promising split that what we've already seen?
! */
! if (size_waste > waste || firsttime)
! {
! waste = size_waste;
! seed_1 = i;
! seed_2 = j;
! firsttime = false;
! }
! }
}
left = v->spl_left;
v->spl_nleft = 0;
right = v->spl_right;
v->spl_nright = 0;
- datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
- datum_l = seg_union(datum_alpha, datum_alpha);
- rt_seg_size(datum_l, &size_l);
- datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
- datum_r = seg_union(datum_beta, datum_beta);
- rt_seg_size(datum_r, &size_r);
-
/*
! * Now split up the regions between the two seeds. An important property
! * of this split algorithm is that the split vector v has the indices of
! * items to be split in order in its left and right vectors. We exploit
! * this property by doing a merge in the code that actually splits the
! * page.
! *
! * For efficiency, we also place the new index tuple in this loop. This is
! * handled at the very end, when we have placed all the existing tuples
! * and i == maxoff + 1.
*/
!
! maxoff = OffsetNumberNext(maxoff);
! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! /*
! * If we've already decided where to place this item, just put it on
! * the right list. Otherwise, we need to figure out which page needs
! * the least enlargement in order to store the item.
! */
!
! if (i == seed_1)
! {
! *left++ = i;
! v->spl_nleft++;
! continue;
! }
! else if (i == seed_2)
! {
! *right++ = i;
! v->spl_nright++;
! continue;
! }
!
! /* okay, which page needs least enlargement? */
! datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! union_dl = seg_union(datum_l, datum_alpha);
! union_dr = seg_union(datum_r, datum_alpha);
! rt_seg_size(union_dl, &size_alpha);
! rt_seg_size(union_dr, &size_beta);
! /* pick which page to add it to */
! if (size_alpha - size_l < size_beta - size_r)
! {
! datum_l = union_dl;
! size_l = size_alpha;
! *left++ = i;
! v->spl_nleft++;
! }
! else
! {
! datum_r = union_dr;
! size_r = size_alpha;
! *right++ = i;
! v->spl_nright++;
! }
}
- *left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
v->spl_ldatum = PointerGetDatum(datum_l);
v->spl_rdatum = PointerGetDatum(datum_r);
--- 343,403 ----
fprintf(stderr, "picksplit\n");
#endif
! maxoff = entryvec->n - 1;
! nbytes = (maxoff + 1) * sizeof(OffsetNumber);
! sortItems = (PickSplitSortItem *)palloc(maxoff * sizeof(PickSplitSortItem));
v->spl_left = (OffsetNumber *) palloc(nbytes);
v->spl_right = (OffsetNumber *) palloc(nbytes);
! /*
! * Preparing auxiliary array and sorting.
! */
! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! seg = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! sortItems[i - FirstOffsetNumber].index = i;
! sortItems[i - FirstOffsetNumber].data = seg;
! /*
! * In center calculation we're getting half of lower and upper bounds
! * before summation in order to aviod possible overflow.
! */
! sortItems[i - FirstOffsetNumber].center = seg->lower*0.5f + seg->upper*0.5f;
}
+ qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp);
+ seed_2 = maxoff / 2;
left = v->spl_left;
v->spl_nleft = 0;
right = v->spl_right;
v->spl_nright = 0;
/*
! * First half of segs goes to the left datum.
*/
! datum_l = (SEG *)palloc(sizeof(SEG));
! memcpy(datum_l, sortItems[0].data, sizeof(SEG));
! *left++ = sortItems[0].index;
! v->spl_nleft++;
! for (i = 1; i < seed_2; i++)
{
! datum_l = seg_union(datum_l, sortItems[i].data);
! *left++ = sortItems[i].index;
! v->spl_nleft++;
! }
! /*
! * Second half of segs goes to the right datum.
! */
! datum_r = (SEG *)palloc(sizeof(SEG));
! memcpy(datum_r, sortItems[seed_2].data, sizeof(SEG));
! *right++ = sortItems[seed_2].index;
! v->spl_nright++;
! for (i = seed_2 + 1; i < maxoff; i++)
! {
! datum_r = seg_union(datum_r, sortItems[i].data);
! *right++ = sortItems[i].index;
! v->spl_nright++;
}
v->spl_ldatum = PointerGetDatum(datum_l);
v->spl_rdatum = PointerGetDatum(datum_r);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers