On Sat, Nov 20, 2010 at 6:46 AM, Robert Haas <robertmh...@gmail.com> wrote:
> Well, the problem with just comparing on < is that it takes very > little account of the upper bounds. I think the cases where a simple > split would hurt you the most are those where examining the upper > bound is necessary to to get a good split. Yes, also such asymmetric solution seems not beautiful enough for me :). It's easy to sort segs by their center, in this case lower and upper bound will be used equally. New patch is attached. I checked it on various data distributions. 1) Uniform distribution test=# insert into seg_test (select (a || ' .. ' || a + 0.00005*b)::seg from (select random() as a, random() as b from generate_series(1,1000000)) x); INSERT 0 1000000 Time: 79121,830 ms test=# create index seg_test_idx on seg_test using gist (a); CREATE INDEX Time: 176409,434 ms test=# explain (buffers, analyze) select * from seg_test where a @> '0.5 .. 0.5'::seg; QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- Bitmap Heap Scan on seg_test (cost=28.19..2500.32 rows=1000 width=12) (actual time=0.251..0.886 rows=27 loops=1) Recheck Cond: (a @> '0.5'::seg) Buffers: shared hit=3 read=27 -> Bitmap Index Scan on seg_test_idx (cost=0.00..27.94 rows=1000 width=0) (actual time=0.193..0.193 rows=27 loops=1) Index Cond: (a @> '0.5'::seg) Buffers: shared hit=3 Total runtime: 1.091 ms (7 rows) Time: 41,884 ms 2) Natural distribution (Box–Muller transform was used for data generation) test=# insert into seg_test (select ( a - 0.00005*abs(b) || ' .. ' || a + 0.00005*abs(b))::seg from (select cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as a, cos(2.0*pi()*random())*sqrt(-2.0*ln(random())) as b from generate_series(1,1000000)) x); INSERT 0 1000000 Time: 98614,305 ms test=# create index seg_test_idx on seg_test using gist(a); CREATE INDEX Time: 212513,540 ms test=# explain (buffers, analyze) select * from seg_test where a @> '0.3 .. 0.3'::seg; QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- Bitmap Heap Scan on seg_test (cost=28.18..2500.31 rows=1000 width=12) (actual time=0.132..0.428 rows=27 loops=1) Recheck Cond: (a @> '0.3'::seg) Buffers: shared hit=3 read=27 -> Bitmap Index Scan on seg_test_idx (cost=0.00..27.93 rows=1000 width=0) (actual time=0.103..0.103 rows=27 loops=1) Index Cond: (a @> '0.3'::seg) Buffers: shared hit=3 Total runtime: 0.504 ms (7 rows) Time: 0,967 ms 3) Many distinct values test=# insert into seg_test (select (a||'..'||(a+1))::seg from (select (random()*13000)::integer as a from generate_series(1,1000000)) x); INSERT 0 1000000 Time: 90775,952 ms test=# create index seg_test_idx on seg_test using gist(a); CREATE INDEX Time: 200960,758 ms test=# explain (buffers, analyze) select * from seg_test where a @> '700.0 .. 700.0'::seg; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- Bitmap Heap Scan on seg_test (cost=28.19..2500.33 rows=1000 width=12) (actual time=0.358..3.531 rows=138 loops=1) Recheck Cond: (a @> '700.0'::seg) Buffers: shared hit=3 read=135 -> Bitmap Index Scan on seg_test_idx (cost=0.00..27.94 rows=1000 width=0) (actual time=0.270..0.270 rows=138 loops=1) Index Cond: (a @> '700.0'::seg) Buffers: shared hit=3 Total runtime: 3.882 ms (7 rows) Time: 5,271 ms ---- With best regards, Alexander Korotkov.
*** a/contrib/seg/seg.c --- b/contrib/seg/seg.c *************** *** 292,329 **** gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result) return (result); } /* ** The GiST PickSplit method for segments ! ** We use Guttman's poly time split algorithm */ GIST_SPLITVEC * gseg_picksplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) { ! OffsetNumber i, ! j; ! SEG *datum_alpha, ! *datum_beta; SEG *datum_l, ! *datum_r; ! SEG *union_d, ! *union_dl, ! *union_dr; ! SEG *inter_d; ! bool firsttime; ! float size_alpha, ! size_beta, ! size_union, ! size_inter; ! float size_waste, ! waste; ! float size_l, ! size_r; int nbytes; ! OffsetNumber seed_1 = 1, ! seed_2 = 2; OffsetNumber *left, *right; OffsetNumber maxoff; --- 292,340 ---- return (result); } + /* + * Auxiliary structure for picksplit method. + */ + typedef struct + { + OffsetNumber index; + float center; + SEG *data; + } PickSplitSortItem; + /* + * Compare function for PickSplitSortItem based on seg_cmp. + */ + static int + sort_item_cmp(const void *a, const void *b) + { + PickSplitSortItem *i1 = (PickSplitSortItem *)a; + PickSplitSortItem *i2 = (PickSplitSortItem *)b; + if (i1->center < i2->center) + return -1; + else if (i1->center == i2->center) + return 0; + else + return 1; + } /* ** The GiST PickSplit method for segments ! ** Algorithm based on sorting. Incoming array of segs is sorting using seg_cmp ! ** function. After that first half of segs goes to the left datum, and the ! ** second half of segs goes to the right datum. */ GIST_SPLITVEC * gseg_picksplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) { ! OffsetNumber i; SEG *datum_l, ! *datum_r, ! *seg; ! PickSplitSortItem *sortItems; int nbytes; ! OffsetNumber seed_2; OffsetNumber *left, *right; OffsetNumber maxoff; *************** *** 332,443 **** gseg_picksplit(GistEntryVector *entryvec, fprintf(stderr, "picksplit\n"); #endif ! maxoff = entryvec->n - 2; ! nbytes = (maxoff + 2) * sizeof(OffsetNumber); v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); ! firsttime = true; ! waste = 0.0; ! ! for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) { ! datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key); ! for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) ! { ! datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key); ! ! /* compute the wasted space by unioning these guys */ ! /* size_waste = size_union - size_inter; */ ! union_d = seg_union(datum_alpha, datum_beta); ! rt_seg_size(union_d, &size_union); ! inter_d = seg_inter(datum_alpha, datum_beta); ! rt_seg_size(inter_d, &size_inter); ! size_waste = size_union - size_inter; ! ! /* ! * are these a more promising split that what we've already seen? ! */ ! if (size_waste > waste || firsttime) ! { ! waste = size_waste; ! seed_1 = i; ! seed_2 = j; ! firsttime = false; ! } ! } } left = v->spl_left; v->spl_nleft = 0; right = v->spl_right; v->spl_nright = 0; - datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key); - datum_l = seg_union(datum_alpha, datum_alpha); - rt_seg_size(datum_l, &size_l); - datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key); - datum_r = seg_union(datum_beta, datum_beta); - rt_seg_size(datum_r, &size_r); - /* ! * Now split up the regions between the two seeds. An important property ! * of this split algorithm is that the split vector v has the indices of ! * items to be split in order in its left and right vectors. We exploit ! * this property by doing a merge in the code that actually splits the ! * page. ! * ! * For efficiency, we also place the new index tuple in this loop. This is ! * handled at the very end, when we have placed all the existing tuples ! * and i == maxoff + 1. */ ! ! maxoff = OffsetNumberNext(maxoff); ! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ! /* ! * If we've already decided where to place this item, just put it on ! * the right list. Otherwise, we need to figure out which page needs ! * the least enlargement in order to store the item. ! */ ! ! if (i == seed_1) ! { ! *left++ = i; ! v->spl_nleft++; ! continue; ! } ! else if (i == seed_2) ! { ! *right++ = i; ! v->spl_nright++; ! continue; ! } ! ! /* okay, which page needs least enlargement? */ ! datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key); ! union_dl = seg_union(datum_l, datum_alpha); ! union_dr = seg_union(datum_r, datum_alpha); ! rt_seg_size(union_dl, &size_alpha); ! rt_seg_size(union_dr, &size_beta); ! /* pick which page to add it to */ ! if (size_alpha - size_l < size_beta - size_r) ! { ! datum_l = union_dl; ! size_l = size_alpha; ! *left++ = i; ! v->spl_nleft++; ! } ! else ! { ! datum_r = union_dr; ! size_r = size_alpha; ! *right++ = i; ! v->spl_nright++; ! } } - *left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */ v->spl_ldatum = PointerGetDatum(datum_l); v->spl_rdatum = PointerGetDatum(datum_r); --- 343,403 ---- fprintf(stderr, "picksplit\n"); #endif ! maxoff = entryvec->n - 1; ! nbytes = (maxoff + 1) * sizeof(OffsetNumber); ! sortItems = (PickSplitSortItem *)palloc(maxoff * sizeof(PickSplitSortItem)); v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); ! /* ! * Preparing auxiliary array and sorting. ! */ ! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ! seg = (SEG *) DatumGetPointer(entryvec->vector[i].key); ! sortItems[i - FirstOffsetNumber].index = i; ! sortItems[i - FirstOffsetNumber].data = seg; ! /* ! * In center calculation we're getting half of lower and upper bounds ! * before summation in order to aviod possible overflow. ! */ ! sortItems[i - FirstOffsetNumber].center = seg->lower*0.5f + seg->upper*0.5f; } + qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp); + seed_2 = maxoff / 2; left = v->spl_left; v->spl_nleft = 0; right = v->spl_right; v->spl_nright = 0; /* ! * First half of segs goes to the left datum. */ ! datum_l = (SEG *)palloc(sizeof(SEG)); ! memcpy(datum_l, sortItems[0].data, sizeof(SEG)); ! *left++ = sortItems[0].index; ! v->spl_nleft++; ! for (i = 1; i < seed_2; i++) { ! datum_l = seg_union(datum_l, sortItems[i].data); ! *left++ = sortItems[i].index; ! v->spl_nleft++; ! } ! /* ! * Second half of segs goes to the right datum. ! */ ! datum_r = (SEG *)palloc(sizeof(SEG)); ! memcpy(datum_r, sortItems[seed_2].data, sizeof(SEG)); ! *right++ = sortItems[seed_2].index; ! v->spl_nright++; ! for (i = seed_2 + 1; i < maxoff; i++) ! { ! datum_r = seg_union(datum_r, sortItems[i].data); ! *right++ = sortItems[i].index; ! v->spl_nright++; } v->spl_ldatum = PointerGetDatum(datum_l); v->spl_rdatum = PointerGetDatum(datum_r);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers