Hi, Attached is a WIP/PoC fix addressing the O(N^2) behavior in ANALYZE with high statistic target values. It needs more work, but it's good enough to show some measurements.
For benchmark, I've created a simple 2-column table, with MCV list on
those two columns:
CREATE TABLE t (a int, b int);
CREATE STATISTICS s (mcv) ON a,b FROM t;
and then loaded data sets with different numbers of random combinations,
determined by number of values in each column. For example with 10 values
in a column, you get ~100 combinations.
INSERT INTO t
SELECT 10*random(), 10*random() FROM generate_series(1,3e6);
The 3M rows is picked because that's the sample size with target 10000.
The results with different statistic targets look like this:
1) master
values 100 1000 5000 10000
====================================================
10 103 586 2419 3041
100 116 789 4778 8934
1000 116 690 3162 499748
2) patched
values 100 1000 5000 10000
====================================================
10 113 606 2460 3716
100 143 711 3371 5231
1000 156 994 3836 6002
3) comparison (patched / master)
values 100 1000 5000 10000
====================================================
10 110% 103% 102% 122%
100 123% 90% 71% 59%
1000 134% 144% 121% 1%
So clearly, the issue for large statistic targets is resolved (duration
drops from 500s to just 6s), but there is measurable regression for the
other cases. That needs more investigation & fix before commit.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 5fe61ea0a4..6dc4ed37d8 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int
numattrs);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
+static SortItem **compute_column_counts(SortItem *groups, int ngroups,
+
MultiSortSupport mss, int *ncounts);
+
static int count_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss);
@@ -172,6 +175,8 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset
*attrs,
SortItem *groups;
MCVList *mcvlist = NULL;
MultiSortSupport mss;
+ SortItem **counts;
+ int *ncounts;
attnums = build_attnums_array(attrs, &numattrs);
@@ -188,6 +193,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset
*attrs,
/* transform the sorted rows into groups (sorted by frequency) */
groups = build_distinct_groups(nitems, items, mss, &ngroups);
+ /* compute frequencies for values in each column */
+ ncounts = (int *) palloc0(sizeof(int) * numattrs);
+ counts = compute_column_counts(groups, ngroups, mss, ncounts);
+
/*
* Maximum number of MCV items to store, based on the attribute with the
* largest stats target (and the number of groups we have available).
@@ -242,6 +251,16 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset
*attrs,
if (nitems > 0)
{
int j;
+ SortItem key;
+ MultiSortSupport tmp;
+
+ /* used to search values */
+ tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData,
ssup)
+
+ sizeof(SortSupportData));
+
+ /* space for search key */
+ key.values = palloc(sizeof(Datum));
+ key.isnull = palloc(sizeof(bool));
/*
* Allocate the MCV list structure, set the global parameters.
@@ -281,16 +300,21 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset
*attrs,
item->base_frequency = 1.0;
for (j = 0; j < numattrs; j++)
{
- int count = 0;
- int k;
+ SortItem *result;
- for (k = 0; k < ngroups; k++)
- {
- if (multi_sort_compare_dim(j,
&groups[i], &groups[k], mss) == 0)
- count += groups[k].count;
- }
+ /* single dimension */
+ tmp->ndims = 1;
+ tmp->ssup[0] = mss->ssup[j];
+
+ /* fill search key */
+ key.values[0] = groups[i].values[j];
+ key.isnull[0] = groups[i].isnull[j];
+
+ result = (SortItem *) bsearch_arg(&key,
counts[j], ncounts[j],
+
sizeof(SortItem),
+
multi_sort_compare, tmp);
- item->base_frequency *= (double) count /
numrows;
+ item->base_frequency *= ((double)
result->count) / numrows;
}
}
}
@@ -419,6 +443,61 @@ build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss,
return groups;
}
+static SortItem **
+compute_column_counts(SortItem *groups, int ngroups, MultiSortSupport mss,
+ int *ncounts)
+{
+ int i,
+ j;
+ SortItem **result;
+ MultiSortSupport tmp;
+
+ result = (SortItem **) palloc(sizeof(SortItem *) * mss->ndims);
+ tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
+ +
sizeof(SortSupportData));
+
+ for (i = 0; i < mss->ndims; i++)
+ {
+ result[i] = (SortItem *) palloc0(sizeof(SortItem) * ngroups);
+
+ /* single dimension */
+ tmp->ndims = 1;
+ tmp->ssup[0] = mss->ssup[i];
+
+ /* extract data for the dimension */
+ for (j = 0; j < ngroups; j++)
+ {
+ result[i][j].values = palloc(sizeof(Datum));
+ result[i][j].isnull = palloc(sizeof(bool));
+
+ result[i][j].values[0] = groups[j].values[i];
+ result[i][j].isnull[0] = groups[j].isnull[i];
+ result[i][j].count = groups[j].count;
+ }
+
+ /* sort the values, deduplicate */
+ qsort_arg((void *) result[i], ngroups, sizeof(SortItem),
+ multi_sort_compare, tmp);
+
+ ncounts[i] = 1;
+ for (j = 1; j < ngroups; j++)
+ {
+ if (multi_sort_compare(&result[i][(ncounts[i] - 1)],
&result[i][j], tmp) == 0)
+ {
+ result[i][(ncounts[i] - 1)].count +=
result[i][j].count;
+ continue;
+ }
+
+ /* */
+ if (ncounts[i] != j)
+ result[i][ncounts[i]] = result[i][j];
+
+ ncounts[i]++;
+ }
+ }
+
+ return result;
+}
/*
* statext_mcv_load
test.sql
Description: application/sql
