On 01.03.2013 16:22, Alexander Korotkov wrote:
On Tue, Mar 12, 2013 at 8:03 PM, Heikki Linnakangas<hlinnakan...@vmware.com
wrote:

So, after some hacking, I ended up with this version. I find it more
readable, I hope I didn't miss anything. This seems to produce results that
are close, but not identical, to the original patch. I'm not sure where the
discrepancy is coming from, or which patch is more correct in that respect.
I'll continue from this tomorrow, but if you have the time, please take a
look and let me know what you think.

I've read your explanation and version of patch. In general it seems
correct to me.
There is one point why I have breaked up abstraction in some functions is
infinities. For example, in calc_length_hist_frac one or both of length1
and length2 can be infinity. In the line
frac = area / (length2 - length1);
you can get NaN result. I've especially adjusted the code to get more of
less correct result in this case.

Hmm, good point. I think I managed to fix those cases in the attached version. Is there any other corner case that I missed?

Another minor note about this line
bin_width *= get_position(typcache, lower,&hist_lower[i],
  &hist_lower[i + 1]);
ITSM it sould looks like
bin_width -= 1.0 - get_position(typcache, lower,&hist_lower[i],
   &hist_lower[i + 1]);
Imagine lower and upper bounds fall into same histogram bin. In this case
we should subtract lengths of both parts which were cut in the left and in
the right.

Yes, true. There's one negation too many above, though; should be:

bin_width -= get_position(typcache, lower,&hist_lower[i],
   &hist_lower[i + 1]);

Fixed that. Barring any more issues, I'll read through this once more tomorrow and commit.

- Heikki
*** a/src/backend/utils/adt/rangetypes_selfuncs.c
--- b/src/backend/utils/adt/rangetypes_selfuncs.c
***************
*** 20,25 ****
--- 20,26 ----
  #include "access/htup_details.h"
  #include "catalog/pg_operator.h"
  #include "catalog/pg_statistic.h"
+ #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/rangetypes.h"
  #include "utils/selfuncs.h"
***************
*** 39,44 **** static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
--- 40,60 ----
  			   RangeBound *hist, int hist_length, bool equal);
  static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
  			 RangeBound *hist1, RangeBound *hist2);
+ static float8 get_len_position(double value, double hist1, double hist2);
+ static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
+ 															RangeBound *bound2);
+ static int length_hist_bsearch(Datum *length_hist_values,
+ 					int length_hist_nvalues, double value, bool equal);
+ static double calc_length_hist_frac(Datum *length_hist_values,
+ 									int length_hist_nvalues, double length1, double length2, bool equal);
+ static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
+ 								RangeBound *lower, RangeBound *upper,
+ 								RangeBound *hist_lower, int hist_nvalues,
+ 							Datum *length_hist_values, int length_hist_nvalues);
+ static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
+ 							   RangeBound *lower, RangeBound *upper,
+ 							   RangeBound *hist_lower, int hist_nvalues,
+ 							Datum *length_hist_values, int length_hist_nvalues);
  
  /*
   * Returns a default selectivity estimate for given operator, when we don't
***************
*** 213,219 **** calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
  		/* Try to get fraction of empty ranges */
  		if (get_attstatsslot(vardata->statsTuple,
  							 vardata->atttype, vardata->atttypmod,
! 							 STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
  							 NULL,
  							 NULL, NULL,
  							 &numbers, &nnumbers))
--- 229,235 ----
  		/* Try to get fraction of empty ranges */
  		if (get_attstatsslot(vardata->statsTuple,
  							 vardata->atttype, vardata->atttypmod,
! 							 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, InvalidOid,
  							 NULL,
  							 NULL, NULL,
  							 &numbers, &nnumbers))
***************
*** 332,337 **** calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
--- 348,355 ----
  {
  	Datum	   *hist_values;
  	int			nhist;
+ 	Datum	   *length_hist_values;
+ 	int			length_nhist;
  	RangeBound *hist_lower;
  	RangeBound *hist_upper;
  	int			i;
***************
*** 365,370 **** calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
--- 383,403 ----
  			elog(ERROR, "bounds histogram contains an empty range");
  	}
  
+ 	/* @> and @< also need a histogram of range lengths */
+ 	if (operator == OID_RANGE_CONTAINS_OP ||
+ 		operator == OID_RANGE_CONTAINED_OP)
+ 	{
+ 		if (!(HeapTupleIsValid(vardata->statsTuple) &&
+ 			  get_attstatsslot(vardata->statsTuple,
+ 							   vardata->atttype, vardata->atttypmod,
+ 							   STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ 							   InvalidOid,
+ 							   NULL,
+ 							   &length_hist_values, &length_nhist,
+ 							   NULL, NULL)))
+ 			return -1.0;
+ 	}
+ 
  	/* Extract the bounds of the constant value. */
  	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
  	Assert (!empty);
***************
*** 440,445 **** calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
--- 473,481 ----
  			/*
  			 * A && B <=> NOT (A << B OR A >> B).
  			 *
+ 			 * Since A << B and A >> B are mutually exclusive events we can sum
+ 			 * their probabilities to find probability of (A << B OR A >> B).
+ 			 *
  			 * "range @> elem" is equivalent to "range && [elem,elem]". The
  			 * caller already constructed the singular range from the element
  			 * constant, so just treat it the same as &&.
***************
*** 454,462 **** calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
  			break;
  
  		case OID_RANGE_CONTAINS_OP:
  		case OID_RANGE_CONTAINED_OP:
! 			/* TODO: not implemented yet */
! 			hist_selec = -1.0;
  			break;
  
  		default:
--- 490,525 ----
  			break;
  
  		case OID_RANGE_CONTAINS_OP:
+ 			hist_selec =
+ 				calc_hist_selectivity_contains(typcache, &const_lower,
+ 											&const_upper, hist_lower, nhist,
+ 											length_hist_values, length_nhist);
+ 			break;
+ 
  		case OID_RANGE_CONTAINED_OP:
! 			if (const_lower.infinite)
! 			{
! 				/*
! 				 * Lower bound no longer matters. Just estimate the fraction
! 				 * with an upper bound <= const uppert bound
! 				 */
! 				hist_selec =
! 					calc_hist_selectivity_scalar(typcache, &const_upper,
! 												 hist_upper, nhist, true);
! 			}
! 			else if (const_upper.infinite)
! 			{
! 				hist_selec =
! 					1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
! 													   hist_lower, nhist, false);
! 			}
! 			else
! 			{
! 				hist_selec =
! 					calc_hist_selectivity_contained(typcache, &const_lower,
! 													&const_upper, hist_lower, nhist,
! 													length_hist_values, length_nhist);
! 			}
  			break;
  
  		default:
***************
*** 497,504 **** calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
  
  /*
   * Binary search on an array of range bounds. Returns greatest index of range
!  * bound in array which is less than given range bound. If all range bounds in
!  * array are greater or equal than given range bound, return -1.
   */
  static int
  rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
--- 560,572 ----
  
  /*
   * Binary search on an array of range bounds. Returns greatest index of range
!  * bound in array which is less(less or equal) than given range bound. If all
!  * range bounds in array are greater or equal(greater) than given range bound,
!  * return -1. When "equal" flag is set conditions in brackets are used.
!  *
!  * This function is used in scalar operators selectivity estimation. Another
!  * goal of this function is to found an histogram bin where to stop
!  * interpolation of portion of bounds which are less or equal to given bound.
   */
  static int
  rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
***************
*** 522,527 **** rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
--- 590,625 ----
  	return lower;
  }
  
+ 
+ /*
+  * Binary search on length histogram. Returns greatest index of range length in
+  * histogram which is less than (less than or equal) the given length value. If
+  * all lengths in the histogram are greater than (greater than or equal) the
+  * given length, returns -1.
+  */
+ static int
+ length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
+ 					double value, bool equal)
+ {
+ 	int			lower = -1,
+ 				upper = length_hist_nvalues - 1,
+ 				middle;
+ 
+ 	while (lower < upper)
+ 	{
+ 		double middleval;
+ 
+ 		middle = (lower + upper + 1) / 2;
+ 
+ 		middleval = DatumGetFloat8(length_hist_values[middle]);
+ 		if (middleval < value || (equal && middleval <= value))
+ 			lower = middle;
+ 		else
+ 			upper = middle - 1;
+ 	}
+ 	return lower;
+ }
+ 
  /*
   * Get relative position of value in histogram bin in [0,1] range.
   */
***************
*** 598,600 **** get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
--- 696,1122 ----
  	}
  }
  
+ 
+ /*
+  * Get relative position of value in histogram bin in [0,1] range.
+  */
+ static double
+ get_len_position(double value, double hist1, double hist2)
+ {
+ 	if (!is_infinite(hist1) && !is_infinite(hist2))
+ 	{
+ 		/*
+ 		 * Both bounds are finite. Assuming the subtype's comparison function
+ 		 * works sanely, the value must be finite, too, because it lies
+ 		 * somewhere between the bounds. If it doesn't, just return something.
+ 		 */
+ 		if (is_infinite(value))
+ 			return 0.5;
+ 
+ 		/* Calculate relative position using subdiff function. */
+ 		{
+ 			double d = 1.0 - (hist2 - value) / (hist2 - hist1);
+ 			Assert(!is_infinite(d));
+ 			return d;
+ 		}
+ 	}
+ 	else if (is_infinite(hist1) && !is_infinite(hist2))
+ 	{
+ 		/*
+ 		 * Lower bin boundary is -infinite, upper is finite.
+ 		 * Return 1.0 to indicate the value is infinitely far from the lower
+ 		 * bound.
+ 		 */
+ 		return 1.0;
+ 	}
+ 	else if (is_infinite(hist1) && is_infinite(hist2))
+ 	{
+ 		/* same as above, but in reverse */
+ 		return 0.0;
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * If both bin boundaries are infinite, they should be equal to each
+ 		 * other, and the value should also be infinite and equal to both
+ 		 * bounds. (But don't Assert that, to avoid crashing if a user creates
+ 		 * a datatype with a broken comparison function).
+ 		 *
+ 		 * Assume the value to lie in the middle of the infinite bounds.
+ 		 */
+ 		return 0.5;
+ 	}
+ }
+ 
+ /*
+  * Measure distance between two range bounds.
+  */
+ static float8
+ get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
+ {
+ 	bool	has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
+ 
+ 	if (!bound1->infinite && !bound2->infinite)
+ 	{
+ 		/*
+ 		 * No bounds are infinite, use subdiff function or return default
+ 		 * value of 1.0 if no subdiff is available.
+ 		 */
+ 		if (has_subdiff)
+ 			return
+ 				DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
+ 												 typcache->rng_collation,
+ 												 bound2->val,
+ 												 bound1->val));
+ 		else
+ 			return 1.0;
+ 	}
+ 	else if (bound1->infinite && bound2->infinite)
+ 	{
+ 		/* Both bounds are infinite */
+ 		if (bound1->lower == bound2->lower)
+ 			return 0.0;
+ 		else
+ 			return get_float8_infinity();
+ 	}
+ 	else
+ 	{
+ 		/* One bound is infinite, another is not */
+ 		return get_float8_infinity();
+ 	}
+ }
+ 
+ /*
+  * Calculate the average of function P(x), in the interval [length1, length2],
+  * where P(x) is the fraction of tuples with length < x (or length <= x if
+  * 'equal' is true).
+  */
+ static double
+ calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
+ 					  double length1, double length2, bool equal)
+ {
+ 	double		frac;
+ 	double		A, B, PA, PB;
+ 	double		pos;
+ 	int			i;
+ 	double		area;
+ 
+ 	Assert(length2 >= length1);
+ 
+ 	if (length2 < 0.0)
+ 		return 0.0; /* shouldn't happen, but doesn't hurt to check */
+ 
+ 	if (is_infinite(length2) && equal)
+ 		return 1.0;
+ 
+ 	/*----------
+ 	 * The average of a function between A and B can be calculated by the
+ 	 * formula:
+ 	 *
+ 	 *          B
+ 	 *    1     /
+ 	 * -------  | P(x)dx
+ 	 *  B - A   /
+ 	 *          A
+ 	 *
+ 	 * The geometrical interpretation is that the integral is the area under
+ 	 * the graph of P(x). P(x) is defined by the length histogram. We
+ 	 * calculate the area in a piecewise fashion, iterating through the length
+ 	 * histogram bins. Each bin is a trapezoid:
+ 	 *
+ 	 *       P(x2)
+ 	 *        /|
+ 	 *       / |
+ 	 * P(x1)/  |
+ 	 *     |   |
+ 	 *     |   |
+ 	 *  ---+---+--
+ 	 *     x1  x2
+ 	 *
+ 	 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
+ 	 * and P(x1) are the cumulative fraction of tuples at the boundaries.
+ 	 *
+ 	 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
+ 	 *
+ 	 * The first bin contains the lower bound passed by the caller, so we
+ 	 * use linear interpolation between the previous and next histogram bin
+ 	 * boundary to calculate P(x1). Likewise for the last bin: we use linear
+ 	 * interpolation to calculate P(x2). For the bins in between, x1 and x2
+ 	 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
+ 	 * P(x1) =    (bin index) / (number of bins)
+ 	 * P(x2) = (bin index + 1 / (number of bins)
+ 	 */
+ 
+ 	/* First bin, the one that contains lower bound */
+ 	i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
+ 	if (i >= length_hist_nvalues - 1)
+ 		return 1.0;
+ 
+ 	if (i < 0)
+ 	{
+ 		i = 0;
+ 		pos = 0.0;
+ 	}
+ 	else
+ 	{
+ 		/* interpolate length1's position in the bin */
+ 		pos = get_len_position(length1,
+ 							   DatumGetFloat8(length_hist_values[i]),
+ 							   DatumGetFloat8(length_hist_values[i + 1]));
+ 	}
+ 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
+ 	B = length1;
+ 
+ 	/*
+ 	 * In the degenerate case that length1 == length2, simply return P(length1).
+ 	 * This is not merely an optimization: if length1 == length2, we'd divide
+ 	 * by zero later on.
+ 	 */
+ 	if (length2 == length1)
+ 		return PB;
+ 
+ 	/*
+ 	 * Loop through all the bins, until we hit the last bin, the one that
+ 	 * contains the upper bound. (if lower and upper bounds are in the same
+ 	 * bin, this falls out immediately)
+ 	 */
+ 	area = 0.0;
+ 	for (; i < length_hist_nvalues - 1; i++)
+ 	{
+ 		double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
+ 
+ 		/* check if we've reached the last bin */
+ 		if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
+ 			break;
+ 
+ 		/* the upper bound of previous bin is the lower bound of this bin */
+ 		A = B; PA = PB;
+ 
+ 		B = bin_upper;
+ 		PB = (double) i / (double) (length_hist_nvalues - 1);
+ 
+ 		/*
+ 		 * Add the area of this trapezoid to the total. The point of the
+ 		 * if-check is to avoid NaN, in the corner case that PA == PB == 0, and
+ 		 * B - A == Inf. The area of a zero-height trapezoid (PA == PB == 0) is
+ 		 * zero, regardless of the width (B - A).
+ 		 */
+ 		if (PA > 0 || PB > 0)
+ 			area += 0.5 * (PB + PA) * (B - A);
+ 	}
+ 
+ 	/* Last bin */
+ 	A = B; PA = PB;
+ 
+ 	B = length2; /* last bin ends at the query upper bound */
+ 	if (i >= length_hist_nvalues - 1)
+ 		pos = 0.0;
+ 	else
+ 	{
+ 		if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
+ 			pos = 0.0;
+ 		else
+ 			pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
+ 	}
+ 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
+ 
+ 	if (PA > 0 || PB > 0)
+ 		area += 0.5 * (PB + PA) * (B - A);
+ 
+ 	/*
+ 	 * Ok, we have calculated the area, ie. the integral. Divide by width to
+ 	 * get the requested average.
+ 	 */
+ 	frac = area / (length2 - length1);
+ 
+ 	return frac;
+ }
+ 
+ /*
+  * Calculate selectivity of "<@" operator using histograms of range lower bounds
+  * and histogram of range lengths.
+  *
+  * Note, this is "var <@ const", ie. estimate the fraction of ranges that
+  * fall within the constant lower and upper bounds.
+  *
+  * The caller has already checked that lower and upper are finite
+  */
+ static double
+ calc_hist_selectivity_contained(TypeCacheEntry *typcache,
+ 								RangeBound *lower, RangeBound *upper,
+ 								RangeBound *hist_lower,	int hist_nvalues,
+ 								Datum *length_hist_values, int length_hist_nvalues)
+ {
+ 	int			i,
+ 				upper_index;
+ 	float8		prev_dist;
+ 	double		bin_width;
+ 	double		upper_bin_width;
+ 	double		sum_frac;
+ 
+ 	/*
+ 	 * Begin by finding the bin containing the upper bound, in the lower bound
+ 	 * histogram. Any range with a lower bound > constant upper bound can't
+ 	 * match, ie. there are no matches in bins greater than upper_index.
+ 	 */
+ 	upper->inclusive = !upper->inclusive;
+ 	upper->lower = true;
+ 	upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
+ 								 false);
+ 
+ 	/*
+ 	 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
+ 	 * upper_index + 1) bin which is greater than upper bound of query range
+ 	 * using linear interpolation of subdiff function.
+ 	 */
+ 	if (upper_index >= 0 && upper_index < hist_nvalues - 1)
+ 		upper_bin_width = get_position(typcache, upper,
+ 									   &hist_lower[upper_index],
+ 									   &hist_lower[upper_index + 1]);
+ 	else
+ 		upper_bin_width = 0.0;
+ 
+ 	/*
+ 	 * In the loop, dist and prev_dist are the distance of the "current" bin's
+ 	 * lower and upper bounds from the constant upper bound.
+ 	 *
+ 	 * bin_width represents the width of the current bin. Normally it is 1.0,
+ 	 * meaning a full width bin, but can be less in the corner cases: start
+ 	 * and end of the loop. We start with bin_width = upper_bin_width, because
+ 	 * we begin at the bin containing the upper bound.
+ 	 */
+ 	prev_dist = 0.0;
+ 	bin_width = upper_bin_width;
+ 
+ 	sum_frac = 0.0;
+ 	for (i = upper_index; i >= 0; i--)
+ 	{
+ 		double		dist;
+ 		double		length_hist_frac;
+ 		bool		final_bin = false;
+ 
+ 		/*
+ 		 * dist -- distance from upper bound of query range to lower bound of
+ 		 * the current bin in the lower bound histogram. Or to the lower bound
+ 		 * of the constant range, if this is the final bin, containing the
+ 		 * constant lower bound.
+ 		 */
+ 		if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
+ 		{
+ 			dist = get_distance(typcache, lower, upper);
+ 			/*
+ 			 * Subtract from bin_width the portion of this bin that we want
+ 			 * to ignore.
+ 			 */
+ 			bin_width -= get_position(typcache, lower, &hist_lower[i],
+ 									  &hist_lower[i + 1]);
+ 			if (bin_width < 0.0)
+ 				bin_width = 0.0;
+ 			final_bin = true;
+ 		}
+ 		else
+ 			dist = get_distance(typcache, &hist_lower[i], upper);
+ 
+ 		/*
+ 		 * Estimate the fraction of tuples in this bin that are narrow enough
+ 		 * to not exceed the distance to the upper bound of the query range.
+ 		 */
+ 		length_hist_frac = calc_length_hist_frac(length_hist_values,
+ 												 length_hist_nvalues,
+ 												 prev_dist, dist, true);
+ 
+ 		/*
+ 		 * Add the fraction of tuples in this bin, with a suitable length,
+ 		 * to the total.
+ 		 */
+ 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
+ 
+ 		if (final_bin)
+ 			break;
+ 
+ 		bin_width = 1.0;
+ 		prev_dist = dist;
+ 	}
+ 
+ 	return sum_frac;
+ }
+ 
+ /*
+  * Calculate selectivity of "@>" operator using histograms of range lower bounds
+  * and histogram of range lengths.
+  *
+  * Note, this is "var @> const", ie. estimate the fraction of ranges that
+  * contain the constant lower and upper bounds.
+  */
+ static double
+ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
+ 							   RangeBound *lower, RangeBound *upper,
+ 							   RangeBound *hist_lower, int hist_nvalues,
+ 							   Datum *length_hist_values, int length_hist_nvalues)
+ {
+ 	int			i,
+ 				lower_index;
+ 	double		bin_width,
+ 				lower_bin_width;
+ 	double		sum_frac;
+ 	float8		prev_dist;
+ 
+ 	/* Find the bin containing the lower bound of query range. */
+ 	lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
+ 								 true);
+ 
+ 	/*
+ 	 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
+ 	 * lower_index + 1) bin which is greater than lower bound of query range
+ 	 * using linear interpolation of subdiff function.
+ 	 */
+ 	if (lower_index >= 0 && lower_index < hist_nvalues - 1)
+ 		lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
+ 								  &hist_lower[lower_index + 1]);
+ 	else
+ 		lower_bin_width = 0.0;
+ 
+ 	/*
+ 	 * Loop through all the lower bound bins, smaller than the query lower
+ 	 * bound. In the loop, dist and prev_dist are the distance of the "current"
+ 	 * bin's lower and upper bounds from the constant upper bound. We begin
+ 	 * from query lower bound, and walk backwards, so the first bin's upper
+ 	 * bound is the query lower bound, and its distance to the query upper
+ 	 * bound is the length of the query range.
+ 	 *
+ 	 * bin_width represents the width of the current bin. Normally it is 1.0,
+ 	 * meaning a full width bin, except for the first bin, which is only
+ 	 * counted up to the constant lower bound.
+ 	 */
+ 	prev_dist = get_distance(typcache, lower, upper);
+ 	sum_frac = 0.0;
+ 	bin_width = lower_bin_width;
+ 	for (i = lower_index; i >= 0; i--)
+ 	{
+ 		float8		dist;
+ 		double		length_hist_frac;
+ 
+ 		/*
+ 		 * dist -- distance from upper bound of query range to current
+ 		 * value of lower bound histogram or lower bound of query range (if
+ 		 * we've reach it).
+ 		 */
+ 		dist = get_distance(typcache, &hist_lower[i], upper);
+ 
+ 		/*
+ 		 * Get average fraction of length histogram which covers intervals
+ 		 * longer than (or equal to) distance to upper bound of query range.
+ 		 */
+ 		length_hist_frac =
+ 			1.0 - calc_length_hist_frac(length_hist_values,
+ 										length_hist_nvalues,
+ 										prev_dist, dist, false);
+ 
+ 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
+ 
+ 		bin_width = 1.0;
+ 		prev_dist = dist;
+ 	}
+ 
+ 	return sum_frac;
+ }
*** a/src/backend/utils/adt/rangetypes_typanalyze.c
--- b/src/backend/utils/adt/rangetypes_typanalyze.c
***************
*** 29,34 ****
--- 29,36 ----
  #include "utils/builtins.h"
  #include "utils/rangetypes.h"
  
+ static int float8_qsort_cmp(const void *a1, const void *a2);
+ static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
  static void compute_range_stats(VacAttrStats *stats,
  		   AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
  
***************
*** 57,62 **** range_typanalyze(PG_FUNCTION_ARGS)
--- 59,84 ----
  }
  
  /*
+  * Comparison function for float8 which are used for measurement of range
+  * size.
+  */
+ static int
+ float8_qsort_cmp(const void *a1, const void *a2)
+ {
+ 	const float8 *f1,
+ 			   *f2;
+ 
+ 	f1 = (const float8 *) a1;
+ 	f2 = (const float8 *) a2;
+ 	if (*f1 < *f2)
+ 		return -1;
+ 	else if (*f1 == *f2)
+ 		return 0;
+ 	else
+ 		return 1;
+ }
+ 
+ /*
   * Comparison function for sorting RangeBounds.
   */
  static int
***************
*** 77,82 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 99,105 ----
  					int samplerows, double totalrows)
  {
  	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
+ 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
  	int			null_cnt = 0;
  	int			non_null_cnt = 0;
  	int			non_empty_cnt = 0;
***************
*** 85,94 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 108,119 ----
  	int			slot_idx;
  	int			num_bins = stats->attr->attstattarget;
  	int			num_hist;
+ 	float8	   *lengths;
  	RangeBound *lowers, *uppers;
  	double		total_width = 0;
  
  	/* Allocate memory for arrays of range bounds. */
+ 	lengths = (float8 *) palloc(sizeof(float8) * samplerows);
  	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
  	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
  
***************
*** 101,106 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 126,132 ----
  		RangeType  *range;
  		RangeBound	lower,
  					upper;
+ 		float8		length;
  
  		vacuum_delay_point();
  
***************
*** 122,127 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 148,179 ----
  		range = DatumGetRangeType(value);
  		range_deserialize(typcache, range, &lower, &upper, &empty);
  
+ 		if (empty)
+ 		{
+ 			/* Length of empty range is zero */
+ 			length = 0.0;
+ 		}
+ 		else if (lower.infinite || upper.infinite)
+ 		{
+ 			/* Length of any kind of infinite range is initite */
+ 			length = get_float8_infinity();
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * For ordinal range use subdiff function between upper and lower
+ 			 * bound values or use default value of 1.0 if no subdiff is
+ 			 * available.
+ 			 */
+ 			if (has_subdiff)
+ 				length = DatumGetFloat8(FunctionCall2Coll(
+ 												&typcache->rng_subdiff_finfo,
+ 												typcache->rng_collation,
+ 												upper.val, lower.val));
+ 			else
+ 				length = 1.0;
+ 		}
+ 
  		if (!empty)
  		{
  			/* Fill bound values for further usage in histograms */
***************
*** 132,137 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 184,190 ----
  		else
  			empty_cnt++;
  
+ 		lengths[non_null_cnt] = length;
  		non_null_cnt++;
  	}
  
***************
*** 141,146 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
--- 194,200 ----
  	if (non_null_cnt > 0)
  	{
  		Datum	   *bound_hist_values;
+ 		Datum	   *length_hist_values;
  		int			pos,
  					posfrac,
  					delta,
***************
*** 210,221 **** compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
  			slot_idx++;
  		}
  
  		/* Store the fraction of empty ranges */
  		emptyfrac = (float4 *) palloc(sizeof(float4));
  		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
! 		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC;
  		stats->stanumbers[slot_idx] = emptyfrac;
  		stats->numnumbers[slot_idx] = 1;
  		slot_idx++;
  
  		MemoryContextSwitchTo(old_cxt);
--- 264,333 ----
  			slot_idx++;
  		}
  
+ 		/*
+ 		 * Generate a length histogram slot entry if we've collected at least
+ 		 * two length values which can be not distinct.
+ 		 */
+ 		if (non_null_cnt >= 2)
+ 		{
+ 			/*
+ 			 * Ascending sort of range lengths for further filling of
+ 			 * histogram
+ 			 */
+ 			qsort(lengths, non_null_cnt, sizeof(float8), float8_qsort_cmp);
+ 
+ 			num_hist = non_null_cnt;
+ 			if (num_hist > num_bins)
+ 				num_hist = num_bins + 1;
+ 
+ 			length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
+ 
+ 			/*
+ 			 * The object of this loop is to copy the first and last lengths[]
+ 			 * entries along with evenly-spaced values in between. So the i'th
+ 			 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
+ 			 * computing that subscript directly risks integer overflow when the
+ 			 * stats target is more than a couple thousand.  Instead we add
+ 			 * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
+ 			 * integral and fractional parts of the sum separately.
+ 			 */
+ 			delta = (non_null_cnt - 1) / (num_hist - 1);
+ 			deltafrac = (non_null_cnt - 1) % (num_hist - 1);
+ 			pos = posfrac = 0;
+ 
+ 			for (i = 0; i < num_hist; i++)
+ 			{
+ 				length_hist_values[i] = Float8GetDatum(lengths[pos]);
+ 				pos += delta;
+ 				posfrac += deltafrac;
+ 				if (posfrac >= (num_hist - 1))
+ 				{
+ 					/* fractional part exceeds 1, carry to integer part */
+ 					pos++;
+ 					posfrac -= (num_hist - 1);
+ 				}
+ 			}
+ 			stats->staop[slot_idx] = Float8LessOperator;
+ 			stats->stavalues[slot_idx] = length_hist_values;
+ 			stats->numvalues[slot_idx] = num_hist;
+ 			/* We are storing float8 values */
+ 			stats->statypid[slot_idx] = FLOAT8OID;
+ 			stats->statyplen[slot_idx] = sizeof(float8);
+ #ifdef USE_FLOAT8_BYVAL
+ 			stats->statypbyval[slot_idx] = true;
+ #else
+ 			stats->statypbyval[slot_idx] = false;
+ #endif
+ 			stats->statypalign[slot_idx] = 'd';
+ 		}
+ 
  		/* Store the fraction of empty ranges */
  		emptyfrac = (float4 *) palloc(sizeof(float4));
  		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
! 
  		stats->stanumbers[slot_idx] = emptyfrac;
  		stats->numnumbers[slot_idx] = 1;
+ 		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
  		slot_idx++;
  
  		MemoryContextSwitchTo(old_cxt);
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 527,532 **** DATA(insert OID = 671 (  "<>"	   PGNSP PGUID b f f	701  701	16 671 670 float8ne
--- 527,533 ----
  DESCR("not equal");
  DATA(insert OID = 672 (  "<"	   PGNSP PGUID b f f	701  701	16 674 675 float8lt scalarltsel scalarltjoinsel ));
  DESCR("less than");
+ #define Float8LessOperator	672
  DATA(insert OID = 673 (  "<="	   PGNSP PGUID b f f	701  701	16 675 674 float8le scalarltsel scalarltjoinsel ));
  DESCR("less than or equal");
  DATA(insert OID = 674 (  ">"	   PGNSP PGUID b f f	701  701	16 672 673 float8gt scalargtsel scalargtjoinsel ));
*** a/src/include/catalog/pg_statistic.h
--- b/src/include/catalog/pg_statistic.h
***************
*** 269,279 **** typedef FormData_pg_statistic *Form_pg_statistic;
  #define STATISTIC_KIND_DECHIST	5
  
  /*
!  * An "empty frac" slot describes the fraction of empty ranges in a range-type
!  * column.  stavalues is not used and should be NULL.  stanumbers contains a
!  * single entry, the fraction of empty ranges (0.0 to 1.0).
   */
! #define STATISTIC_KIND_RANGE_EMPTY_FRAC  6
  
  /*
   * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
--- 269,283 ----
  #define STATISTIC_KIND_DECHIST	5
  
  /*
!  * A "length histogram" slot describes the distribution of range lengths in
!  * rows of a range-type column. stanumbers contains a single entry, the
!  * fraction of empty ranges. stavalues is a histogram of non-empty lengths, in
!  * a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) range
!  * values that divide the column data values into M-1 bins of approximately
!  * equal population. The lengths are stores as float8s, as measured by the
!  * range type's subdiff function. Only non-null rows are considered.
   */
! #define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM  6
  
  /*
   * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to