Repository: spark
Updated Branches:
  refs/heads/master 13e489b67 -> bdb5e55c2


[SPARK-21322][SQL][FOLLOWUP] support histogram in filter cardinality estimation

## What changes were proposed in this pull request?

some code cleanup/refactor and naming improvement.

## How was this patch tested?

existing tests

Author: Wenchen Fan <wenc...@databricks.com>

Closes #19952 from cloud-fan/minor.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdb5e55c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdb5e55c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdb5e55c

Branch: refs/heads/master
Commit: bdb5e55c2a67d16a36ad6baa22296d714d3525af
Parents: 13e489b
Author: Wenchen Fan <wenc...@databricks.com>
Authored: Wed Dec 13 14:49:15 2017 +0800
Committer: Wenchen Fan <wenc...@databricks.com>
Committed: Wed Dec 13 14:49:15 2017 +0800

----------------------------------------------------------------------
 .../statsEstimation/EstimationUtils.scala       | 109 ++++++-------
 .../statsEstimation/FilterEstimation.scala      | 152 ++++++++++---------
 2 files changed, 134 insertions(+), 127 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bdb5e55c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
index 2f416f2..6f868cb 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
@@ -115,14 +115,10 @@ object EstimationUtils {
   }
 
   /**
-   * Returns the number of the first bin into which a column value falls for a 
specified
+   * Returns the index of the first bin into which the given value falls for a 
specified
    * numeric equi-height histogram.
-   *
-   * @param value a literal value of a column
-   * @param bins an array of bins for a given numeric equi-height histogram
-   * @return the id of the first bin into which a column value falls.
    */
-  def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+  private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): 
Int = {
     var i = 0
     while ((i < bins.length) && (value > bins(i).hi)) {
       i += 1
@@ -131,14 +127,10 @@ object EstimationUtils {
   }
 
   /**
-   * Returns the number of the last bin into which a column value falls for a 
specified
+   * Returns the index of the last bin into which the given value falls for a 
specified
    * numeric equi-height histogram.
-   *
-   * @param value a literal value of a column
-   * @param bins an array of bins for a given numeric equi-height histogram
-   * @return the id of the last bin into which a column value falls.
    */
-  def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+  private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): 
Int = {
     var i = bins.length - 1
     while ((i >= 0) && (value < bins(i).lo)) {
       i -= 1
@@ -147,65 +139,76 @@ object EstimationUtils {
   }
 
   /**
-   * Returns a percentage of a bin holding values for column value in the 
range of
-   * [lowerValue, higherValue]
-   *
-   * @param higherValue a given upper bound value of a specified column value 
range
-   * @param lowerValue a given lower bound value of a specified column value 
range
-   * @param bin a single histogram bin
-   * @return the percentage of a single bin holding values in [lowerValue, 
higherValue].
+   * Returns the possibility of the given histogram bin holding values within 
the given range
+   * [lowerBound, upperBound].
    */
-  private def getOccupation(
-      higherValue: Double,
-      lowerValue: Double,
+  private def binHoldingRangePossibility(
+      upperBound: Double,
+      lowerBound: Double,
       bin: HistogramBin): Double = {
-    assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= 
bin.hi)
+    assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= 
bin.hi)
     if (bin.hi == bin.lo) {
       // the entire bin is covered in the range
       1.0
-    } else if (higherValue == lowerValue) {
+    } else if (upperBound == lowerBound) {
       // set percentage to 1/NDV
       1.0 / bin.ndv.toDouble
     } else {
       // Use proration since the range falls inside this bin.
-      math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
+      math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
     }
   }
 
   /**
-   * Returns the number of bins for column values in [lowerValue, higherValue].
-   * The column value distribution is saved in an equi-height histogram.  The 
return values is a
-   * double value is because we may return a portion of a bin. For example, a 
predicate
-   * "column = 8" may return the number of bins 0.2 if the holding bin has 5 
distinct values.
+   * Returns the number of histogram bins holding values within the given range
+   * [lowerBound, upperBound].
+   *
+   * Note that the returned value is double type, because the range boundaries 
usually occupy a
+   * portion of a bin. An extreme case is [value, value] which is generated by 
equal predicate
+   * `col = value`, we can get higher accuracy by allowing returning portion 
of histogram bins.
    *
-   * @param higherId id of the high end bin holding the high end value of a 
column range
-   * @param lowerId id of the low end bin holding the low end value of a 
column range
-   * @param higherEnd a given upper bound value of a specified column value 
range
-   * @param lowerEnd a given lower bound value of a specified column value 
range
-   * @param histogram a numeric equi-height histogram
-   * @return the number of bins for column values in [lowerEnd, higherEnd].
+   * @param upperBound the highest value of the given range
+   * @param upperBoundInclusive whether the upperBound is included in the range
+   * @param lowerBound the lowest value of the given range
+   * @param lowerBoundInclusive whether the lowerBound is included in the range
+   * @param bins an array of bins for a given numeric equi-height histogram
    */
-  def getOccupationBins(
-      higherId: Int,
-      lowerId: Int,
-      higherEnd: Double,
-      lowerEnd: Double,
-      histogram: Histogram): Double = {
-    assert(lowerId <= higherId)
-
-    if (lowerId == higherId) {
-      val curBin = histogram.bins(lowerId)
-      getOccupation(higherEnd, lowerEnd, curBin)
+  def numBinsHoldingRange(
+      upperBound: Double,
+      upperBoundInclusive: Boolean,
+      lowerBound: Double,
+      lowerBoundInclusive: Boolean,
+      bins: Array[HistogramBin]): Double = {
+    assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && 
upperBound <= bins.last.hi,
+      "Given range does not fit in the given histogram.")
+    assert(upperBound != lowerBound || upperBoundInclusive || 
lowerBoundInclusive,
+      s"'$lowerBound < value < $upperBound' is an invalid range.")
+
+    val upperBinIndex = if (upperBoundInclusive) {
+      findLastBinForValue(upperBound, bins)
+    } else {
+      findFirstBinForValue(upperBound, bins)
+    }
+    val lowerBinIndex = if (lowerBoundInclusive) {
+      findFirstBinForValue(lowerBound, bins)
+    } else {
+      findLastBinForValue(lowerBound, bins)
+    }
+    assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.")
+
+
+    if (lowerBinIndex == upperBinIndex) {
+      binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex))
     } else {
-      // compute how much lowerEnd/higherEnd occupies its bin
-      val lowerCurBin = histogram.bins(lowerId)
-      val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin)
+      // Computes the occupied portion of bins of the upperBound and 
lowerBound.
+      val lowerBin = bins(lowerBinIndex)
+      val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, 
lowerBin)
 
-      val higherCurBin = histogram.bins(higherId)
-      val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin)
+      val higherBin = bins(upperBinIndex)
+      val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, 
higherBin)
 
-      // the total length is lowerPart + higherPart + bins between them
-      lowerPart + higherPart + higherId - lowerId - 1
+      // The total number of bins is lowerPart + higherPart + bins between them
+      lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/bdb5e55c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index f52a15e..850dd1b 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging 
{
         // returns 1/ndv if there is no histogram
         Some(1.0 / BigDecimal(ndv))
       } else {
-        // We compute filter selectivity using Histogram information.
-        val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
-        val histogram = colStat.histogram.get
-        val hgmBins = histogram.bins
-
-        // find bins where column's current min and max locate.  Note that a 
column's [min, max]
-        // range may change due to another condition applied earlier.
-        val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
-        val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
-        val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins)
-        val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins)
-
-        // compute how many bins the column's current valid range [min, max] 
occupies.
-        // Note that a column's [min, max] range may vary after we apply some 
filter conditions.
-        val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, 
minBinId, max,
-          min, histogram)
-
-        val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins)
-        val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins)
-        assert(lowerBinId <= higherBinId)
-        val lowerBinNdv = hgmBins(lowerBinId).ndv
-        val higherBinNdv = hgmBins(higherBinId).ndv
-        // assume uniform distribution in each bin
-        val occupiedBins = if (lowerBinId == higherBinId) {
-          1.0 / lowerBinNdv
-        } else {
-          (1.0 / lowerBinNdv) +   // lowest bin
-            (higherBinId - lowerBinId - 1) + // middle bins
-            (1.0 / higherBinNdv)  // highest bin
-        }
-        Some(occupiedBins / validRangeBins)
+        Some(computeEqualityPossibilityByHistogram(literal, colStat))
       }
 
     } else {  // not in interval
       Some(0.0)
     }
-
   }
 
   /**
@@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
             }
         }
       } else {
-        val numericHistogram = colStat.histogram.get
-        val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
-        val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
-        val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
-        percent = computePercentByEquiHeightHgm(op, numericHistogram, max, 
min, datum)
+        percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
       }
 
       if (update) {
@@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging 
{
   }
 
   /**
-   * Returns the selectivity percentage for binary condition in the column's
-   * current valid range [min, max]
-   *
-   * @param op a binary comparison operator
-   * @param histogram a numeric equi-height histogram
-   * @param max the upper bound of the current valid range for a given column
-   * @param min the lower bound of the current valid range for a given column
-   * @param datumNumber the numeric value of a literal
-   * @return the selectivity percentage for a condition in the current range.
+   * Computes the possibility of an equality predicate using histogram.
    */
+  private def computeEqualityPossibilityByHistogram(
+      literal: Literal, colStat: ColumnStat): Double = {
+    val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
+    val histogram = colStat.histogram.get
 
-  def computePercentByEquiHeightHgm(
-      op: BinaryComparison,
-      histogram: Histogram,
-      max: Double,
-      min: Double,
-      datumNumber: Double): Double = {
     // find bins where column's current min and max locate.  Note that a 
column's [min, max]
     // range may change due to another condition applied earlier.
-    val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins)
-    val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
+    val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
+    val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
 
     // compute how many bins the column's current valid range [min, max] 
occupies.
-    // Note that a column's [min, max] range may vary after we apply some 
filter conditions.
-    val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, 
max, min, histogram)
-
-    val datumInBinId = op match {
-      case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
-        EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
-      case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
-        EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
-    }
+    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+      upperBound = max,
+      upperBoundInclusive = true,
+      lowerBound = min,
+      lowerBoundInclusive = true,
+      histogram.bins)
+
+    val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
+      upperBound = datum,
+      upperBoundInclusive = true,
+      lowerBound = datum,
+      lowerBoundInclusive = true,
+      histogram.bins)
+
+    numBinsHoldingDatum / numBinsHoldingEntireRange
+  }
 
-    op match {
-      // LessThan and LessThanOrEqual share the same logic,
-      // but their datumInBinId may be different
-      case LessThan(_, _) | LessThanOrEqual(_, _) =>
-        EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, 
min,
-          histogram) / minToMaxLength
-      // GreaterThan and GreaterThanOrEqual share the same logic,
-      // but their datumInBinId may be different
-      case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
-        EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, 
datumNumber,
-          histogram) / minToMaxLength
+  /**
+   * Computes the possibility of a comparison predicate using histogram.
+   */
+  private def computeComparisonPossibilityByHistogram(
+      op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
+    val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
+    val histogram = colStat.histogram.get
+
+    // find bins where column's current min and max locate.  Note that a 
column's [min, max]
+    // range may change due to another condition applied earlier.
+    val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
+    val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
+
+    // compute how many bins the column's current valid range [min, max] 
occupies.
+    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+      max, upperBoundInclusive = true, min, lowerBoundInclusive = true, 
histogram.bins)
+
+    val numBinsHoldingRange = op match {
+      // LessThan and LessThanOrEqual share the same logic, the only 
difference is whether to
+      // include the upperBound in the range.
+      case _: LessThan =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = datum,
+          upperBoundInclusive = false,
+          lowerBound = min,
+          lowerBoundInclusive = true,
+          histogram.bins)
+      case _: LessThanOrEqual =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = datum,
+          upperBoundInclusive = true,
+          lowerBound = min,
+          lowerBoundInclusive = true,
+          histogram.bins)
+
+      // GreaterThan and GreaterThanOrEqual share the same logic, the only 
difference is whether to
+      // include the lowerBound in the range.
+      case _: GreaterThan =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = max,
+          upperBoundInclusive = true,
+          lowerBound = datum,
+          lowerBoundInclusive = false,
+          histogram.bins)
+      case _: GreaterThanOrEqual =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = max,
+          upperBoundInclusive = true,
+          lowerBound = datum,
+          lowerBoundInclusive = true,
+          histogram.bins)
     }
+
+    numBinsHoldingRange / numBinsHoldingEntireRange
   }
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to