Github user wzhfy commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19479#discussion_r148451915
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala ---
    @@ -216,65 +219,61 @@ object ColumnStat extends Logging {
         }
       }
     
    -  /**
    -   * Constructs an expression to compute column statistics for a given column.
    -   *
    -   * The expression should create a single struct column with the following schema:
    -   * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long, maxLen: Long
    -   *
    -   * Together with [[rowToColumnStat]], this function is used to create [[ColumnStat]] and
    -   * as a result should stay in sync with it.
    -   */
    -  def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = {
    -    def struct(exprs: Expression*): CreateNamedStruct = CreateStruct(exprs.map { expr =>
    -      expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() }
    -    })
    -    val one = Literal(1, LongType)
    +  private def convertToHistogram(s: String): EquiHeightHistogram = {
    +    val idx = s.indexOf(",")
    +    if (idx <= 0) {
    +      throw new AnalysisException("Failed to parse histogram.")
    +    }
    +    val height = s.substring(0, idx).toDouble
    +    val pattern = "Bucket\\(([^,]+), ([^,]+), ([^\\)]+)\\)".r
    +    val buckets = pattern.findAllMatchIn(s).map { m =>
    +      EquiHeightBucket(m.group(1).toDouble, m.group(2).toDouble, m.group(3).toLong)
    +    }.toSeq
    +    EquiHeightHistogram(height, buckets)
    +  }
     
    -    // the approximate ndv (num distinct value) should never be larger than the number of rows
    -    val numNonNulls = if (col.nullable) Count(col) else Count(one)
    -    val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls))
    -    val numNulls = Subtract(Count(one), numNonNulls)
    -    val defaultSize = Literal(col.dataType.defaultSize, LongType)
    +}
     
    -    def fixedLenTypeStruct(castType: DataType) = {
    -      // For fixed width types, avg size should be the same as max size.
    -      struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType), numNulls, defaultSize,
    -        defaultSize)
    -    }
    +/**
    + * There are a few types of histograms in state-of-the-art estimation methods. E.g. equi-width
    + * histogram, equi-height histogram, frequency histogram (value-frequency pairs) and hybrid
    + * histogram, etc.
    + * Currently in Spark, we support equi-height histogram since it is good at handling skew
    + * distribution, and also provides reasonable accuracy in other cases.
    + * We can add other histograms in the future, which will make estimation logic more complicated.
    + * This is because we will have to deal with computation between different types of histograms in
    + * some cases, e.g. for join columns.
    + */
    +trait Histogram
     
    -    col.dataType match {
    -      case dt: IntegralType => fixedLenTypeStruct(dt)
    -      case _: DecimalType => fixedLenTypeStruct(col.dataType)
    -      case dt @ (DoubleType | FloatType) => fixedLenTypeStruct(dt)
    -      case BooleanType => fixedLenTypeStruct(col.dataType)
    -      case DateType => fixedLenTypeStruct(col.dataType)
    -      case TimestampType => fixedLenTypeStruct(col.dataType)
    -      case BinaryType | StringType =>
    -        // For string and binary type, we don't store min/max.
    -        val nullLit = Literal(null, col.dataType)
    -        struct(
    -          ndv, nullLit, nullLit, numNulls,
    -          // Set avg/max size to default size if all the values are null or there is no value.
    -          Coalesce(Seq(Ceil(Average(Length(col))), defaultSize)),
    -          Coalesce(Seq(Cast(Max(Length(col)), LongType), defaultSize)))
    -      case _ =>
    -        throw new AnalysisException("Analyzing column statistics is not supported for column " +
    -            s"${col.name} of data type: ${col.dataType}.")
    -    }
    -  }
    +/**
    + * Equi-height histogram represents column value distribution by a sequence of buckets. Each bucket
    + * has a value range and contains approximately the same number of rows.
    + * @param height number of rows in each bucket
    + * @param ehBuckets equi-height histogram buckets
    + */
    +case class EquiHeightHistogram(height: Double, ehBuckets: Seq[EquiHeightBucket]) extends Histogram {
    --- End diff --
    
    Histograms are generated per column. If we compute histograms for tens of columns, the number of parameters will grow to thousands or even tens of thousands, which can hurt metastore access performance.

