Github user wzhfy commented on a diff in the pull request: https://github.com/apache/spark/pull/19479#discussion_r147887335 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala --- @@ -216,65 +218,61 @@ object ColumnStat extends Logging { } } - /** - * Constructs an expression to compute column statistics for a given column. - * - * The expression should create a single struct column with the following schema: - * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long, maxLen: Long - * - * Together with [[rowToColumnStat]], this function is used to create [[ColumnStat]] and - * as a result should stay in sync with it. - */ - def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = { - def struct(exprs: Expression*): CreateNamedStruct = CreateStruct(exprs.map { expr => - expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() } - }) - val one = Literal(1, LongType) + private def convertToHistogram(s: String): EquiHeightHistogram = { + val idx = s.indexOf(",") + if (idx <= 0) { + throw new AnalysisException("Failed to parse histogram.") + } + val height = s.substring(0, idx).toDouble + val pattern = "Bucket\\(([^,]+), ([^,]+), ([^\\)]+)\\)".r + val buckets = pattern.findAllMatchIn(s).map { m => + EquiHeightBucket(m.group(1).toDouble, m.group(2).toDouble, m.group(3).toLong) + }.toSeq + EquiHeightHistogram(height, buckets) + } - // the approximate ndv (num distinct value) should never be larger than the number of rows - val numNonNulls = if (col.nullable) Count(col) else Count(one) - val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls)) - val numNulls = Subtract(Count(one), numNonNulls) - val defaultSize = Literal(col.dataType.defaultSize, LongType) +} - def fixedLenTypeStruct(castType: DataType) = { - // For fixed width types, avg size should be the same as max size. 
- struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType), numNulls, defaultSize, - defaultSize) - } +/** + * There are a few types of histograms in state-of-the-art estimation methods. E.g. equi-width + * histogram, equi-height histogram, frequency histogram (value-frequency pairs) and hybrid + * histogram, etc. + * Currently in Spark, we support equi-height histogram since it is good at handling skew + * distribution, and also provides reasonable accuracy in other cases. + * We can add other histograms in the future, which will make estimation logic more complicated. --- End diff -- It's not a high priority; here I just want to say it's doable, but it will complicate the estimation logic.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org