Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19479#discussion_r149058921 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala --- @@ -275,6 +317,118 @@ object ColumnStat extends Logging { avgLen = row.getLong(4), maxLen = row.getLong(5) ) + if (row.isNullAt(6)) { + cs + } else { + val ndvs = row.getArray(6).toLongArray() + assert(percentiles.get.length == ndvs.length + 1) + val endpoints = percentiles.get.map(_.toString.toDouble) + // Construct equi-height histogram + val buckets = ndvs.zipWithIndex.map { case (ndv, i) => + EquiHeightBucket(endpoints(i), endpoints(i + 1), ndv) + } + val nonNullRows = rowCount - cs.nullCount + val ehHistogram = EquiHeightHistogram(nonNullRows.toDouble / ndvs.length, buckets) + cs.copy(histogram = Some(ehHistogram)) + } + } + +} + +/** + * There are a few types of histograms in state-of-the-art estimation methods. E.g. equi-width + * histogram, equi-height histogram, frequency histogram (value-frequency pairs) and hybrid + * histogram, etc. + * Currently in Spark, we support equi-height histogram since it is good at handling skew + * distribution, and also provides reasonable accuracy in other cases. + * We can add other histograms in the future, which will make estimation logic more complicated. + * This is because we will have to deal with computation between different types of histograms in + * some cases, e.g. for join columns. + */ +trait Histogram + +/** + * Equi-height histogram represents column value distribution by a sequence of buckets. Each bucket + * has a value range and contains approximately the same number of rows. + * @param height number of rows in each bucket + * @param ehBuckets equi-height histogram buckets + */ +case class EquiHeightHistogram(height: Double, ehBuckets: Array[EquiHeightBucket]) + extends Histogram { + + // Only for histogram equality test. 
+ override def equals(other: Any): Boolean = other match { + case otherEHH: EquiHeightHistogram => + height == otherEHH.height && ehBuckets.sameElements(otherEHH.ehBuckets) + case _ => false + } + + override def hashCode(): Int = super.hashCode() +} + +/** + * A bucket in an equi-height histogram. We use double type for lower/higher bound for simplicity. + * @param lo lower bound of the value range in this bucket + * @param hi higher bound of the value range in this bucket + * @param ndv approximate number of distinct values in this bucket + */ +case class EquiHeightBucket(lo: Double, hi: Double, ndv: Long) + +object HistogramSerializer { + // A flag to indicate the type of histogram + val EQUI_HEIGHT_HISTOGRAM_TYPE: Byte = 1 + + /** + * Serializes a given histogram to a string. For advanced statistics like histograms, sketches, + * etc, we don't provide readability for their serialized formats in metastore (as + * string-to-string table properties). This is because it's hard or unnatural for these + * statistics to be human readable. For example, histogram is probably split into multiple + * key-value properties, instead of a single, self-described property. And for + * count-min-sketch, it's essentially unnatural to make it a readable string. + */ + final def serialize(histogram: Histogram): String = histogram match { + case h: EquiHeightHistogram => + // type + numBuckets + height + numBuckets * (lo + hi + ndv) --- End diff -- what's the common size of `numBuckets`? If it's large enough, we may need to consider compression.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org