Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/20935#discussion_r178444321 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala --- @@ -322,19 +323,75 @@ private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) ext Array[Any](lower, upper, nullCount, count, sizeInBytes) } -private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats { - val columnType = ColumnType(dataType) +private abstract class OrderableSafeColumnStats[T](dataType: DataType) extends ColumnStats { + protected var upper: T = _ + protected var lower: T = _ + + private val columnType = ColumnType(dataType) + private val ordering = dataType match { + case x if RowOrdering.isOrderable(dataType) => + Option(TypeUtils.getInterpretedOrdering(x)) + case _ => None + } override def gatherStats(row: InternalRow, ordinal: Int): Unit = { if (!row.isNullAt(ordinal)) { - val size = columnType.actualSize(row, ordinal) - sizeInBytes += size + sizeInBytes += columnType.actualSize(row, ordinal) count += 1 + ordering.foreach { order => + val value = getValue(row, ordinal) + if (upper == null || order.gt(value, upper)) upper = copy(value) + if (lower == null || order.lt(value, lower)) lower = copy(value) + } } else { - gatherNullStats + gatherNullStats() } } + def getValue(row: InternalRow, ordinal: Int): T + + def copy(value: T): T + + override def collectedStatistics: Array[Any] = + Array[Any](lower, upper, nullCount, count, sizeInBytes) +} + +private[columnar] final class ArrayColumnStats(dataType: DataType) + extends OrderableSafeColumnStats[ArrayData](dataType) { + override def getValue(row: InternalRow, ordinal: Int): ArrayData = row.getArray(ordinal) + + override def copy(value: ArrayData): ArrayData = value.copy() +} + +private[columnar] final class StructColumnStats(dataType: DataType) + extends OrderableSafeColumnStats[InternalRow](dataType) { --- End diff -- InternalRow -> 
UnsafeRow? It looks like the column type for struct is defined in terms of `UnsafeRow`.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org