Github user ron8hu commented on a diff in the pull request: https://github.com/apache/spark/pull/19783#discussion_r154255068 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala --- @@ -578,6 +590,112 @@ class FilterEstimationSuite extends StatsEstimationTestBase { expectedRowCount = 5) } + // The following test cases have histogram information collected for the test column + test("Not(cintHgm < 3 AND null)") { + val condition = Not(And(LessThan(attrIntHgm, Literal(3)), Literal(null, IntegerType))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 6)), + expectedRowCount = 9) + } + + test("cintHgm = 5") { + validateEstimatedStats( + Filter(EqualTo(attrIntHgm, Literal(5)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(5), max = Some(5), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 4) + } + + test("cintHgm = 0") { + // This is an out-of-range case since 0 is outside the range [min, max] + validateEstimatedStats( + Filter(EqualTo(attrIntHgm, Literal(0)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm < 3") { + validateEstimatedStats( + Filter(LessThan(attrIntHgm, Literal(3)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 2) + } + + test("cintHgm < 0") { + // This is a corner case since literal 0 is smaller than min. 
+ validateEstimatedStats( + Filter(LessThan(attrIntHgm, Literal(0)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm <= 3") { + validateEstimatedStats( + Filter(LessThanOrEqual(attrIntHgm, Literal(3)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 2) + } + + test("cintHgm > 6") { + validateEstimatedStats( + Filter(GreaterThan(attrIntHgm, Literal(6)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 2, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 2) + } + + test("cintHgm > 10") { + // This is a corner case since max value is 10. + validateEstimatedStats( + Filter(GreaterThan(attrIntHgm, Literal(10)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm >= 6") { + validateEstimatedStats( + Filter(GreaterThanOrEqual(attrIntHgm, Literal(6)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 4) + } + + test("cintHgm IS NULL") { + validateEstimatedStats( + Filter(IsNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm IS NOT NULL") { + validateEstimatedStats( + Filter(IsNotNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 6, min = Some(1), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 10) + } + + test("cintHgm > 3 AND cintHgm <= 6") { + val condition = And(GreaterThan(attrIntHgm, + Literal(3)), LessThanOrEqual(attrIntHgm, Literal(6))) + validateEstimatedStats( + 
Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 5, min = Some(3), max = Some(6), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 8) + } + + test("cintHgm = 3 OR cintHgm = 6") { --- End diff -- We have added histogram test cases for a skewed distribution. I will add more histogram test cases for a non-skewed distribution.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org