Github user ron8hu commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19783#discussion_r154255068
  
    --- Diff: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
 ---
    @@ -578,6 +590,112 @@ class FilterEstimationSuite extends 
StatsEstimationTestBase {
           expectedRowCount = 5)
       }
     
    +  // The following test cases have histogram information collected for the 
test column
    +  test("Not(cintHgm < 3 AND null)") {
    +    val condition = Not(And(LessThan(attrIntHgm, Literal(3)), 
Literal(null, IntegerType)))
    +    validateEstimatedStats(
    +      Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 6)),
    +      expectedRowCount = 9)
    +  }
    +
    +  test("cintHgm = 5") {
    +    validateEstimatedStats(
    +      Filter(EqualTo(attrIntHgm, Literal(5)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(5), max = 
Some(5),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 4)
    +  }
    +
    +  test("cintHgm = 0") {
    +    // This is an out-of-range case since 0 is outside the range [min, max]
    +    validateEstimatedStats(
    +      Filter(EqualTo(attrIntHgm, Literal(0)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm < 3") {
    +    validateEstimatedStats(
    +      Filter(LessThan(attrIntHgm, Literal(3)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = 
Some(3),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm < 0") {
    +    // This is a corner case since literal 0 is smaller than min.
    +    validateEstimatedStats(
    +      Filter(LessThan(attrIntHgm, Literal(0)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm <= 3") {
    +    validateEstimatedStats(
    +      Filter(LessThanOrEqual(attrIntHgm, Literal(3)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = 
Some(3),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm > 6") {
    +    validateEstimatedStats(
    +      Filter(GreaterThan(attrIntHgm, Literal(6)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 2, min = Some(6), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm > 10") {
    +    // This is a corner case since max value is 10.
    +    validateEstimatedStats(
    +      Filter(GreaterThan(attrIntHgm, Literal(10)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm >= 6") {
    +    validateEstimatedStats(
    +      Filter(GreaterThanOrEqual(attrIntHgm, Literal(6)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(6), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 4)
    +  }
    +
    +  test("cintHgm IS NULL") {
    +    validateEstimatedStats(
    +      Filter(IsNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm IS NOT NULL") {
    +    validateEstimatedStats(
    +      Filter(IsNotNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 
10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 6, min = Some(1), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 10)
    +  }
    +
    +  test("cintHgm > 3 AND cintHgm <= 6") {
    +    val condition = And(GreaterThan(attrIntHgm,
    +      Literal(3)), LessThanOrEqual(attrIntHgm, Literal(6)))
    +    validateEstimatedStats(
    +      Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 5, min = Some(3), max = 
Some(6),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 8)
    +  }
    +
    +  test("cintHgm = 3 OR cintHgm = 6") {
    --- End diff --
    
    We have added histogram test cases for a skewed distribution.  I will add 
more histogram test cases for a non-skewed distribution.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to