Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/22566#discussion_r221158906 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala --- @@ -653,6 +653,49 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } + test("collecting statistics for all columns") { + val table = "update_col_stats_table" + withTable(table) { + sql(s"CREATE TABLE $table (c1 INT, c2 STRING, c3 DOUBLE)") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS") + val fetchedStats0 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetchedStats0.get.colStats == Map( + "c1" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + "c3" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(8), maxLen = Some(8)), + "c2" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(20), maxLen = Some(20)))) + + // Insert new data and analyze: have the latest column stats. 
+ sql(s"INSERT INTO TABLE $table SELECT 1, 'a', 10.0") + sql(s"INSERT INTO TABLE $table SELECT 1, 'b', null") + + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS") + val fetchedStats1 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)) + assert(fetchedStats1.get.colStats == Map( + "c1" -> CatalogColumnStat(distinctCount = Some(1), min = Some("1"), max = Some("1"), + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + "c3" -> CatalogColumnStat(distinctCount = Some(1), min = Some("10.0"), max = Some("10.0"), + nullCount = Some(1), avgLen = Some(8), maxLen = Some(8)), + "c2" -> CatalogColumnStat(distinctCount = Some(2), min = None, max = None, + nullCount = Some(0), avgLen = Some(1), maxLen = Some(1)))) + + val e1 = intercept[IllegalArgumentException] { + AnalyzeColumnCommand(TableIdentifier(table), Option(Seq("c1")), true).run(spark) + } + assert(e1.getMessage.contains("Parameter `columnNames` or `allColumns` are" + + " mutually exclusive")) + val e2 = intercept[IllegalArgumentException] { + AnalyzeColumnCommand(TableIdentifier(table), None, false).run(spark) + } + assert(e2.getMessage.contains("Parameter `columnNames` or `allColumns` are" + + " mutually exclusive")) --- End diff -- Could we create a separate test case between lines 686 and 695?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org