Github user maryannxue-databricks commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21531#discussion_r195236676
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala ---
    @@ -96,4 +99,38 @@ class DatasetCacheSuite extends QueryTest with 
SharedSQLContext {
         agged.unpersist()
         assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged 
should not be cached.")
       }
    +
    +  test("persist and then withColumn") {
    +    val df = Seq(("test", 1)).toDF("s", "i")
    +    // We should not invalidate the cached DataFrame
    +    val df2 = df.withColumn("newColumn", lit(1))
    +
    +    df.cache()
    +    assertCached(df)
    +    assertCached(df2)
    +
    +    df.count()
    +    assertCached(df2)
    +
    +    df.unpersist()
    +    assert(df.storageLevel == StorageLevel.NONE)
    +  }
    +
    +  test("cache UDF result correctly") {
    +    val expensiveUDF = udf({x: Int => Thread.sleep(10000); x})
    +    val df = spark.range(0, 10).toDF("a").withColumn("b", 
expensiveUDF($"a"))
    +    val df2 = df.agg(sum(df("b")))
    +
    +    df.cache()
    +    df.count()
    +
    +    assertCached(df2)
    +
    +    failAfter(5 seconds) {
    --- End diff --
    
    Can you add a comment here, like "UDF has been evaluated during caching, 
and thus should not be re-evaluated"?


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to