Github user maryannxue-databricks commented on a diff in the pull request: https://github.com/apache/spark/pull/21531#discussion_r195236676 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala --- @@ -96,4 +99,38 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext { agged.unpersist() assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } + + test("persist and then withColumn") { + val df = Seq(("test", 1)).toDF("s", "i") + // We should not invalidate the cached DataFrame + val df2 = df.withColumn("newColumn", lit(1)) + + df.cache() + assertCached(df) + assertCached(df2) + + df.count() + assertCached(df2) + + df.unpersist() + assert(df.storageLevel == StorageLevel.NONE) + } + + test("cache UDF result correctly") { + val expensiveUDF = udf({x: Int => Thread.sleep(10000); x}) + val df = spark.range(0, 10).toDF("a").withColumn("b", expensiveUDF($"a")) + val df2 = df.agg(sum(df("b"))) + + df.cache() + df.count() + + assertCached(df2) + + failAfter(5 seconds) { --- End diff -- Can you add a comment here, like "udf has been evaluated during caching, and thus should not be re-evaluated."
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org