[ https://issues.apache.org/jira/browse/SPARK-33941?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17256403#comment-17256403 ]
Maxim Gekk commented on SPARK-33941: ------------------------------------ I am working on a bug fix. > ALTER TABLE .. DROP PARTITION doesn't invalidate the cache > ---------------------------------------------------------- > > Key: SPARK-33941 > URL: https://issues.apache.org/jira/browse/SPARK-33941 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.0.1, 3.1.0, 3.2.0 > Reporter: Maxim Gekk > Priority: Major > > 1. Create a partitioned table: > {code:scala} > scala> sql("CREATE TABLE tbl2 (id int, part int) USING parquet PARTITIONED BY (part)") > res31: org.apache.spark.sql.DataFrame = [] > {code} > 2. Create partitions: > {code:scala} > scala> sql("INSERT INTO tbl2 PARTITION (part=0) SELECT 0") > res32: org.apache.spark.sql.DataFrame = [] > scala> sql("INSERT INTO tbl2 PARTITION (part=1) SELECT 1") > res33: org.apache.spark.sql.DataFrame = [] > {code} > 3. Create dataframes from the table: > {code:scala} > scala> val df1 = spark.table("tbl2") > df1: org.apache.spark.sql.DataFrame = [id: int, part: int] > scala> val df2 = spark.table("tbl2") > df2: org.apache.spark.sql.DataFrame = [id: int, part: int] > {code} > 4. Cache df2 and fill in the cache: > {code:scala} > scala> df2.cache > res34: df2.type = [id: int, part: int] > scala> df1.show(false) > +---+----+ > |id |part| > +---+----+ > |0 |0 | > |1 |1 | > +---+----+ > scala> df2.show(false) > +---+----+ > |id |part| > +---+----+ > |0 |0 | > |1 |1 | > +---+----+ > {code} > 5. 
Drop a partition: > {code:scala} > scala> sql("ALTER TABLE tbl2 DROP PARTITION(part=0)") > res37: org.apache.spark.sql.DataFrame = [] > scala> df1.show(false) > +---+----+ > |id |part| > +---+----+ > |0 |0 | > |1 |1 | > +---+----+ > scala> df2.show(false) > +---+----+ > |id |part| > +---+----+ > |0 |0 | > |1 |1 | > +---+----+ > {code} > The same without caching: > {code:scala} > scala> sql("CREATE TABLE tbl3 (id int, part int) USING parquet PARTITIONED BY (part)") > res40: org.apache.spark.sql.DataFrame = [] > scala> sql("INSERT INTO tbl3 PARTITION (part=0) SELECT 0") > res41: org.apache.spark.sql.DataFrame = [] > scala> sql("INSERT INTO tbl3 PARTITION (part=1) SELECT 1") > res42: org.apache.spark.sql.DataFrame = [] > scala> val df3 = spark.table("tbl3") > df3: org.apache.spark.sql.DataFrame = [id: int, part: int] > scala> df3.show(false) > +---+----+ > |id |part| > +---+----+ > |0 |0 | > |1 |1 | > +---+----+ > scala> sql("ALTER TABLE tbl3 DROP PARTITION(part=0)") > res44: org.apache.spark.sql.DataFrame = [] > scala> df3.show(false) > +---+----+ > |id |part| > +---+----+ > |1 |1 | > +---+----+ > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org