[spark] branch branch-3.4 updated: [SPARK-45054][SQL] HiveExternalCatalog.listPartitions should restore partition statistics

sunchao Fri, 01 Sep 2023 20:26:33 -0700

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new d61bfb87e39 [SPARK-45054][SQL] HiveExternalCatalog.listPartitions 
should restore partition statistics
d61bfb87e39 is described below

commit d61bfb87e39fdce163a2e01b4f039e63d4830ab2
Author: Chao Sun <sunc...@apple.com>
AuthorDate: Fri Sep 1 20:22:34 2023 -0700

    [SPARK-45054][SQL] HiveExternalCatalog.listPartitions should restore 
partition statistics
    
    ### What changes were proposed in this pull request?
    
    Call `restorePartitionMetadata` in `listPartitions` to restore Spark SQL 
statistics.
    
    ### Why are the changes needed?
    
    Currently when `listPartitions` is called, it doesn't restore Spark SQL 
statistics stored in metastore, such as `spark.sql.statistics.totalSize`. This 
means callers who rely on stats from the method call may get wrong results.
    
    In particular, when `spark.sql.statistics.size.autoUpdate.enabled` is 
turned on, during insert overwrite Spark will first list partitions and get old 
statistics, and then compare them with new statistics and see which partitions 
need to be updated. This issue will sometimes cause it to update all partitions 
instead of only those partitions that have been touched.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Added a new test.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Closes #42777 from sunchao/list-partition-stat.
    
    Authored-by: Chao Sun <sunc...@apple.com>
    Signed-off-by: Chao Sun <sunc...@apple.com>
---
 .../catalyst/catalog/ExternalCatalogSuite.scala    | 25 ++++++++++++++++++++++
 .../spark/sql/hive/HiveExternalCatalog.scala       |  7 ++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
index 32eb8849427..a8f73cebf31 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
@@ -475,6 +475,31 @@ abstract class ExternalCatalogSuite extends SparkFunSuite {
     assert(catalog.listPartitions("db2", "tbl2", Some(Map("a" -> 
"unknown"))).isEmpty)
   }
 
+  test("SPARK-45054: list partitions should restore stats") {
+    val catalog = newBasicCatalog()
+    val stats = Some(CatalogStatistics(sizeInBytes = 1))
+    val newPart = CatalogTablePartition(Map("a" -> "1", "b" -> "2"), 
storageFormat, stats = stats)
+    catalog.alterPartitions("db2", "tbl2", Seq(newPart))
+    val parts = catalog.listPartitions("db2", "tbl2", Some(Map("a" -> "1")))
+
+    assert(parts.length == 1)
+    val part = parts.head
+    assert(part.stats.exists(_.sizeInBytes == 1))
+  }
+
+  test("SPARK-45054: list partitions by filter should restore stats") {
+    val catalog = newBasicCatalog()
+    val stats = Some(CatalogStatistics(sizeInBytes = 1))
+    val newPart = CatalogTablePartition(Map("a" -> "1", "b" -> "2"), 
storageFormat, stats = stats)
+    catalog.alterPartitions("db2", "tbl2", Seq(newPart))
+    val tz = TimeZone.getDefault.getID
+    val parts = catalog.listPartitionsByFilter("db2", "tbl2", Seq($"a".int === 
1), tz)
+
+    assert(parts.length == 1)
+    val part = parts.head
+    assert(part.stats.exists(_.sizeInBytes == 1))
+  }
+
   test("SPARK-21457: list partitions with special chars") {
     val catalog = newBasicCatalog()
     assert(catalog.listPartitions("db2", "tbl1").isEmpty)
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 0c556cd34ed..d1355d4075f 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -1265,13 +1265,14 @@ private[spark] class HiveExternalCatalog(conf: 
SparkConf, hadoopConf: Configurat
       db: String,
       table: String,
       partialSpec: Option[TablePartitionSpec] = None): 
Seq[CatalogTablePartition] = withClient {
-    val partColNameMap = buildLowerCasePartColNameMap(getTable(db, table))
+    val catalogTable = getTable(db, table)
+    val partColNameMap = buildLowerCasePartColNameMap(catalogTable)
     val metaStoreSpec = partialSpec.map(toMetaStorePartitionSpec)
     val res = client.getPartitions(db, table, metaStoreSpec)
       .map { part => part.copy(spec = restorePartitionSpec(part.spec, 
partColNameMap))
     }
 
-    metaStoreSpec match {
+    val parts = metaStoreSpec match {
       // This might be a bug of Hive: When the partition value inside the 
partial partition spec
       // contains dot, and we ask Hive to list partitions w.r.t. the partial 
partition spec, Hive
       // treats dot as matching any single character and may return more 
partitions than we
@@ -1280,6 +1281,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, 
hadoopConf: Configurat
         res.filter(p => isPartialPartitionSpec(spec, 
toMetaStorePartitionSpec(p.spec)))
       case _ => res
     }
+    parts.map(restorePartitionMetadata(_, catalogTable))
   }
 
   override def listPartitionsByFilter(
@@ -1293,6 +1295,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, 
hadoopConf: Configurat
     val clientPrunedPartitions =
       client.getPartitionsByFilter(rawHiveTable, predicates).map { part =>
         part.copy(spec = restorePartitionSpec(part.spec, partColNameMap))
+        restorePartitionMetadata(part, catalogTable)
       }
     prunePartitionsByFilter(catalogTable, clientPrunedPartitions, predicates, 
defaultTimeZoneId)
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.4 updated: [SPARK-45054][SQL] HiveExternalCatalog.listPartitions should restore partition statistics

Reply via email to