Repository: spark
Updated Branches:
  refs/heads/master 78ecb6d45 -> c04cb2d1b


[SPARK-21687][SQL] Spark SQL should set createTime for Hive partition

## What changes were proposed in this pull request?

Set createTime for every Hive partition created in Spark SQL, which could be 
used to manage data lifecycle in the Hive warehouse. We found that almost every 
partition modified by Spark SQL does not have createTime set.

```
mysql> select * from partitions where create_time=0 limit 1\G;
*************************** 1. row ***************************
         PART_ID: 1028584
     CREATE_TIME: 0
LAST_ACCESS_TIME: 1502203611
       PART_NAME: date=20170130
           SD_ID: 1543605
          TBL_ID: 211605
  LINK_TARGET_ID: NULL
1 row in set (0.27 sec)
```

## How was this patch tested?
 N/A

Author: debugger87 <yangchaozhong.2...@gmail.com>
Author: Chaozhong Yang <yangchaozhong.2...@gmail.com>

Closes #18900 from debugger87/fix/set-create-time-for-hive-partition.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c04cb2d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c04cb2d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c04cb2d1

Branch: refs/heads/master
Commit: c04cb2d1b72b1edaddf684755f5a9d6aaf00e03b
Parents: 78ecb6d
Author: debugger87 <yangchaozhong.2...@gmail.com>
Authored: Wed Jun 27 11:34:28 2018 -0700
Committer: Xiao Li <gatorsm...@gmail.com>
Committed: Wed Jun 27 11:34:28 2018 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/catalog/interface.scala |  6 ++++++
 .../sql/catalyst/catalog/SessionCatalogSuite.scala    |  6 ++++--
 .../results/describe-part-after-analyze.sql.out       | 14 ++++++++++++++
 .../test/resources/sql-tests/results/describe.sql.out |  4 ++++
 .../resources/sql-tests/results/show-tables.sql.out   |  2 ++
 .../apache/spark/sql/hive/client/HiveClientImpl.scala |  4 ++++
 6 files changed, 34 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index f3e67dc..c6105c5 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -93,12 +93,16 @@ object CatalogStorageFormat {
  * @param spec partition spec values indexed by column name
  * @param storage storage format of the partition
  * @param parameters some parameters for the partition
+ * @param createTime creation time of the partition, in milliseconds
+ * @param lastAccessTime last access time, in milliseconds
  * @param stats optional statistics (number of rows, total size, etc.)
  */
 case class CatalogTablePartition(
     spec: CatalogTypes.TablePartitionSpec,
     storage: CatalogStorageFormat,
     parameters: Map[String, String] = Map.empty,
+    createTime: Long = System.currentTimeMillis,
+    lastAccessTime: Long = -1,
     stats: Option[CatalogStatistics] = None) {
 
   def toLinkedHashMap: mutable.LinkedHashMap[String, String] = {
@@ -109,6 +113,8 @@ case class CatalogTablePartition(
     if (parameters.nonEmpty) {
       map.put("Partition Parameters", s"{${parameters.map(p => p._1 + "=" + 
p._2).mkString(", ")}}")
     }
+    map.put("Created Time", new Date(createTime).toString)
+    map.put("Last Access", new Date(lastAccessTime).toString)
     stats.foreach(s => map.put("Partition Statistics", s.simpleString))
     map
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
index 6abab00..6a7375e 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
@@ -1114,11 +1114,13 @@ abstract class SessionCatalogSuite extends AnalysisTest 
{
     // And for hive serde table, hive metastore will set some 
values(e.g.transient_lastDdlTime)
     // in table's parameters and storage's properties, here we also ignore 
them.
     val actualPartsNormalize = actualParts.map(p =>
-      p.copy(parameters = Map.empty, storage = p.storage.copy(
+      p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1,
+        storage = p.storage.copy(
         properties = Map.empty, locationUri = None, serde = None))).toSet
 
     val expectedPartsNormalize = expectedParts.map(p =>
-        p.copy(parameters = Map.empty, storage = p.storage.copy(
+        p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1,
+          storage = p.storage.copy(
           properties = Map.empty, locationUri = None, serde = None))).toSet
 
     actualPartsNormalize == expectedPartsNormalize

http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
 
b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
index 58ed201..8ba69c6 100644
--- 
a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
@@ -57,6 +57,8 @@ Database              default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=10]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
                                                                    
 # Storage Information                                              
 Location [not included in comparison]sql/core/spark-warehouse/t
@@ -89,6 +91,8 @@ Database              default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=10]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1121 bytes, 3 rows                          
                                                                    
 # Storage Information                                              
@@ -122,6 +126,8 @@ Database                    default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=10]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1121 bytes, 3 rows                          
                                                                    
 # Storage Information                                              
@@ -147,6 +153,8 @@ Database                    default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=11]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1098 bytes, 4 rows                          
                                                                    
 # Storage Information                                              
@@ -180,6 +188,8 @@ Database                    default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=10]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1121 bytes, 3 rows                          
                                                                    
 # Storage Information                                              
@@ -205,6 +215,8 @@ Database                    default
 Table                  t                                           
 Partition Values       [ds=2017-08-01, hr=11]                      
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11                       
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1098 bytes, 4 rows                          
                                                                    
 # Storage Information                                              
@@ -230,6 +242,8 @@ Database                    default
 Table                  t                                           
 Partition Values       [ds=2017-09-01, hr=5]                       
 Location [not included in 
comparison]sql/core/spark-warehouse/t/ds=2017-09-01/hr=5                        
 
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics   1144 bytes, 2 rows                          
                                                                    
 # Storage Information                                              

http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/core/src/test/resources/sql-tests/results/describe.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out 
b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
index 8c908b7..79390cb 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
@@ -282,6 +282,8 @@ Table                       t
 Partition Values       [c=Us, d=1]                                 
 Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1       
                    
 Storage Properties     [a=1, b=2]                                  
+Created Time [not included in comparison]
+Last Access [not included in comparison]
                                                                    
 # Storage Information                                              
 Num Buckets            2                                           
@@ -311,6 +313,8 @@ Table                       t
 Partition Values       [c=Us, d=1]                                 
 Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1       
                    
 Storage Properties     [a=1, b=2]                                  
+Created Time [not included in comparison]
+Last Access [not included in comparison]
                                                                    
 # Storage Information                                              
 Num Buckets            2                                           

http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out 
b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
index 975bb06..abeb7e1 100644
--- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
@@ -178,6 +178,8 @@ 
struct<database:string,tableName:string,isTemporary:boolean,information:string>
 -- !query 14 output
 showdb show_t1 false   Partition Values: [c=Us, d=1]
 Location [not included in 
comparison]sql/core/spark-warehouse/showdb.db/show_t1/c=Us/d=1
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 
 
 -- !query 15

http://git-wip-us.apache.org/repos/asf/spark/blob/c04cb2d1/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index da9fe2d..1df46d7 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -995,6 +995,8 @@ private[hive] object HiveClientImpl {
     tpart.setTableName(ht.getTableName)
     tpart.setValues(partValues.asJava)
     tpart.setSd(storageDesc)
+    tpart.setCreateTime((p.createTime / 1000).toInt)
+    tpart.setLastAccessTime((p.lastAccessTime / 1000).toInt)
     tpart.setParameters(mutable.Map(p.parameters.toSeq: _*).asJava)
     new HivePartition(ht, tpart)
   }
@@ -1019,6 +1021,8 @@ private[hive] object HiveClientImpl {
         compressed = apiPartition.getSd.isCompressed,
         properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
           .map(_.asScala.toMap).orNull),
+      createTime = apiPartition.getCreateTime.toLong * 1000,
+      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
       parameters = properties,
       stats = readHiveStats(properties))
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to