This is an automated email from the ASF dual-hosted git repository.

okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new fa6813f7255 HIVE-28727: Iceberg: Refactor 
IcebergTableUtil.toPartitionData (#5622) (Shohei Okumiya, reviewed by Denys 
Kuzmenko)
fa6813f7255 is described below

commit fa6813f725542bae127f9bec0921ebd01c946947
Author: Shohei Okumiya <[email protected]>
AuthorDate: Thu Feb 13 13:46:33 2025 +0900

    HIVE-28727: Iceberg: Refactor IcebergTableUtil.toPartitionData (#5622) 
(Shohei Okumiya, reviewed by Denys Kuzmenko)
---
 .../iceberg/mr/hive/HiveIcebergStorageHandler.java | 15 +---
 .../apache/iceberg/mr/hive/IcebergTableUtil.java   | 30 +------
 .../iceberg_major_compaction_partition_evolution.q | 13 ++-
 ...berg_major_compaction_partition_evolution.q.out | 99 ++++++++++++++++++++--
 4 files changed, 110 insertions(+), 47 deletions(-)

diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index b076721468b..879c109f73f 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -174,7 +174,6 @@
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.TableScan;
 import org.apache.iceberg.actions.DeleteOrphanFiles;
-import org.apache.iceberg.data.GenericRecord;
 import org.apache.iceberg.data.PartitionStatsHandler;
 import org.apache.iceberg.exceptions.NoSuchTableException;
 import org.apache.iceberg.exceptions.ValidationException;
@@ -569,17 +568,9 @@ private static Map<String, String> 
getPartishSummary(Partish partish, Table tabl
         try (Closeable toClose = partitionStatsRecords) {
           PartitionStats partitionStats = 
Iterables.tryFind(partitionStatsRecords, stats -> {
             PartitionSpec spec = table.specs().get(stats.specId());
-            Schema readSchema = spec.partitionType().asSchema();
-            GenericRecord record = GenericRecord.create(readSchema);
-
-            List<Types.NestedField> fields = partitionType.fields();
-            for (int index = 0, pos = 0; index < fields.size(); index++) {
-              if (readSchema.findField(fields.get(index).fieldId()) != null) {
-                record.set(pos++, stats.partition().get(index, Object.class));
-              }
-            }
-            return 
spec.partitionToPath(record).equals(partish.getPartition().getName());
-
+            PartitionData data  = 
IcebergTableUtil.toPartitionData(stats.partition(), partitionType,
+                spec.partitionType());
+            return 
spec.partitionToPath(data).equals(partish.getPartition().getName());
           }).orNull();
 
           if (partitionStats != null) {
diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
index 3230363ad7f..35c185a29e2 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
@@ -418,36 +418,14 @@ public static void performMetadataDelete(Table 
icebergTable, String branchName,
   }
 
   public static PartitionData toPartitionData(StructLike key, Types.StructType 
keyType) {
-    PartitionData data = new PartitionData(keyType);
-    for (int i = 0; i < keyType.fields().size(); i++) {
-      Object val = key.get(i, 
keyType.fields().get(i).type().typeId().javaClass());
-      if (val != null) {
-        data.set(i, val);
-      }
-    }
-    return data;
+    PartitionData keyTemplate = new PartitionData(keyType);
+    return keyTemplate.copyFor(key);
   }
 
   public static PartitionData toPartitionData(StructLike sourceKey, 
Types.StructType sourceKeyType,
       Types.StructType targetKeyType) {
-    PartitionData data = new PartitionData(targetKeyType);
-    for (int i = 0; i < targetKeyType.fields().size(); i++) {
-      Types.NestedField targetKey = targetKeyType.fields().get(i);
-
-      Optional<Object> val = sourceKeyType.fields().stream()
-          .filter(f -> f.name().equals(targetKey.name()))
-          .findFirst()
-          .map(sourceKeyElem ->
-            sourceKey.get(
-              sourceKeyType.fields().indexOf(sourceKeyElem),
-              targetKey.type().typeId().javaClass()
-            )
-          );
-      if (val.isPresent()) {
-        data.set(i, val.get());
-      }
-    }
-    return data;
+    StructProjection projection = StructProjection.create(sourceKeyType, 
targetKeyType).wrap(sourceKey);
+    return toPartitionData(projection, targetKeyType);
   }
 
   public static Expression generateExpressionFromPartitionSpec(Table table, 
Map<String, String> partitionSpec,
diff --git 
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_partition_evolution.q
 
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_partition_evolution.q
index e6c40dd20cc..32bd401e081 100644
--- 
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_partition_evolution.q
+++ 
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_partition_evolution.q
@@ -36,13 +36,16 @@ tblproperties ('format-version'='2', 
'hive.compactor.worker.pool'='iceberg');
 insert into ice_orc VALUES ('fn1','ln1', 1, 10, 100);
 insert into ice_orc VALUES ('fn2','ln2', 1, 10, 100);
 insert into ice_orc VALUES ('fn3','ln3', 1, 11, 100);
+insert into ice_orc VALUES (null,null, null, null, null);
 alter table ice_orc set partition spec(company_id, dept_id);
 insert into ice_orc VALUES ('fn4','ln4', 1, 11, 100);
 insert into ice_orc VALUES ('fn5','ln5', 2, 20, 100);
 insert into ice_orc VALUES ('fn6','ln6', 2, 20, 100);
+insert into ice_orc VALUES (null,null, null, null, null);
 alter table ice_orc set partition spec(company_id, dept_id, team_id);
 insert into ice_orc VALUES ('fn7','ln7', 2, 21, 100);
 insert into ice_orc VALUES ('fn8','ln8', 2, 21, 100);
+insert into ice_orc VALUES (null,null, null, null, null);
 
 update ice_orc set last_name = 'ln1a' where first_name='fn1';
 update ice_orc set last_name = 'ln2a' where first_name='fn2';
@@ -59,9 +62,17 @@ delete from ice_orc where last_name in ('ln1a', 'ln8a');
 select * from ice_orc;
 describe formatted ice_orc;
 
+select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count;
+
 explain alter table ice_orc COMPACT 'major' and wait;
 alter table ice_orc COMPACT 'major' and wait;
 
 select * from ice_orc;
 describe formatted ice_orc;
-show compactions order by 'partition';
\ No newline at end of file
+show compactions order by 'partition';
+
+select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count;
diff --git 
a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
 
b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
index 970467e9d99..5a742f00925 100644
--- 
a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
+++ 
b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
@@ -46,6 +46,14 @@ POSTHOOK: query: insert into ice_orc VALUES ('fn3','ln3', 1, 
11, 100)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@ice_orc
+PREHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_orc
+POSTHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_orc
 PREHOOK: query: alter table ice_orc set partition spec(company_id, dept_id)
 PREHOOK: type: ALTERTABLE_SETPARTSPEC
 PREHOOK: Input: default@ice_orc
@@ -77,6 +85,14 @@ POSTHOOK: query: insert into ice_orc VALUES ('fn6','ln6', 2, 
20, 100)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@ice_orc
+PREHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_orc
+POSTHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_orc
 PREHOOK: query: alter table ice_orc set partition spec(company_id, dept_id, 
team_id)
 PREHOOK: type: ALTERTABLE_SETPARTSPEC
 PREHOOK: Input: default@ice_orc
@@ -100,6 +116,14 @@ POSTHOOK: query: insert into ice_orc VALUES ('fn8','ln8', 
2, 21, 100)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@ice_orc
+PREHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_orc
+POSTHOOK: query: insert into ice_orc VALUES (null,null, null, null, null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_orc
 PREHOOK: query: update ice_orc set last_name = 'ln1a' where first_name='fn1'
 PREHOOK: type: QUERY
 PREHOOK: Input: default@ice_orc
@@ -203,6 +227,9 @@ POSTHOOK: query: select * from ice_orc
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@ice_orc
 #### A masked pattern was here ####
+NULL   NULL    NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
 fn2    ln2a    1       10      100
 fn3    ln3a    1       11      100
 fn4    ln4a    1       11      100
@@ -239,20 +266,20 @@ Table Parameters:
        bucketing_version       2                   
        current-schema          
{\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"first_name\",\"required\":false,\"type\":\"string\"},{\"id\":2,\"name\":\"last_name\",\"required\":false,\"type\":\"string\"},{\"id\":3,\"name\":\"dept_id\",\"required\":false,\"type\":\"long\"},{\"id\":4,\"name\":\"team_id\",\"required\":false,\"type\":\"long\"},{\"id\":5,\"name\":\"company_id\",\"required\":false,\"type\":\"long\"}]}
        current-snapshot-id     #Masked#
-       current-snapshot-summary        
{\"deleted-data-files\":\"2\",\"deleted-records\":\"2\",\"removed-files-size\":\"#Masked#\",\"changed-partition-count\":\"2\",\"total-records\":\"14\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"14\",\"total-delete-files\":\"8\",\"total-position-deletes\":\"8\",\"total-equality-deletes\":\"0\",\"iceberg-version\":\"#Masked#\"}
+       current-snapshot-summary        
{\"deleted-data-files\":\"2\",\"deleted-records\":\"2\",\"removed-files-size\":\"#Masked#\",\"changed-partition-count\":\"2\",\"total-records\":\"17\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"17\",\"total-delete-files\":\"8\",\"total-position-deletes\":\"8\",\"total-equality-deletes\":\"0\",\"iceberg-version\":\"#Masked#\"}
        current-snapshot-timestamp-ms   #Masked#       
        default-partition-spec  
{\"spec-id\":1,\"fields\":[{\"name\":\"company_id\",\"transform\":\"identity\",\"source-id\":5,\"field-id\":1000},{\"name\":\"dept_id\",\"transform\":\"identity\",\"source-id\":3,\"field-id\":1001}]}
        format-version          2                   
        hive.compactor.worker.pool      iceberg             
        iceberg.orc.files.only  true                
 #### A masked pattern was here ####
-       numFiles                14                  
-       numRows                 14                  
+       numFiles                17                  
+       numRows                 17                  
        parquet.compression     zstd                
 #### A masked pattern was here ####
        rawDataSize             0                   
        serialization.format    1                   
-       snapshot-count          17                  
+       snapshot-count          20                  
        storage_handler         
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
        table_type              ICEBERG             
        totalSize               #Masked#
@@ -269,6 +296,43 @@ InputFormat:               
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
 OutputFormat:          org.apache.iceberg.mr.hive.HiveIcebergOutputFormat      
 
 Compressed:            No                       
 Sort Columns:          []                       
+PREHOOK: query: select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_orc
+#### A masked pattern was here ####
+{"company_id":100,"dept_id":1,"team_id":10}    2       0       1
+{"company_id":100,"dept_id":1,"team_id":11}    2       0       1
+{"company_id":100,"dept_id":1,"team_id":11}    2       0       1
+{"company_id":100,"dept_id":1,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":1,"team_id":null}  1       1       1
+{"company_id":100,"dept_id":2,"team_id":21}    2       0       1
+{"company_id":100,"dept_id":2,"team_id":21}    2       0       1
+{"company_id":100,"dept_id":2,"team_id":21}    2       1       1
+{"company_id":100,"dept_id":2,"team_id":21}    2       1       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       1       1
+{"company_id":100,"dept_id":2,"team_id":null}  1       1       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       0       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       0       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       0       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       1       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       1       1
+{"company_id":100,"dept_id":null,"team_id":null}       0       1       1
+{"company_id":null,"dept_id":null,"team_id":null}      0       0       1
+{"company_id":null,"dept_id":null,"team_id":null}      1       0       1
+{"company_id":null,"dept_id":null,"team_id":null}      2       0       1
 PREHOOK: query: explain alter table ice_orc COMPACT 'major' and wait
 PREHOOK: type: ALTERTABLE_COMPACT
 PREHOOK: Input: default@ice_orc
@@ -305,6 +369,9 @@ POSTHOOK: query: select * from ice_orc
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@ice_orc
 #### A masked pattern was here ####
+NULL   NULL    NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
 fn2    ln2a    1       10      100
 fn3    ln3a    1       11      100
 fn4    ln4a    1       11      100
@@ -341,20 +408,20 @@ Table Parameters:
        bucketing_version       2                   
        current-schema          
{\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"first_name\",\"required\":false,\"type\":\"string\"},{\"id\":2,\"name\":\"last_name\",\"required\":false,\"type\":\"string\"},{\"id\":3,\"name\":\"dept_id\",\"required\":false,\"type\":\"long\"},{\"id\":4,\"name\":\"team_id\",\"required\":false,\"type\":\"long\"},{\"id\":5,\"name\":\"company_id\",\"required\":false,\"type\":\"long\"}]}
        current-snapshot-id     #Masked#
-       current-snapshot-summary        
{\"added-data-files\":\"1\",\"deleted-data-files\":\"8\",\"removed-position-delete-files\":\"5\",\"removed-delete-files\":\"5\",\"added-records\":\"3\",\"deleted-records\":\"8\",\"added-files-size\":\"#Masked#\",\"removed-files-size\":\"#Masked#\",\"removed-position-deletes\":\"5\",\"changed-partition-count\":\"5\",\"total-records\":\"6\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"2\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"
 [...]
+       current-snapshot-summary        
{\"added-data-files\":\"2\",\"deleted-data-files\":\"10\",\"removed-position-delete-files\":\"5\",\"removed-delete-files\":\"5\",\"added-records\":\"5\",\"deleted-records\":\"10\",\"added-files-size\":\"#Masked#\",\"removed-files-size\":\"#Masked#\",\"removed-position-deletes\":\"5\",\"changed-partition-count\":\"8\",\"total-records\":\"9\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"4\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",
 [...]
        current-snapshot-timestamp-ms   #Masked#       
        default-partition-spec  
{\"spec-id\":1,\"fields\":[{\"name\":\"company_id\",\"transform\":\"identity\",\"source-id\":5,\"field-id\":1000},{\"name\":\"dept_id\",\"transform\":\"identity\",\"source-id\":3,\"field-id\":1001}]}
        format-version          2                   
        hive.compactor.worker.pool      iceberg             
        iceberg.orc.files.only  true                
 #### A masked pattern was here ####
-       numFiles                2                   
-       numRows                 6                   
+       numFiles                4                   
+       numRows                 9                   
        parquet.compression     zstd                
 #### A masked pattern was here ####
        rawDataSize             0                   
        serialization.format    1                   
-       snapshot-count          20                  
+       snapshot-count          23                  
        storage_handler         
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
        table_type              ICEBERG             
        totalSize               #Masked#
@@ -379,3 +446,19 @@ CompactionId       Database        Table   Partition       
Type    State   Worker host     Worker  Enqueue Time
 #Masked#       default ice_orc company_id=100/dept_id=1        MAJOR   
succeeded       #Masked#        manual  iceberg 0       0       0        --- 
 #Masked#       default ice_orc company_id=100/dept_id=2        MAJOR   
succeeded       #Masked#        manual  iceberg 0       0       0        --- 
 #Masked#       default ice_orc  ---    MAJOR   succeeded       #Masked#        
manual  iceberg 0       0       0        --- 
+PREHOOK: query: select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select `partition`, spec_id, content, record_count
+from default.ice_orc.files
+order by `partition`, spec_id, content, record_count
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_orc
+#### A masked pattern was here ####
+{"company_id":100,"dept_id":1,"team_id":null}  1       0       3
+{"company_id":100,"dept_id":2,"team_id":null}  1       0       3
+{"company_id":null,"dept_id":null,"team_id":null}      1       0       1
+{"company_id":null,"dept_id":null,"team_id":null}      1       0       2

Reply via email to