Re: [PR] HIVE-28586 Support write order for Iceberg tables at CREATE TABLE [hive]

via GitHub Sun, 22 Dec 2024 01:52:34 -0800


okumin commented on code in PR #5541:
URL: https://github.com/apache/hive/pull/5541#discussion_r1894863356



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java:
##########
@@ -16089,7 +16097,26 @@ protected void 
addPartitionColsToInsert(List<FieldSchema> partCols,
       rewrittenQueryStr.append(")");
     }
   }
-
+  
+  private String getSortOrderJson(ASTNode ast) {
+    List<SortFieldDesc> sortFieldDescList = new ArrayList<>();
+    SortFields sortFields = new SortFields(sortFieldDescList);
+    for (int i = 0; i < ast.getChildCount(); i++) {
+      ASTNode child = (ASTNode) ast.getChild(i);
+      SortFieldDesc.SortDirection sortDirection = child.getToken()
+          .getType() == HiveParser.TOK_TABSORTCOLNAMEDESC ? 
SortFieldDesc.SortDirection.DESC : SortFieldDesc.SortDirection.ASC;
+      child = (ASTNode) child.getChild(0);
+      String name = 
unescapeIdentifier(child.getChild(0).getText()).toLowerCase();
+      SortFieldDesc.NullOrder nullOrder = child.getToken().getType() == 
HiveParser.TOK_NULLS_FIRST ? SortFieldDesc.NullOrder.NULLS_FIRST : 
SortFieldDesc.NullOrder.NULLS_LAST;
+      sortFieldDescList.add(new SortFieldDesc(name, sortDirection, nullOrder));
+    }
+    try {
+      return jsonObjectMapper.writeValueAsString(sortFields);
+    } catch (JsonProcessingException e) {
+      return null;
+    }
+    //return new JSONObject().put("fields", new 
JSONArray(sortFields.stream().map(SortFieldDesc::toString).collect(Collectors.toList()))).toString();

Review Comment:
   Can we delete this one?



##########
iceberg/iceberg-handler/src/test/queries/positive/iceberg_create_locally_ordered_table.q:
##########
@@ -0,0 +1,44 @@
+-- Mask neededVirtualColumns due to non-strict order
+--! qt:replace:/(\s+neededVirtualColumns:\s)(.*)/$1#Masked#/
+-- Mask the totalSize value as it can have slight variability, causing test 
flakiness
+--! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/
+-- Mask random uuid
+--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/
+-- Mask a random snapshot id
+--! qt:replace:/(\s+current-snapshot-id\s+)\S+(\s*)/$1#Masked#/
+-- Mask added file size
+--! qt:replace:/(\S\"added-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/
+-- Mask total file size
+--! qt:replace:/(\S\"total-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/
+-- Mask removed file size
+--! qt:replace:/(\S\"removed-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/
+-- Mask current-snapshot-timestamp-ms
+--! qt:replace:/(\s+current-snapshot-timestamp-ms\s+)\S+(\s*)/$1#Masked#$2/
+--! 
qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/
+-- Mask iceberg version
+--! 
qt:replace:/(\S\"iceberg-version\\\":\\\")(\w+\s\w+\s\d+\.\d+\.\d+\s\(\w+\s\w+\))(\\\")/$1#Masked#$3/
+set hive.llap.io.enabled=true;
+set hive.vectorized.execution.enabled=true;
+set hive.optimize.shared.work.merge.ts.schema=true;
+
+create table ice_orc (id int, text string) stored by iceberg stored as orc;
+
+insert into ice_orc values (3, "3"),(2, "2"),(4, "4"),(5, "5"),(1, "1"),(2, 
"3"),(3,null),(2,null),(null,"a");
+
+describe formatted ice_orc;
+describe extended ice_orc;
+set hive.fetch.task.conversion=more;
+select * from ice_orc;
+
+create table ice_orc_sorted (id int, text string) write locally ordered by id 
desc nulls first, text asc nulls last stored by iceberg stored as orc;

Review Comment:
   Can we add one more sort key that uses the default direction and null 
ordering? I expect ASC + NULLS FIRST is used by default.
   
   ```
   spark-sql (default)> CREATE TABLE hadoop_prod.default.test_spark (a int) 
USING iceberg;
   Time taken: 1.956 seconds
   spark-sql (default)> ALTER TABLE hadoop_prod.default.test_spark WRITE 
ORDERED BY a;
   Time taken: 0.181 seconds
   ```
   
   ```
   $ hdfs dfs -cat 
/user/hive/warehouse/catalog/default/test_spark/metadata/v2.metadata.json
   ...
     "default-sort-order-id" : 1,
     "sort-orders" : [ {
       "order-id" : 0,
       "fields" : [ ]
     }, {
       "order-id" : 1,
       "fields" : [ {
         "transform" : "identity",
         "source-id" : 1,
         "direction" : "asc",
         "null-order" : "nulls-first"
       } ]
     } ],
   ```



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java:
##########
@@ -16089,7 +16097,26 @@ protected void 
addPartitionColsToInsert(List<FieldSchema> partCols,
       rewrittenQueryStr.append(")");
     }
   }
-
+  
+  private String getSortOrderJson(ASTNode ast) {
+    List<SortFieldDesc> sortFieldDescList = new ArrayList<>();
+    SortFields sortFields = new SortFields(sortFieldDescList);
+    for (int i = 0; i < ast.getChildCount(); i++) {
+      ASTNode child = (ASTNode) ast.getChild(i);
+      SortFieldDesc.SortDirection sortDirection = child.getToken()
+          .getType() == HiveParser.TOK_TABSORTCOLNAMEDESC ? 
SortFieldDesc.SortDirection.DESC : SortFieldDesc.SortDirection.ASC;
+      child = (ASTNode) child.getChild(0);
+      String name = 
unescapeIdentifier(child.getChild(0).getText()).toLowerCase();
+      SortFieldDesc.NullOrder nullOrder = child.getToken().getType() == 
HiveParser.TOK_NULLS_FIRST ? SortFieldDesc.NullOrder.NULLS_FIRST : 
SortFieldDesc.NullOrder.NULLS_LAST;
+      sortFieldDescList.add(new SortFieldDesc(name, sortDirection, nullOrder));
+    }
+    try {
+      return jsonObjectMapper.writeValueAsString(sortFields);
+    } catch (JsonProcessingException e) {
+      return null;
+    }
+    //return new JSONObject().put("fields", new 
JSONArray(sortFields.stream().map(SortFieldDesc::toString).collect(Collectors.toList()))).toString();
+  }
   @Override

Review Comment:
   We may want a blank line here.



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java:
##########
@@ -16089,7 +16097,26 @@ protected void 
addPartitionColsToInsert(List<FieldSchema> partCols,
       rewrittenQueryStr.append(")");
     }
   }
-
+  
+  private String getSortOrderJson(ASTNode ast) {
+    List<SortFieldDesc> sortFieldDescList = new ArrayList<>();
+    SortFields sortFields = new SortFields(sortFieldDescList);
+    for (int i = 0; i < ast.getChildCount(); i++) {
+      ASTNode child = (ASTNode) ast.getChild(i);
+      SortFieldDesc.SortDirection sortDirection = child.getToken()
+          .getType() == HiveParser.TOK_TABSORTCOLNAMEDESC ? 
SortFieldDesc.SortDirection.DESC : SortFieldDesc.SortDirection.ASC;
+      child = (ASTNode) child.getChild(0);
+      String name = 
unescapeIdentifier(child.getChild(0).getText()).toLowerCase();
+      SortFieldDesc.NullOrder nullOrder = child.getToken().getType() == 
HiveParser.TOK_NULLS_FIRST ? SortFieldDesc.NullOrder.NULLS_FIRST : 
SortFieldDesc.NullOrder.NULLS_LAST;
+      sortFieldDescList.add(new SortFieldDesc(name, sortDirection, nullOrder));
+    }
+    try {
+      return jsonObjectMapper.writeValueAsString(sortFields);
+    } catch (JsonProcessingException e) {

Review Comment:
   Can we add a warning message here?



##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/SortFields.java:
##########
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.ddl.misc.sortoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+public class SortFields {
+  
+  private List<SortFieldDesc> sortFields;
+
+  public SortFields(){
+    this.sortFields = new LinkedList<>();
+  }

Review Comment:
   I'd insert a line break



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java:
##########
@@ -14415,7 +14423,7 @@ ASTNode analyzeCreateTable(
         throw new SemanticException(
             "Partition columns can only declared using their names in CTAS 
statements");
       }
-
+      

Review Comment:
   Please clean this up



##########
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java:
##########
@@ -271,6 +281,31 @@ public void preCreateTable(CreateTableRequest request) {
     setOrcOnlyFilesParam(hmsTable);
     // Remove hive primary key columns from table request, as iceberg doesn't 
support hive primary key.
     request.setPrimaryKeys(null);
+    setSortOrder(hmsTable, schema, catalogProperties);
+  }
+
+  private void setSortOrder(org.apache.hadoop.hive.metastore.api.Table 
hmsTable, Schema schema,
+      Properties properties) {
+    String sortOderJSONString = 
hmsTable.getParameters().get(TableProperties.DEFAULT_SORT_ORDER);
+    SortFields sortFields = null;
+    if (!Strings.isNullOrEmpty(sortOderJSONString)) {
+      try {
+        sortFields = jsonObjectMapper.readValue(sortOderJSONString, 
SortFields.class);
+      } catch (JsonProcessingException e) {
+        return;

Review Comment:
   Can we add a warning?



##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/SortFieldDesc.java:
##########
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.ddl.misc.sortoder;
+
+public class SortFieldDesc {
+
+  private String columnName;
+  private NullOrder nullOrder;
+  private SortDirection direction;
+
+  public SortFieldDesc(){

Review Comment:
   I'd add a space after the right paren



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java:
##########
@@ -966,15 +967,10 @@ protected List<Order> getColumnNamesOrder(ASTNode ast) 
throws SemanticException
       ASTNode child = (ASTNode) ast.getChild(i);
       int directionCode = 
DirectionUtils.tokenToCode(child.getToken().getType());
       child = (ASTNode) child.getChild(0);
-      if (child.getToken().getType() != HiveParser.TOK_NULLS_FIRST && 
directionCode == DirectionUtils.ASCENDING_CODE) {
-        throw new SemanticException(
-                "create/alter bucketed table: not supported NULLS LAST for 
SORTED BY in ASC order");
-      }
-      if (child.getToken().getType() != HiveParser.TOK_NULLS_LAST && 
directionCode == DirectionUtils.DESCENDING_CODE) {
-        throw new SemanticException(
-                "create/alter bucketed table: not supported NULLS FIRST for 
SORTED BY in DESC order");
-      }
-      colList.add(new 
Order(unescapeIdentifier(child.getChild(0).getText()).toLowerCase(), 
directionCode));
+      Order order = new 
Order(unescapeIdentifier(child.getChild(0).getText()).toLowerCase(), 
directionCode);
+      order.setNullOrdering(child.getToken().getType()== 
HiveParser.TOK_NULLS_FIRST?

Review Comment:
   We'd like a space between `getType()` and `==`, and between 
`TOK_NULLS_FIRST` and `?`



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java:
##########
@@ -115,6 +117,7 @@
 import org.apache.hadoop.security.alias.AbstractJavaKeyStoreProvider;
 import org.apache.hadoop.security.alias.CredentialProvider;
 import org.apache.hadoop.security.alias.CredentialProviderFactory;
+import org.json.JSONArray;

Review Comment:
   Could you please revert changes on this file?



##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/SortFields.java:
##########
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.ddl.misc.sortoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+public class SortFields {
+  
+  private List<SortFieldDesc> sortFields;
+
+  public SortFields(){

Review Comment:
   I'd add a space after the right paren



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-28586 Support write order for Iceberg tables at CREATE TABLE [hive]

Reply via email to