wecharyu commented on code in PR #5539:
URL: https://github.com/apache/hive/pull/5539#discussion_r1890470526
##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/info/show/status/formatter/ShowTableStatusFormatter.java:
##########
@@ -22,9 +22,15 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.GetPartitionsFilterSpec;
Review Comment:
nit: remove unused imports.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java:
##########
@@ -4466,6 +4526,81 @@ public List<Partition> getPartitionsByFilter(Table tbl,
String filter)
return convertFromMetastore(tbl, tParts);
}
+ public List<Partition> getPartitionsWithSpecs(Table tbl,
GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ int batchSize= MetastoreConf.getIntVar(Hive.get().getConf(),
MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX);
+ if(batchSize > 0){
+ return new ArrayList<>(getAllPartitionsWithSpecsInBatches(tbl,
batchSize, DEFAULT_BATCH_DECAYING_FACTOR, MetastoreConf.getIntVar(
+ Hive.get().getConf(),
MetastoreConf.ConfVars.GETPARTITIONS_BATCH_MAX_RETRIES), request));
+ }else{
+ return getPartitionsWithSpecsInternal(tbl, request);
+ }
+ }
+
+ public List<Partition> getPartitionsWithSpecsInternal(Table tbl,
GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ GetPartitionsResponse response = getMSC().getPartitionsWithSpecs(request);
+ List<org.apache.hadoop.hive.metastore.api.PartitionSpec> partitionSpecs =
response.getPartitionSpec();
+ List<Partition> partitions = new ArrayList<>();
+ partitions.addAll(convertFromPartSpec(partitionSpecs.iterator(), tbl));
+
+ return partitions;
+ }
+
+ List<Partition> getPartitionsWithSpecsByNames(Table tbl, List<String>
partNames, GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ List<Partition> partitions = new ArrayList<Partition>(partNames.size());
+
+ int batchSize = HiveConf.getIntVar(conf,
HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX);
+ int nParts = partNames.size();
+ int nBatches = nParts / batchSize;
+ // I do not want to modify the original request when implementing
batching, hence we will know what actual request was being made
+ GetPartitionsRequest req = request;
+ if (!req.isSetFilterSpec()) {
+ req.setFilterSpec(new GetPartitionsFilterSpec());
+ }
+
+ try {
+ for (int i = 0; i < nBatches; ++i) {
+ req.getFilterSpec().setFilters(partNames.subList(i*batchSize,
(i+1)*batchSize));
+ req.getFilterSpec().setFilterMode(PartitionFilterMode.BY_NAMES);
+ List<Partition> tParts = getPartitionsWithSpecsInternal(tbl, request);
+
+ if (tParts != null) {
+ for (Partition tpart: tParts) {
+ partitions.add(new Partition(tbl, tpart.getTPartition()));
Review Comment:
can we use `partitions.addAll(tParts)` directly here?
##########
ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplExternalTables.java:
##########
@@ -97,15 +100,22 @@ void dataLocationDump(Table table, FileList fileList,
HashMap<String, Boolean> s
}
if (table.isPartitioned()) {
List<Partition> partitions;
+ GetPartitionsRequest request = new
GetPartitionsRequest(table.getDbName(), table.getTableName(),
+ null, null);
+ request.setCatName(table.getCatName());
+ request.setProjectionSpec(
+ new
org.apache.hadoop.hive.metastore.client.builder.GetPartitionProjectionsSpecBuilder()
+ .addProjectField("catName").addProjectField("tableName")
Review Comment:
use `addProjectFieldList()` here.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java:
##########
@@ -4086,6 +4091,37 @@ public List<String> getPartitionNames(String dbName,
String tblName,
return names;
}
+ // get partition names from provided partition values
+ public List<String> getPartitionNames(String dbName, String tblName,
+ short max, List<String> pVals) throws HiveException {
+ List<String> names = null;
+ Table t = getTable(dbName, tblName);
+
+ try {
+ GetPartitionNamesPsRequest req = new GetPartitionNamesPsRequest();
+ req.setTblName(tblName);
+ req.setDbName(dbName);
+ req.setPartValues(pVals);
+ req.setMaxParts(max);
+ if (AcidUtils.isTransactionalTable(t)) {
+ ValidWriteIdList validWriteIdList = getValidWriteIdList(dbName,
tblName);
+ req.setValidWriteIdList(validWriteIdList != null ?
validWriteIdList.toString() : null);
+ req.setId(t.getTTable().getId());
+ }
+ GetPartitionNamesPsResponse res =
getMSC().listPartitionNamesRequest(req);
+ names = res.getNames();
+ } catch (NoSuchObjectException nsoe) {
+ // this means no partition exists for the given partition spec
+ // key value pairs - thrift cannot handle null return values, hence
+ // listPartitionNames() throws NoSuchObjectException to indicate null
partitions
+ return Lists.newArrayList();
Review Comment:
The `NoSuchObjectException` is thrown when the table does not exist; it does
not mean the partition does not exist.
##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/partition/exchange/AlterTableExchangePartitionAnalyzer.java:
##########
@@ -77,9 +85,24 @@ protected void analyzeCommand(TableName tableName,
Map<String, String> partition
if (AcidUtils.isTransactionalTable(sourceTable) ||
AcidUtils.isTransactionalTable(destTable)) {
throw new
SemanticException(ErrorMsg.EXCHANGE_PARTITION_NOT_ALLOWED_WITH_TRANSACTIONAL_TABLES.getMsg());
}
+ List<String> sourceProjectFilters =
MetaStoreUtils.getPvals(sourceTable.getPartCols(), partitionSpecs);
// check if source partition exists
- PartitionUtils.getPartitions(db, sourceTable, partitionSpecs, true);
+ GetPartitionsFilterSpec getSourcePartitionsFilterSpec = new
GetPartitionsFilterSpec();
+ getSourcePartitionsFilterSpec.setFilters(sourceProjectFilters);
+ getSourcePartitionsFilterSpec.setFilterMode(PartitionFilterMode.BY_VALUES);
+
+ GetProjectionsSpec getProjectionsSpec = new
GetPartitionProjectionsSpecBuilder()
+ .addProjectFieldList(Arrays.asList("values","lastAccessTime")).build();
+
+ GetPartitionsRequest request = new
GetPartitionsRequest(sourceTable.getDbName(), sourceTable.getTableName(),
+ getProjectionsSpec, getSourcePartitionsFilterSpec);
+ request.setCatName(sourceTable.getCatName());
+ try {
+ PartitionUtils.getPartitionsWithSpecs(db, sourceTable, request, true);
+ } catch (SemanticException ex) {
Review Comment:
The `try ... catch()` clause is unnecessary here.
##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/client/builder/GetPartitionProjectionsSpecBuilder.java:
##########
@@ -41,6 +42,12 @@ public GetPartitionProjectionsSpecBuilder
addProjectField(String field) {
return this;
}
+ public GetPartitionProjectionsSpecBuilder addProjectFieldList(List<String>
fields) {
+ fieldList.addAll(Arrays.asList("catName","dbName","tableName"));
Review Comment:
why are these fields needed here?
##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/partition/exchange/AlterTableExchangePartitionAnalyzer.java:
##########
@@ -77,9 +85,24 @@ protected void analyzeCommand(TableName tableName,
Map<String, String> partition
if (AcidUtils.isTransactionalTable(sourceTable) ||
AcidUtils.isTransactionalTable(destTable)) {
throw new
SemanticException(ErrorMsg.EXCHANGE_PARTITION_NOT_ALLOWED_WITH_TRANSACTIONAL_TABLES.getMsg());
}
+ List<String> sourceProjectFilters =
MetaStoreUtils.getPvals(sourceTable.getPartCols(), partitionSpecs);
// check if source partition exists
- PartitionUtils.getPartitions(db, sourceTable, partitionSpecs, true);
+ GetPartitionsFilterSpec getSourcePartitionsFilterSpec = new
GetPartitionsFilterSpec();
+ getSourcePartitionsFilterSpec.setFilters(sourceProjectFilters);
+ getSourcePartitionsFilterSpec.setFilterMode(PartitionFilterMode.BY_VALUES);
+
+ GetProjectionsSpec getProjectionsSpec = new
GetPartitionProjectionsSpecBuilder()
+ .addProjectFieldList(Arrays.asList("values","lastAccessTime")).build();
Review Comment:
the project field list seems unnecessary since we are just checking whether
the partition exists?
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java:
##########
@@ -4466,6 +4526,81 @@ public List<Partition> getPartitionsByFilter(Table tbl,
String filter)
return convertFromMetastore(tbl, tParts);
}
+ public List<Partition> getPartitionsWithSpecs(Table tbl,
GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ int batchSize= MetastoreConf.getIntVar(Hive.get().getConf(),
MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX);
+ if(batchSize > 0){
+ return new ArrayList<>(getAllPartitionsWithSpecsInBatches(tbl,
batchSize, DEFAULT_BATCH_DECAYING_FACTOR, MetastoreConf.getIntVar(
+ Hive.get().getConf(),
MetastoreConf.ConfVars.GETPARTITIONS_BATCH_MAX_RETRIES), request));
+ }else{
+ return getPartitionsWithSpecsInternal(tbl, request);
+ }
+ }
+
+ public List<Partition> getPartitionsWithSpecsInternal(Table tbl,
GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ GetPartitionsResponse response = getMSC().getPartitionsWithSpecs(request);
+ List<org.apache.hadoop.hive.metastore.api.PartitionSpec> partitionSpecs =
response.getPartitionSpec();
+ List<Partition> partitions = new ArrayList<>();
+ partitions.addAll(convertFromPartSpec(partitionSpecs.iterator(), tbl));
+
+ return partitions;
+ }
+
+ List<Partition> getPartitionsWithSpecsByNames(Table tbl, List<String>
partNames, GetPartitionsRequest request)
+ throws HiveException, TException {
+
+ if (!tbl.isPartitioned()) {
+ throw new HiveException(ErrorMsg.TABLE_NOT_PARTITIONED,
tbl.getTableName());
+ }
+ List<Partition> partitions = new ArrayList<Partition>(partNames.size());
+
+ int batchSize = HiveConf.getIntVar(conf,
HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX);
+ int nParts = partNames.size();
+ int nBatches = nParts / batchSize;
+ // I do not want to modify the original request when implementing
batching, hence we will know what actual request was being made
+ GetPartitionsRequest req = request;
+ if (!req.isSetFilterSpec()) {
+ req.setFilterSpec(new GetPartitionsFilterSpec());
+ }
+
+ try {
+ for (int i = 0; i < nBatches; ++i) {
Review Comment:
use `org.apache.hadoop.hive.metastore.Batchable` to run batches.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]