[ 
https://issues.apache.org/jira/browse/HIVE-24350?focusedWorklogId=511170&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-511170
 ]

ASF GitHub Bot logged work on HIVE-24350:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 13/Nov/20 01:54
            Start Date: 13/Nov/20 01:54
    Worklog Time Spent: 10m 
      Work Description: mustafaiman commented on a change in pull request #1645:
URL: https://github.com/apache/hive/pull/1645#discussion_r522558974



##########
File path: 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
##########
@@ -93,94 +91,109 @@ private String getAliasForTableScanOperator(MapWork work,
     return null;
   }
 
-  private PartitionDesc changePartitionToMetadataOnly(PartitionDesc desc,
-      Path path) {
-    if (desc == null) {
-      return null;
-    }
-    FileStatus[] filesFoundInPartitionDir = null;
+  private void lookupAndProcessPath(MapWork work, Path path,
+                           Collection<String> aliasesToOptimize) {
     try {
-      filesFoundInPartitionDir = 
Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path);
+      boolean isEmpty = 
Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path).length == 0;
+      processPath(work, path, aliasesToOptimize, isEmpty);
     } catch (IOException e) {
-      LOG.error("Cannot determine if the table is empty", e);
-    }
-    if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) {
-      return desc;
+      LOG.warn("Could not determine if path {} was empty." +
+          "Cannot use null scan optimization for this path.", path, e);
     }
-
-    boolean isEmpty = filesFoundInPartitionDir == null || 
filesFoundInPartitionDir.length == 0;
-    desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : 
OneNullRowInputFormat.class);
-    desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
-    desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
-        NullStructSerDe.class.getName());
-    return desc;
   }
 
-  private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir) 
{
-    if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 
0) {
-      return true; // empty folders are safe to convert to metadata-only
-    }
-    for (FileStatus f : filesFoundInPartitionDir) {
-      if (AcidUtils.isDeleteDelta(f.getPath())) {
-        /*
-         * as described in HIVE-23712, an acid partition is not a safe subject 
of metadata-only
-         * optimization, because there is a chance that it contains no data 
but contains folders
-         * (e.g: delta_0000002_0000002_0000, 
delete_delta_0000003_0000003_0000), without scanning
-         * the underlying file contents, we cannot tell whether this partition 
contains data or not
-         */
-        return false;
-      }
+  private void processPath(MapWork work, Path path, Collection<String> 
aliasesToOptimize,
+                           boolean isEmpty) {
+    PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
+    partDesc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : 
OneNullRowInputFormat.class);
+    partDesc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
+    partDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
+        NullStructSerDe.class.getName());
+    Path fakePath =
+        new Path(NullScanFileSystem.getBase() + partDesc.getTableName()
+            + "/part" + encode(partDesc.getPartSpec()));
+    StringInternUtils.internUriStringsInPath(fakePath);
+    work.addPathToPartitionInfo(fakePath, partDesc);
+    work.addPathToAlias(fakePath, new ArrayList<>(aliasesToOptimize));
+    Collection<String> aliasesContainingPath = 
work.getPathToAliases().get(path);
+    aliasesContainingPath.removeAll(aliasesToOptimize);
+    if (aliasesContainingPath.isEmpty()) {
+      work.removePathToAlias(path);
+      work.removePathToPartitionInfo(path);
     }
-    return true;
   }
 
-  private void processAlias(MapWork work, Path path,
-      Collection<String> aliasesAffected, Set<String> aliases) {
-    // the aliases that are allowed to map to a null scan.
-    Collection<String> allowed = aliasesAffected.stream()
-        .filter(a -> aliases.contains(a)).collect(Collectors.toList());
-    if (!allowed.isEmpty()) {
-      PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
-      PartitionDesc newPartition =
-          changePartitionToMetadataOnly(partDesc, path);
-      // Prefix partition with something to avoid it being a hidden file.
-      Path fakePath =
-          new Path(NullScanFileSystem.getBase() + newPartition.getTableName()
-              + "/part" + encode(newPartition.getPartSpec()));
-      StringInternUtils.internUriStringsInPath(fakePath);
-      work.addPathToPartitionInfo(fakePath, newPartition);
-      work.addPathToAlias(fakePath, new ArrayList<>(allowed));
-      aliasesAffected.removeAll(allowed);
-      if (aliasesAffected.isEmpty()) {
-        work.removePathToAlias(path);
-        work.removePathToPartitionInfo(path);
-      }
-    }
+  private void addCandidatePath(Map<Path, Collection<String>> aliasesForPath, 
Path path, String alias) {
+    Collection<String> aliases = aliasesForPath.computeIfAbsent(path, k -> new 
ArrayList<>());
+    aliases.add(alias);
   }
 
-  private void processAlias(MapWork work, Set<TableScanOperator> tableScans) {
-    Set<String> aliases = new HashSet<>();
+  private void processTableScans(MapWork work, Set<TableScanOperator> 
tableScans) {
+    Map<Path, Boolean> managedEmptyPathMap = new HashMap<>();
+    Map<Path, Collection<String>> candidatePathsToAliases = new HashMap<>();
+    Map<String, Boolean> aliasTypeMap = new HashMap<>();
+    Map<String, Map<Path, Partition>> allowedAliasesToPartitions = new 
HashMap<>();
     for (TableScanOperator tso : tableScans) {
       // use LinkedHashMap<String, Operator<? extends OperatorDesc>>
       // getAliasToWork() should not apply this for non-native table
       if (tso.getConf().getTableMetadata().getStorageHandler() != null) {
         continue;
       }
       String alias = getAliasForTableScanOperator(work, tso);
-      aliases.add(alias);
+      boolean isManagedTable = 
!MetaStoreUtils.isExternalTable(tso.getConf().getTableMetadata().getTTable());
+      aliasTypeMap.put(alias, isManagedTable);
+      allowedAliasesToPartitions.put(alias, getPathToPartitionMap(alias, tso));
       tso.getConf().setIsMetadataOnly(true);
     }
-    // group path alias according to work
-    Map<Path, List<String>> candidates = new HashMap<>();
+
     for (Path path : work.getPaths()) {
-      List<String> aliasesAffected = work.getPathToAliases().get(path);
-      if (CollectionUtils.isNotEmpty(aliasesAffected)) {
-        candidates.put(path, aliasesAffected);
+      List<String> aliases = work.getPathToAliases().get(path);
+      for (String alias: aliases) {
+        Map<Path, Partition> pathToPartitionMap = 
allowedAliasesToPartitions.get(alias);
+        if (pathToPartitionMap == null) {
+          continue;
+        }
+        addCandidatePath(candidatePathsToAliases, path, alias);
+        Partition partitionObject = pathToPartitionMap.get(path);
+        Map<String, String> partitionParameters = partitionObject != null ? 
partitionObject.getParameters() : null;
+        boolean isManagedTable = aliasTypeMap.get(alias);
+        if (isManagedTable && partitionParameters != null && 
StatsSetupConst.areBasicStatsUptoDate(partitionParameters)) {
+          long rwCount = 
Long.parseLong(partitionParameters.get(StatsSetupConst.ROW_COUNT));
+          if (rwCount == 0) {
+            managedEmptyPathMap.put(path, true);
+          } else {
+            managedEmptyPathMap.put(path, false);
+          }
+        }
+      }
+    }
+
+    for (Entry<Path, Collection<String>> entry : 
candidatePathsToAliases.entrySet()) {

Review comment:
       Yes




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 511170)
    Time Spent: 1h 10m  (was: 1h)

> NullScanTaskDispatcher should use stats
> ---------------------------------------
>
>                 Key: HIVE-24350
>                 URL: https://issues.apache.org/jira/browse/HIVE-24350
>             Project: Hive
>          Issue Type: Sub-task
>            Reporter: Mustafa İman
>            Assignee: Mustafa İman
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 1h 10m
>  Remaining Estimate: 0h
>
> NullScanTaskDispatcher manually checks each partition directory to see if 
> they are empty. While this is necessary in external tables, we can just use 
> stats for managed tables.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to