deniskuzZ commented on code in PR #5934:
URL: https://github.com/apache/hive/pull/5934#discussion_r2188343170


##########
ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java:
##########
@@ -71,8 +70,8 @@ public class SplitGrouper {
 
   // TODO This needs to be looked at. Map of Map to Map... Made concurrent for 
now since split generation
   // can happen in parallel.
-  private static final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> 
cache =
-      new ConcurrentHashMap<>();
+  private final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache =

Review Comment:
   updated
   ````
   From 1fa2dac1659cec3fdedb15a60fe269335a00c67a Mon Sep 17 00:00:00 2001
   From: Denys Kuzmenko <denisk...@gmail.com>
   Date: Sun, 6 Jul 2025 15:40:18 +0300
   Subject: [PATCH 1/1] test
   
   ---
    .../hadoop/hive/ql/exec/tez/SplitGrouper.java | 24 ++++++++----
    .../hive/ql/io/HiveFileFormatUtils.java       | 39 +++++++++----------
    2 files changed, 35 insertions(+), 28 deletions(-)
   
   diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
   index 2ce7ed88c8..902fd232ed 100644
   --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
   @@ -29,11 +29,12 @@
    import java.util.List;
    import java.util.Map;
    import java.util.Objects;
   -import java.util.concurrent.ConcurrentHashMap;
   +import java.util.concurrent.TimeUnit;
    
   +import com.github.benmanes.caffeine.cache.Cache;
   +import com.github.benmanes.caffeine.cache.Caffeine;
    import org.apache.hadoop.hive.ql.io.AcidUtils;
    import org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
   -import org.apache.hadoop.mapred.InputFormat;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.apache.hadoop.conf.Configuration;
   @@ -69,10 +70,10 @@ public class SplitGrouper {
    
      private static final Logger LOG = 
LoggerFactory.getLogger(SplitGrouper.class);
    
   -  // TODO This needs to be looked at. Map of Map to Map... Made concurrent 
for now since split generation
   -  // can happen in parallel.
   -  private static final Map<Map<Path, PartitionDesc>, Map<Path, 
PartitionDesc>> cache =
   -      new ConcurrentHashMap<>();
   +  private static final Cache<Path, Path> cache = Caffeine.newBuilder()
   +      .maximumSize(1000000)
   +      .expireAfterWrite(1, TimeUnit.DAYS)
   +      .build();
    
      private final TezMapredSplitsGrouper tezGrouper = new 
TezMapredSplitsGrouper();
    
   @@ -412,8 +413,17 @@ private static MapWork populateMapWork(JobConf jobConf, 
String inputName) {
        return work;
      }
    
   +  private static void checkAndClearCacheIfNeeded() {
   +    if (cache.estimatedSize() > MAX_CACHE_SIZE) {
   +        cache.invalidateAll();
   +    }
   +  }
   +
      private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean 
groupAcrossFiles,
   -                                       MapWork work) throws IOException {
   +                                     MapWork work) throws IOException {
   +    // Check cache size periodically
   +    checkAndClearCacheIfNeeded();
   +
        boolean retval = false;
        Path path = ((FileSplit) s).getPath();
        PartitionDesc pd = HiveFileFormatUtils.getFromPathRecursively(
   diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
   index 20fde98c26..644ce9a3fd 100644
   --- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
   @@ -360,17 +360,17 @@ private static RecordUpdater getRecordUpdater(JobConf 
jc,
      }
    
      public static <T> T getFromPathRecursively(Map<Path, T> 
pathToPartitionInfo, Path dir,
   -      Map<Map<Path, T>, Map<Path, T>> cacheMap) throws IOException {
   +      Map<Path, Path> cacheMap) throws IOException {
        return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, 
false);
      }
    
      public static <T> T getFromPathRecursively(Map<Path, T> 
pathToPartitionInfo, Path dir,
   -      Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema) 
throws IOException {
   +      Map<Path, Path> cacheMap, boolean ignoreSchema) throws IOException {
        return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, 
ignoreSchema, false);
      }
    
      public static <T> T getFromPathRecursively(Map<Path, T> 
pathToPartitionInfo, Path dir,
   -      Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema, 
boolean ifPresent)
   +      Map<Path, Path> cacheMap, boolean ignoreSchema, boolean ifPresent)
              throws IOException {
        T part = getFromPath(pathToPartitionInfo, dir);
    
   @@ -379,18 +379,7 @@ public static <T> T getFromPathRecursively(Map<Path, T> 
pathToPartitionInfo, Pat
                || (dir.toUri().getScheme() == null || 
dir.toUri().getScheme().trim().equals(""))
                || 
FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {
    
   -      Map<Path, T> newPathToPartitionInfo = null;
   -      if (cacheMap != null) {
   -        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
   -      }
   -
   -      if (newPathToPartitionInfo == null) { // still null
   -        newPathToPartitionInfo = populateNewT(pathToPartitionInfo);
   -
   -        if (cacheMap != null) {
   -          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
   -        }
   -      }
   +      Map<Path, T> newPathToPartitionInfo = 
populateNewT(pathToPartitionInfo, cacheMap);
          part = getFromPath(newPathToPartitionInfo, dir);
        }
        if (part != null || ifPresent) {
   @@ -401,13 +390,21 @@ public static <T> T getFromPathRecursively(Map<Path, 
T> pathToPartitionInfo, Pat
        }
      }
    
   -  private static <T> Map<Path, T> populateNewT(Map<Path, T> 
pathToPartitionInfo) {
   -    Map<Path, T> newPathToPartitionInfo = new HashMap<>();
   -    for (Map.Entry<Path, T> entry: pathToPartitionInfo.entrySet()) {
   -      T partDesc = entry.getValue();
   -      Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
   +  private static <T> Map<Path, T> populateNewT(Map<Path, T> 
pathToPartitionInfo,
   +      Map<Path, Path> cacheMap) {
   +    Map<Path, T> newPathToPartitionInfo = new 
HashMap<>(pathToPartitionInfo.size());
   +
   +    pathToPartitionInfo.forEach((originalPath, partDesc) -> {
   +      Path pathOnly = cacheMap != null ?
   +        cacheMap.get(originalPath) : null;
   +      if (pathOnly == null) {
   +        pathOnly = Path.getPathWithoutSchemeAndAuthority(originalPath);
   +        if (cacheMap != null) {
   +          cacheMap.put(originalPath, pathOnly);
   +        }
   +      }
          newPathToPartitionInfo.put(pathOnly, partDesc);
   -    }
   +    });
        return newPathToPartitionInfo;
      }
    
   -- 
   2.48.0
   
   
   ````
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org
For additional commands, e-mail: gitbox-h...@hive.apache.org

Reply via email to