deniskuzZ commented on code in PR #5934:
URL: https://github.com/apache/hive/pull/5934#discussion_r2188343170
##########
ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java:
##########
@@ -71,8 +70,8 @@ public class SplitGrouper {
   // TODO This needs to be looked at. Map of Map to Map... Made concurrent for now since split generation
   // can happen in parallel.
-  private static final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache =
-      new ConcurrentHashMap<>();
+  private final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache =

Review Comment:
   updated

````
From 1fa2dac1659cec3fdedb15a60fe269335a00c67a Mon Sep 17 00:00:00 2001
From: Denys Kuzmenko <denisk...@gmail.com>
Date: Sun, 6 Jul 2025 15:40:18 +0300
Subject: [PATCH 1/1] test

---
 .../hadoop/hive/ql/exec/tez/SplitGrouper.java | 24 ++++++++----
 .../hive/ql/io/HiveFileFormatUtils.java       | 39 +++++++++----------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
index 2ce7ed88c8..902fd232ed 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
@@ -29,11 +29,12 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
 import org.apache.hadoop.hive.ql.io.AcidUtils;
 import org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
-import org.apache.hadoop.mapred.InputFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -69,10 +70,10 @@ public class SplitGrouper {
 
   private static final Logger LOG = LoggerFactory.getLogger(SplitGrouper.class);
 
-  // TODO This needs to be looked at. Map of Map to Map... Made concurrent for now since split generation
-  // can happen in parallel.
-  private static final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache =
-      new ConcurrentHashMap<>();
+  private static final Cache<Path, Path> cache = Caffeine.newBuilder()
+      .maximumSize(1000000)
+      .expireAfterWrite(1, TimeUnit.DAYS)
+      .build();
 
   private final TezMapredSplitsGrouper tezGrouper = new TezMapredSplitsGrouper();
 
@@ -412,8 +413,17 @@ private static MapWork populateMapWork(JobConf jobConf, String inputName) {
     return work;
   }
 
+  private static void checkAndClearCacheIfNeeded() {
+    if (cache.size() > MAX_CACHE_SIZE) {
+      cache.clear();
+    }
+  }
+
   private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles,
-                                MapWork work) throws IOException {
+      MapWork work) throws IOException {
+    // Check cache size periodically
+    checkAndClearCacheIfNeeded();
+
     boolean retval = false;
     Path path = ((FileSplit) s).getPath();
     PartitionDesc pd = HiveFileFormatUtils.getFromPathRecursively(
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
index 20fde98c26..644ce9a3fd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
@@ -360,17 +360,17 @@ private static RecordUpdater getRecordUpdater(JobConf jc,
 
   public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir,
-      Map<Map<Path, T>, Map<Path, T>> cacheMap) throws IOException {
+      Map<Path, Path> cacheMap) throws IOException {
     return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, false);
   }
 
   public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir,
-      Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema) throws IOException {
+      Map<Path, Path> cacheMap, boolean ignoreSchema) throws IOException {
     return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, ignoreSchema, false);
   }
 
   public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir,
-      Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema, boolean ifPresent)
+      Map<Path, Path> cacheMap, boolean ignoreSchema, boolean ifPresent)
       throws IOException {
 
     T part = getFromPath(pathToPartitionInfo, dir);
@@ -379,18 +379,7 @@ public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Pat
         || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals(""))
         || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {
 
-      Map<Path, T> newPathToPartitionInfo = null;
-      if (cacheMap != null) {
-        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
-      }
-
-      if (newPathToPartitionInfo == null) { // still null
-        newPathToPartitionInfo = populateNewT(pathToPartitionInfo);
-
-        if (cacheMap != null) {
-          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
-        }
-      }
+      Map<Path, T> newPathToPartitionInfo = populateNewT(pathToPartitionInfo, cacheMap);
       part = getFromPath(newPathToPartitionInfo, dir);
     }
     if (part != null || ifPresent) {
@@ -401,13 +390,21 @@ public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Pat
     }
   }
 
-  private static <T> Map<Path, T> populateNewT(Map<Path, T> pathToPartitionInfo) {
-    Map<Path, T> newPathToPartitionInfo = new HashMap<>();
-    for (Map.Entry<Path, T> entry: pathToPartitionInfo.entrySet()) {
-      T partDesc = entry.getValue();
-      Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
+  private static <T> Map<Path, T> populateNewT(Map<Path, T> pathToPartitionInfo,
+      Map<Path, Path> cacheMap) {
+    Map<Path, T> newPathToPartitionInfo = new HashMap<>(pathToPartitionInfo.size());
+
+    pathToPartitionInfo.forEach((originalPath, partDesc) -> {
+      Path pathOnly = cacheMap != null ?
+          cacheMap.get(originalPath) : null;
+      if (pathOnly == null) {
+        pathOnly = Path.getPathWithoutSchemeAndAuthority(originalPath);
+        if (cacheMap != null) {
+          cacheMap.put(originalPath, pathOnly);
+        }
+      }
       newPathToPartitionInfo.put(pathOnly, partDesc);
-    }
+    });
     return newPathToPartitionInfo;
   }
 
-- 
2.48.0
````

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org
For additional commands, e-mail: gitbox-h...@hive.apache.org
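[Editor's illustration, not part of the original thread or the patch above.] The patch replaces the old `Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>>` cache with a bounded Caffeine `Cache<Path, Path>` that memoizes `Path.getPathWithoutSchemeAndAuthority` per path. Below is a minimal, self-contained sketch of that pattern; the class name, the `normalize` helper, and the `main` demo are illustrative assumptions, and the compute-if-absent form is simply the most compact way to show the idea (the patch itself uses explicit get/put because its `cacheMap` may be null).

```java
import java.util.concurrent.TimeUnit;

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import org.apache.hadoop.fs.Path;

// Hypothetical standalone sketch of the path-normalization cache used in the patch.
public class PathNormalizationCacheSketch {

  // Bounded, time-expiring cache of original path -> scheme/authority-stripped path.
  // Size and expiry mirror the values in the patch but are otherwise arbitrary.
  private static final Cache<Path, Path> CACHE = Caffeine.newBuilder()
      .maximumSize(1_000_000)
      .expireAfterWrite(1, TimeUnit.DAYS)
      .build();

  // Look up the normalized path, computing and caching it on a miss.
  static Path normalize(Path original) {
    return CACHE.get(original, Path::getPathWithoutSchemeAndAuthority);
  }

  public static void main(String[] args) {
    Path withScheme = new Path("hdfs://namenode:8020/warehouse/tbl/part=1");
    System.out.println(normalize(withScheme));  // prints /warehouse/tbl/part=1
    System.out.println(normalize(withScheme));  // second call is served from the cache
    System.out.println(CACHE.estimatedSize());  // approximate number of cached entries
  }
}
```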