This is an automated email from the ASF dual-hosted git repository.

wchevreuil pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git
commit 3a57d8b985c30a1672b04252498bc0ba819d43f6
Author: vinayak hegde <[email protected]>
AuthorDate: Mon Apr 22 15:23:30 2024 +0530

    HBASE-28466 Integration of time-based priority logic of bucket cache in prefetch functionality of HBase (#5808)

    Signed-off-by: Wellington Chevreuil <[email protected]>
---
 .../apache/hadoop/hbase/io/hfile/BlockCache.java   |  5 +-
 .../hadoop/hbase/io/hfile/CombinedBlockCache.java  |  6 +-
 .../apache/hadoop/hbase/io/hfile/HFileInfo.java    |  6 ++
 .../hadoop/hbase/io/hfile/HFilePreadReader.java    |  2 +
 .../hadoop/hbase/io/hfile/bucket/BucketCache.java  | 15 +++-
 .../hbase/regionserver/DataTieringManager.java     | 91 +++++++++++++++++-----
 .../hbase/regionserver/TestDataTieringManager.java | 64 +++++++++++----
 7 files changed, 146 insertions(+), 43 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java
index 43d1e77c7e6..d923e4c0780 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java
@@ -199,11 +199,12 @@ public interface BlockCache extends Iterable<CachedBlock>, ConfigurationObserver
    * overridden by all implementing classes. In such cases, the returned Optional will be empty. For
    * subclasses implementing this logic, the returned Optional would contain the boolean value
    * reflecting if the passed file should indeed be cached.
-   * @param fileName to check if it should be cached.
+   * @param hFileInfo Information about the file to check if it should be cached.
+   * @param conf      The configuration object to use for determining caching behavior.
    * @return empty optional if this method is not supported, otherwise the returned optional
    *         contains the boolean value informing if the file should be cached.
    */
-  default Optional<Boolean> shouldCacheFile(String fileName) {
+  default Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) {
     return Optional.empty();
   }
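For context, a minimal sketch of how an implementor might override the revised hook; the class name and the naming-convention policy below are hypothetical, not part of this commit:

import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.io.hfile.HFileInfo;

// Hypothetical cache-side override: with HFileInfo in hand, a policy can
// inspect file metadata rather than just the file name. The filtering rule
// here is illustrative only.
class MetadataAwareCachePolicy {
  Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) {
    String fileName = hFileInfo.getHFileContext().getHFileName();
    return Optional.of(!fileName.startsWith("tmp_"));
  }
}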
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java
index 7e503f99f3f..9f173477108 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java
@@ -487,9 +487,9 @@ public class CombinedBlockCache implements ResizableBlockCache, HeapSize {
   }
 
   @Override
-  public Optional<Boolean> shouldCacheFile(String fileName) {
-    Optional<Boolean> l1Result = l1Cache.shouldCacheFile(fileName);
-    Optional<Boolean> l2Result = l2Cache.shouldCacheFile(fileName);
+  public Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) {
+    Optional<Boolean> l1Result = l1Cache.shouldCacheFile(hFileInfo, conf);
+    Optional<Boolean> l2Result = l2Cache.shouldCacheFile(hFileInfo, conf);
     final Mutable<Boolean> combinedResult = new MutableBoolean(true);
     l1Result.ifPresent(b -> combinedResult.setValue(b && combinedResult.getValue()));
     l2Result.ifPresent(b -> combinedResult.setValue(b && combinedResult.getValue()));
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java
index 0c3b7890f6e..2386e8d82a5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java
@@ -123,6 +123,7 @@ public class HFileInfo implements SortedMap<byte[], byte[]> {
 
   private FixedFileTrailer trailer;
   private HFileContext hfileContext;
+  private boolean initialized = false;
 
   public HFileInfo() {
     super();
@@ -364,6 +365,10 @@ public class HFileInfo implements SortedMap<byte[], byte[]> {
    * should be called after initTrailerAndContext
    */
   public void initMetaAndIndex(HFile.Reader reader) throws IOException {
+    if (initialized) {
+      return;
+    }
+
     ReaderContext context = reader.getContext();
     try {
       HFileBlock.FSReader blockReader = reader.getUncachedBlockReader();
@@ -401,6 +406,7 @@ public class HFileInfo implements SortedMap<byte[], byte[]> {
       throw new CorruptHFileException(
         "Problem reading data index and meta index from file " + context.getFilePath(), t);
     }
+    initialized = true;
   }
 
   private HFileContext createHFileContext(Path path, FixedFileTrailer trailer, Configuration conf)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java
index 996d3a85e22..147e2598ef9 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java
@@ -37,6 +37,8 @@ public class HFilePreadReader extends HFileReaderImpl {
   public HFilePreadReader(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf,
     Configuration conf) throws IOException {
     super(context, fileInfo, cacheConf, conf);
+    // Initialize HFileInfo object with metadata for caching decisions
+    fileInfo.initMetaAndIndex(this);
     // master hosted regions, like the master procedures store wouldn't have a block cache
     // Prefetch file blocks upon open if requested
     if (cacheConf.getBlockCache().isPresent() && cacheConf.shouldPrefetchOnOpen()) {
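The new initialized guard makes initMetaAndIndex() safe to call twice, since HFilePreadReader now invokes it eagerly before the regular open path does. A stripped-down sketch of the pattern, assuming a single-threaded open path as in this code; the Loader type below is a hypothetical stand-in for the real meta/index loading work:

import java.io.IOException;

// Idempotent initialization: the eager call and the later call from the
// normal open path both land here, but the work runs only once.
class IdempotentInit {
  interface Loader {
    void load() throws IOException;
  }

  private boolean initialized = false;

  void initOnce(Loader loader) throws IOException {
    if (initialized) {
      return; // second call becomes a no-op
    }
    loader.load();
    initialized = true; // set only on success, so a failed init can retry
  }
}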
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java
index 7802d868bea..63ae163dee5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java
@@ -79,9 +79,11 @@ import org.apache.hadoop.hbase.io.hfile.CachedBlock;
 import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache;
 import org.apache.hadoop.hbase.io.hfile.HFileBlock;
 import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileInfo;
 import org.apache.hadoop.hbase.nio.ByteBuff;
 import org.apache.hadoop.hbase.nio.RefCnt;
 import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
+import org.apache.hadoop.hbase.regionserver.DataTieringManager;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -2410,7 +2412,18 @@ public class BucketCache implements BlockCache, HeapSize {
   }
 
   @Override
-  public Optional<Boolean> shouldCacheFile(String fileName) {
+  public Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) {
+    String fileName = hFileInfo.getHFileContext().getHFileName();
+    try {
+      DataTieringManager dataTieringManager = DataTieringManager.getInstance();
+      if (!dataTieringManager.isHotData(hFileInfo, conf)) {
+        LOG.debug("Data tiering is enabled for file: '{}' and it is not hot data", fileName);
+        return Optional.of(false);
+      }
+    } catch (IllegalStateException e) {
+      LOG.error("Error while getting DataTieringManager instance: {}", e.getMessage());
+    }
+    // if we don't have the file in fullyCachedFiles, we should cache it
     return Optional.of(!fullyCachedFiles.containsKey(fileName));
  }
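On the caller side, the prefetch-on-open path can now consult the cache with the file's metadata. An illustrative gate, not code from this commit; an empty Optional from the hook means "no opinion" and prefetch proceeds:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFileInfo;

// Prefetch only when the cache, if present, does not veto the file.
class PrefetchGate {
  static boolean shouldPrefetch(CacheConfig cacheConf, HFileInfo fileInfo, Configuration conf) {
    return cacheConf.getBlockCache()
      .flatMap(cache -> cache.shouldCacheFile(fileInfo, conf))
      .orElse(true);
  }
}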
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java
index 0bc04ddc428..dec96604774 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java
@@ -17,6 +17,9 @@
  */
 package org.apache.hadoop.hbase.regionserver;
 
+import static org.apache.hadoop.hbase.regionserver.HStoreFile.TIMERANGE_KEY;
+
+import java.io.IOException;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.OptionalLong;
@@ -24,6 +27,7 @@ import java.util.Set;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.io.hfile.BlockCacheKey;
+import org.apache.hadoop.hbase.io.hfile.HFileInfo;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.yetus.audience.InterfaceAudience;
@@ -106,11 +110,13 @@ public class DataTieringManager {
   }
 
   /**
-   * Determines whether the data associated with the given block cache key is considered hot.
+   * Determines whether the data associated with the given block cache key is considered hot. If the
+   * data tiering type is set to {@link DataTieringType#TIME_RANGE} and maximum timestamp is not
+   * present, it considers {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by
+   * default.
    * @param key the block cache key
    * @return {@code true} if the data is hot, {@code false} otherwise
-   * @throws DataTieringException if there is an error retrieving data tiering information or the
-   *                              HFile maximum timestamp
+   * @throws DataTieringException if there is an error retrieving data tiering information
    */
   public boolean isHotData(BlockCacheKey key) throws DataTieringException {
     Path hFilePath = key.getFilePath();
@@ -122,37 +128,82 @@ public class DataTieringManager {
 
   /**
    * Determines whether the data in the HFile at the given path is considered hot based on the
-   * configured data tiering type and hot data age.
+   * configured data tiering type and hot data age. If the data tiering type is set to
+   * {@link DataTieringType#TIME_RANGE} and maximum timestamp is not present, it considers
+   * {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by default.
    * @param hFilePath the path to the HFile
    * @return {@code true} if the data is hot, {@code false} otherwise
-   * @throws DataTieringException if there is an error retrieving data tiering information or the
-   *                              HFile maximum timestamp
+   * @throws DataTieringException if there is an error retrieving data tiering information
    */
   public boolean isHotData(Path hFilePath) throws DataTieringException {
     Configuration configuration = getConfiguration(hFilePath);
     DataTieringType dataTieringType = getDataTieringType(configuration);
 
     if (dataTieringType.equals(DataTieringType.TIME_RANGE)) {
-      long hotDataAge = getDataTieringHotDataAge(configuration);
-
-      HStoreFile hStoreFile = getHStoreFile(hFilePath);
-      if (hStoreFile == null) {
-        LOG.error("HStoreFile corresponding to " + hFilePath + " doesn't exist");
-        return false;
-      }
-      OptionalLong maxTimestamp = hStoreFile.getMaximumTimestamp();
-      if (!maxTimestamp.isPresent()) {
-        throw new DataTieringException("Maximum timestamp not present for " + hFilePath);
-      }
+      return hotDataValidator(getMaxTimestamp(hFilePath), getDataTieringHotDataAge(configuration));
+    }
+    // DataTieringType.NONE or other types are considered hot by default
+    return true;
+  }
 
-      long currentTimestamp = EnvironmentEdgeManager.getDelegate().currentTime();
-      long diff = currentTimestamp - maxTimestamp.getAsLong();
-      return diff <= hotDataAge;
+  /**
+   * Determines whether the data in the HFile being read is considered hot based on the configured
+   * data tiering type and hot data age. If the data tiering type is set to
+   * {@link DataTieringType#TIME_RANGE} and maximum timestamp is not present, it considers
+   * {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by default.
+   * @param hFileInfo     Information about the HFile to determine if its data is hot.
+   * @param configuration The configuration object to use for determining hot data criteria.
+   * @return {@code true} if the data is hot, {@code false} otherwise
+   */
+  public boolean isHotData(HFileInfo hFileInfo, Configuration configuration) {
+    DataTieringType dataTieringType = getDataTieringType(configuration);
+    if (dataTieringType.equals(DataTieringType.TIME_RANGE)) {
+      return hotDataValidator(getMaxTimestamp(hFileInfo), getDataTieringHotDataAge(configuration));
     }
     // DataTieringType.NONE or other types are considered hot by default
     return true;
   }
 
+  private boolean hotDataValidator(long maxTimestamp, long hotDataAge) {
+    long currentTimestamp = getCurrentTimestamp();
+    long diff = currentTimestamp - maxTimestamp;
+    return diff <= hotDataAge;
+  }
+
+  private long getMaxTimestamp(Path hFilePath) throws DataTieringException {
+    HStoreFile hStoreFile = getHStoreFile(hFilePath);
+    if (hStoreFile == null) {
+      LOG.error("HStoreFile corresponding to " + hFilePath + " doesn't exist");
+      return Long.MAX_VALUE;
+    }
+    OptionalLong maxTimestamp = hStoreFile.getMaximumTimestamp();
+    if (!maxTimestamp.isPresent()) {
+      LOG.error("Maximum timestamp not present for " + hFilePath);
+      return Long.MAX_VALUE;
+    }
+    return maxTimestamp.getAsLong();
+  }
+
+  private long getMaxTimestamp(HFileInfo hFileInfo) {
+    try {
+      byte[] hFileTimeRange = hFileInfo.get(TIMERANGE_KEY);
+      if (hFileTimeRange == null) {
+        LOG.error("Timestamp information not found for file: {}",
+          hFileInfo.getHFileContext().getHFileName());
+        return Long.MAX_VALUE;
+      }
+      return TimeRangeTracker.parseFrom(hFileTimeRange).getMax();
+    } catch (IOException e) {
+      LOG.error("Error occurred while reading the timestamp metadata of file: {}",
+        hFileInfo.getHFileContext().getHFileName(), e);
+      return Long.MAX_VALUE;
+    }
+  }
+
+  private long getCurrentTimestamp() {
+    return EnvironmentEdgeManager.getDelegate().currentTime();
+  }
+
   /**
    * Returns a set of cold data filenames from the given set of cached blocks. Cold data is
    * determined by the configured data tiering type and hot data age.
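A worked check of hotDataValidator()'s arithmetic, with illustrative numbers rather than code from the patch: data is hot while (now - maxTimestamp) <= hotDataAge, and the Long.MAX_VALUE fallback for missing metadata always yields a negative difference, i.e. hot.

public class HotDataMath {
  static boolean hot(long maxTimestamp, long hotDataAge, long now) {
    return now - maxTimestamp <= hotDataAge;
  }

  public static void main(String[] args) {
    long day = 24L * 60 * 60 * 1000;
    long now = System.currentTimeMillis();
    // 2-day-old data against a 1-day hot-data age is cold.
    System.out.println(hot(now - 2 * day, day, now)); // false
    // Missing metadata (Long.MAX_VALUE) stays hot: the difference is a large
    // negative long (no wraparound for realistic values of 'now').
    System.out.println(hot(Long.MAX_VALUE, day, now)); // true
  }
}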
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java
index afb5862a8a4..30420bcb779 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java
@@ -17,7 +17,9 @@
  */
 package org.apache.hadoop.hbase.regionserver;
 
+import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_SIZE_KEY;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
@@ -26,14 +28,17 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
 import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
 import org.apache.hadoop.hbase.client.RegionInfo;
@@ -47,10 +52,13 @@ import org.apache.hadoop.hbase.io.hfile.BlockCacheKey;
 import org.apache.hadoop.hbase.io.hfile.BlockType;
 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
 import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTracker;
+import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
 import org.apache.hadoop.hbase.testclassification.RegionServerTests;
 import org.apache.hadoop.hbase.testclassification.SmallTests;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hbase.util.Pair;
 import org.junit.BeforeClass;
 import org.junit.ClassRule;
 import org.junit.Test;
@@ -85,22 +93,26 @@ public class TestDataTieringManager {
   private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
   private static Configuration defaultConf;
   private static FileSystem fs;
+  private static BlockCache blockCache;
   private static CacheConfig cacheConf;
   private static Path testDir;
-  private static Map<String, HRegion> testOnlineRegions;
+  private static final Map<String, HRegion> testOnlineRegions = new HashMap<>();
 
   private static DataTieringManager dataTieringManager;
-  private static List<HStoreFile> hStoreFiles;
+  private static final List<HStoreFile> hStoreFiles = new ArrayList<>();
 
   @BeforeClass
   public static void setupBeforeClass() throws Exception {
     testDir = TEST_UTIL.getDataTestDir(TestDataTieringManager.class.getSimpleName());
     defaultConf = TEST_UTIL.getConfiguration();
+    defaultConf.setBoolean(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, true);
+    defaultConf.setStrings(HConstants.BUCKET_CACHE_IOENGINE_KEY, "offheap");
+    defaultConf.setLong(BUCKET_CACHE_SIZE_KEY, 32);
     fs = HFileSystem.get(defaultConf);
-    BlockCache blockCache = BlockCacheFactory.createBlockCache(defaultConf);
+    blockCache = BlockCacheFactory.createBlockCache(defaultConf);
     cacheConf = new CacheConfig(defaultConf, blockCache);
-    setupOnlineRegions();
     DataTieringManager.instantiate(testOnlineRegions);
+    setupOnlineRegions();
     dataTieringManager = DataTieringManager.getInstance();
   }
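The setup now stands up an off-heap bucket cache so prefetch decisions are exercised end to end. The same wiring in isolation might look like this, a sketch only, with an illustrative cache size:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.BlockCacheFactory;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;

// Combined cache with a 32 MB off-heap bucket cache and prefetch-on-open,
// mirroring what setupBeforeClass() configures.
class TestCacheWiring {
  static BlockCache build() {
    Configuration conf = HBaseConfiguration.create();
    conf.setBoolean(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, true);
    conf.set(HConstants.BUCKET_CACHE_IOENGINE_KEY, "offheap");
    conf.setLong(HConstants.BUCKET_CACHE_SIZE_KEY, 32); // interpreted as MB
    return BlockCacheFactory.createBlockCache(conf);
  }
}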
@@ -189,7 +201,24 @@
 
     // Test with a filename where corresponding HStoreFile in not present
     hFilePath = new Path(hStoreFiles.get(0).getPath().getParent(), "incorrectFileName");
-    testDataTieringMethodWithPathNoException(methodCallerWithPath, hFilePath, false);
+    testDataTieringMethodWithPathNoException(methodCallerWithPath, hFilePath, true);
+  }
+
+  @Test
+  public void testPrefetchWhenDataTieringEnabled() throws IOException {
+    // Evict blocks from cache by closing the files and passing evict on close.
+    // Then initialize the reader again. Since Prefetch on open is set to true, it should prefetch
+    // those blocks.
+    for (HStoreFile file : hStoreFiles) {
+      file.closeStoreFile(true);
+      file.initReader();
+    }
+
+    // Since we have one cold file among four files, only three should get prefetched.
+    Optional<Map<String, Pair<String, Long>>> fullyCachedFiles = blockCache.getFullyCachedFiles();
+    assertTrue("We should get the fully cached files from the cache", fullyCachedFiles.isPresent());
+    Waiter.waitFor(defaultConf, 60000, () -> fullyCachedFiles.get().size() == 3);
+    assertEquals("Number of fully cached files are incorrect", 3, fullyCachedFiles.get().size());
   }
 
   @Test
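The new test polls rather than asserting immediately, because prefetch runs on background threads. The pattern, extracted into a small sketch with an illustrative helper name:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.io.hfile.BlockCache;

// Waiter.waitFor() re-evaluates the predicate until it holds or the timeout
// elapses, which avoids racing the asynchronous prefetch-on-open threads.
class PrefetchAssertions {
  static void awaitFullyCached(Configuration conf, BlockCache cache, int expected) {
    Waiter.waitFor(conf, 60000,
      () -> cache.getFullyCachedFiles().map(Map::size).orElse(0) == expected);
  }
}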
@@ -271,21 +300,18 @@
   }
 
   private static void setupOnlineRegions() throws IOException {
-    testOnlineRegions = new HashMap<>();
-    hStoreFiles = new ArrayList<>();
-
     long day = 24 * 60 * 60 * 1000;
     long currentTime = System.currentTimeMillis();
 
     HRegion region1 = createHRegion("table1");
 
     HStore hStore11 = createHStore(region1, "cf1", getConfWithTimeRangeDataTieringEnabled(day));
-    hStoreFiles
-      .add(createHStoreFile(hStore11.getStoreContext().getFamilyStoreDirectoryPath(), currentTime));
+    hStoreFiles.add(createHStoreFile(hStore11.getStoreContext().getFamilyStoreDirectoryPath(),
+      hStore11.getReadOnlyConfiguration(), currentTime, region1.getRegionFileSystem()));
     hStore11.refreshStoreFiles();
 
     HStore hStore12 = createHStore(region1, "cf2");
     hStoreFiles.add(createHStoreFile(hStore12.getStoreContext().getFamilyStoreDirectoryPath(),
-      currentTime - day));
+      hStore12.getReadOnlyConfiguration(), currentTime - day, region1.getRegionFileSystem()));
     hStore12.refreshStoreFiles();
 
     region1.stores.put(Bytes.toBytes("cf1"), hStore11);
@@ -296,11 +322,11 @@
 
     HStore hStore21 = createHStore(region2, "cf1");
     hStoreFiles.add(createHStoreFile(hStore21.getStoreContext().getFamilyStoreDirectoryPath(),
-      currentTime - 2 * day));
+      hStore21.getReadOnlyConfiguration(), currentTime - 2 * day, region2.getRegionFileSystem()));
     hStore21.refreshStoreFiles();
 
     HStore hStore22 = createHStore(region2, "cf2");
     hStoreFiles.add(createHStoreFile(hStore22.getStoreContext().getFamilyStoreDirectoryPath(),
-      currentTime - 3 * day));
+      hStore22.getReadOnlyConfiguration(), currentTime - 3 * day, region2.getRegionFileSystem()));
     hStore22.refreshStoreFiles();
 
     region2.stores.put(Bytes.toBytes("cf1"), hStore21);
@@ -359,17 +385,21 @@
     return conf;
   }
 
-  private static HStoreFile createHStoreFile(Path storeDir, long timestamp) throws IOException {
+  private static HStoreFile createHStoreFile(Path storeDir, Configuration conf, long timestamp,
+    HRegionFileSystem regionFs) throws IOException {
     String columnFamily = storeDir.getName();
 
-    StoreFileWriter storeFileWriter = new StoreFileWriter.Builder(defaultConf, cacheConf, fs)
+    StoreFileWriter storeFileWriter = new StoreFileWriter.Builder(conf, cacheConf, fs)
       .withOutputDir(storeDir).withFileContext(new HFileContextBuilder().build()).build();
 
     writeStoreFileRandomData(storeFileWriter, Bytes.toBytes(columnFamily), Bytes.toBytes("random"),
       timestamp);
 
-    return new HStoreFile(fs, storeFileWriter.getPath(), defaultConf, cacheConf, BloomType.NONE,
-      true);
+    StoreContext storeContext =
+      StoreContext.getBuilder().withRegionFileSystem(regionFs).build();
+
+    StoreFileTracker sft = StoreFileTrackerFactory.create(conf, true, storeContext);
+    return new HStoreFile(fs, storeFileWriter.getPath(), conf, cacheConf, BloomType.NONE, true, sft);
   }
 
   private static void writeStoreFileRandomData(final StoreFileWriter writer, byte[] columnFamily,
