This is an automated email from the ASF dual-hosted git repository. wchevreuil pushed a commit to branch HBASE-28463 in repository https://gitbox.apache.org/repos/asf/hbase.git
commit 84fc9acc2248d000bee6bada3925fa5800a00184 Author: vinayak hegde <vinayakph...@gmail.com> AuthorDate: Mon Apr 22 15:23:30 2024 +0530 HBASE-28466 Integration of time-based priority logic of bucket cache in prefetch functionality of HBase (#5808) Signed-off-by: Wellington Chevreuil <wchevre...@apache.org> --- .../apache/hadoop/hbase/io/hfile/BlockCache.java | 6 +- .../hadoop/hbase/io/hfile/CombinedBlockCache.java | 7 +- .../apache/hadoop/hbase/io/hfile/HFileInfo.java | 6 ++ .../hadoop/hbase/io/hfile/HFilePreadReader.java | 6 +- .../hadoop/hbase/io/hfile/bucket/BucketCache.java | 15 +++- .../hbase/regionserver/DataTieringManager.java | 91 +++++++++++++++++----- .../hbase/regionserver/TestDataTieringManager.java | 58 ++++++++++---- 7 files changed, 145 insertions(+), 44 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java index bed0194b1fa..ac83af1053a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.io.hfile; import java.util.Iterator; import java.util.Map; import java.util.Optional; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.util.Pair; import org.apache.yetus.audience.InterfaceAudience; @@ -197,11 +198,12 @@ public interface BlockCache extends Iterable<CachedBlock> { * overridden by all implementing classes. In such cases, the returned Optional will be empty. For * subclasses implementing this logic, the returned Optional would contain the boolean value * reflecting if the passed file should indeed be cached. - * @param fileName to check if it should be cached. + * @param hFileInfo Information about the file to check if it should be cached. + * @param conf The configuration object to use for determining caching behavior. * @return empty optional if this method is not supported, otherwise the returned optional * contains the boolean value informing if the file should be cached. */ - default Optional<Boolean> shouldCacheFile(String fileName) { + default Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) { return Optional.empty(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java index d6692d2e2bf..b12510cdccd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java @@ -22,6 +22,7 @@ import java.util.Map; import java.util.Optional; import org.apache.commons.lang3.mutable.Mutable; import org.apache.commons.lang3.mutable.MutableBoolean; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.HeapSize; import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; @@ -482,9 +483,9 @@ public class CombinedBlockCache implements ResizableBlockCache, HeapSize { } @Override - public Optional<Boolean> shouldCacheFile(String fileName) { - Optional<Boolean> l1Result = l1Cache.shouldCacheFile(fileName); - Optional<Boolean> l2Result = l2Cache.shouldCacheFile(fileName); + public Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) { + Optional<Boolean> l1Result = l1Cache.shouldCacheFile(hFileInfo, conf); + Optional<Boolean> l2Result = l2Cache.shouldCacheFile(hFileInfo, conf); final Mutable<Boolean> combinedResult = new MutableBoolean(true); l1Result.ifPresent(b -> combinedResult.setValue(b && combinedResult.getValue())); l2Result.ifPresent(b -> combinedResult.setValue(b && combinedResult.getValue())); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java index 31e637a0099..e89f86e7c4e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileInfo.java @@ -122,6 +122,7 @@ public class HFileInfo implements SortedMap<byte[], byte[]> { private FixedFileTrailer trailer; private HFileContext hfileContext; + private boolean initialized = false; public HFileInfo() { super(); @@ -363,6 +364,10 @@ public class HFileInfo implements SortedMap<byte[], byte[]> { * should be called after initTrailerAndContext */ public void initMetaAndIndex(HFile.Reader reader) throws IOException { + if (initialized) { + return; + } + ReaderContext context = reader.getContext(); try { HFileBlock.FSReader blockReader = reader.getUncachedBlockReader(); @@ -400,6 +405,7 @@ public class HFileInfo implements SortedMap<byte[], byte[]> { throw new CorruptHFileException( "Problem reading data index and meta index from file " + context.getFilePath(), t); } + initialized = true; } private HFileContext createHFileContext(Path path, FixedFileTrailer trailer, Configuration conf) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java index 6063ffe6889..7631dc78c3a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java @@ -37,10 +37,14 @@ public class HFilePreadReader extends HFileReaderImpl { public HFilePreadReader(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf, Configuration conf) throws IOException { super(context, fileInfo, cacheConf, conf); + final MutableBoolean shouldCache = new MutableBoolean(true); + // Initialize HFileInfo object with metadata for caching decisions + fileInfo.initMetaAndIndex(this); + cacheConf.getBlockCache().ifPresent(cache -> { - Optional<Boolean> result = cache.shouldCacheFile(path.getName()); + Optional<Boolean> result = cache.shouldCacheFile(fileInfo, conf); shouldCache.setValue(result.isPresent() ? result.get().booleanValue() : true); }); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java index 71bfc757e51..622a57f91c2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java @@ -73,9 +73,11 @@ import org.apache.hadoop.hbase.io.hfile.CachedBlock; import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache; import org.apache.hadoop.hbase.io.hfile.HFileBlock; import org.apache.hadoop.hbase.io.hfile.HFileContext; +import org.apache.hadoop.hbase.io.hfile.HFileInfo; import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hadoop.hbase.nio.RefCnt; import org.apache.hadoop.hbase.protobuf.ProtobufMagic; +import org.apache.hadoop.hbase.regionserver.DataTieringManager; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.IdReadWriteLock; @@ -2150,7 +2152,18 @@ public class BucketCache implements BlockCache, HeapSize { } @Override - public Optional<Boolean> shouldCacheFile(String fileName) { + public Optional<Boolean> shouldCacheFile(HFileInfo hFileInfo, Configuration conf) { + String fileName = hFileInfo.getHFileContext().getHFileName(); + try { + DataTieringManager dataTieringManager = DataTieringManager.getInstance(); + if (!dataTieringManager.isHotData(hFileInfo, conf)) { + LOG.debug("Data tiering is enabled for file: '{}' and it is not hot data", fileName); + return Optional.of(false); + } + } catch (IllegalStateException e) { + LOG.error("Error while getting DataTieringManager instance: {}", e.getMessage()); + } + // if we don't have the file in fullyCachedFiles, we should cache it return Optional.of(!fullyCachedFiles.containsKey(fileName)); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java index 0bc04ddc428..dec96604774 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DataTieringManager.java @@ -17,6 +17,9 @@ */ package org.apache.hadoop.hbase.regionserver; +import static org.apache.hadoop.hbase.regionserver.HStoreFile.TIMERANGE_KEY; + +import java.io.IOException; import java.util.HashSet; import java.util.Map; import java.util.OptionalLong; @@ -24,6 +27,7 @@ import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.hfile.BlockCacheKey; +import org.apache.hadoop.hbase.io.hfile.HFileInfo; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.yetus.audience.InterfaceAudience; @@ -106,11 +110,13 @@ public class DataTieringManager { } /** - * Determines whether the data associated with the given block cache key is considered hot. + * Determines whether the data associated with the given block cache key is considered hot. If the + * data tiering type is set to {@link DataTieringType#TIME_RANGE} and maximum timestamp is not + * present, it considers {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by + * default. * @param key the block cache key * @return {@code true} if the data is hot, {@code false} otherwise - * @throws DataTieringException if there is an error retrieving data tiering information or the - * HFile maximum timestamp + * @throws DataTieringException if there is an error retrieving data tiering information */ public boolean isHotData(BlockCacheKey key) throws DataTieringException { Path hFilePath = key.getFilePath(); @@ -122,37 +128,82 @@ public class DataTieringManager { /** * Determines whether the data in the HFile at the given path is considered hot based on the - * configured data tiering type and hot data age. + * configured data tiering type and hot data age. If the data tiering type is set to + * {@link DataTieringType#TIME_RANGE} and maximum timestamp is not present, it considers + * {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by default. * @param hFilePath the path to the HFile * @return {@code true} if the data is hot, {@code false} otherwise - * @throws DataTieringException if there is an error retrieving data tiering information or the - * HFile maximum timestamp + * @throws DataTieringException if there is an error retrieving data tiering information */ public boolean isHotData(Path hFilePath) throws DataTieringException { Configuration configuration = getConfiguration(hFilePath); DataTieringType dataTieringType = getDataTieringType(configuration); if (dataTieringType.equals(DataTieringType.TIME_RANGE)) { - long hotDataAge = getDataTieringHotDataAge(configuration); - - HStoreFile hStoreFile = getHStoreFile(hFilePath); - if (hStoreFile == null) { - LOG.error("HStoreFile corresponding to " + hFilePath + " doesn't exist"); - return false; - } - OptionalLong maxTimestamp = hStoreFile.getMaximumTimestamp(); - if (!maxTimestamp.isPresent()) { - throw new DataTieringException("Maximum timestamp not present for " + hFilePath); - } + return hotDataValidator(getMaxTimestamp(hFilePath), getDataTieringHotDataAge(configuration)); + } + // DataTieringType.NONE or other types are considered hot by default + return true; + } - long currentTimestamp = EnvironmentEdgeManager.getDelegate().currentTime(); - long diff = currentTimestamp - maxTimestamp.getAsLong(); - return diff <= hotDataAge; + /** + * Determines whether the data in the HFile being read is considered hot based on the configured + * data tiering type and hot data age. If the data tiering type is set to + * {@link DataTieringType#TIME_RANGE} and maximum timestamp is not present, it considers + * {@code Long.MAX_VALUE} as the maximum timestamp, making the data hot by default. + * @param hFileInfo Information about the HFile to determine if its data is hot. + * @param configuration The configuration object to use for determining hot data criteria. + * @return {@code true} if the data is hot, {@code false} otherwise + */ + public boolean isHotData(HFileInfo hFileInfo, Configuration configuration) { + DataTieringType dataTieringType = getDataTieringType(configuration); + if (dataTieringType.equals(DataTieringType.TIME_RANGE)) { + return hotDataValidator(getMaxTimestamp(hFileInfo), getDataTieringHotDataAge(configuration)); } // DataTieringType.NONE or other types are considered hot by default return true; } + private boolean hotDataValidator(long maxTimestamp, long hotDataAge) { + long currentTimestamp = getCurrentTimestamp(); + long diff = currentTimestamp - maxTimestamp; + return diff <= hotDataAge; + } + + private long getMaxTimestamp(Path hFilePath) throws DataTieringException { + HStoreFile hStoreFile = getHStoreFile(hFilePath); + if (hStoreFile == null) { + LOG.error("HStoreFile corresponding to " + hFilePath + " doesn't exist"); + return Long.MAX_VALUE; + } + OptionalLong maxTimestamp = hStoreFile.getMaximumTimestamp(); + if (!maxTimestamp.isPresent()) { + LOG.error("Maximum timestamp not present for " + hFilePath); + return Long.MAX_VALUE; + } + return maxTimestamp.getAsLong(); + } + + private long getMaxTimestamp(HFileInfo hFileInfo) { + try { + byte[] hFileTimeRange = hFileInfo.get(TIMERANGE_KEY); + if (hFileTimeRange == null) { + LOG.error("Timestamp information not found for file: {}", + hFileInfo.getHFileContext().getHFileName()); + return Long.MAX_VALUE; + } + return TimeRangeTracker.parseFrom(hFileTimeRange).getMax(); + } catch (IOException e) { + LOG.error("Error occurred while reading the timestamp metadata of file: {}", + hFileInfo.getHFileContext().getHFileName(), e); + return Long.MAX_VALUE; + } + } + + private long getCurrentTimestamp() { + return EnvironmentEdgeManager.getDelegate().currentTime(); + } + /** * Returns a set of cold data filenames from the given set of cached blocks. Cold data is * determined by the configured data tiering type and hot data age. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java index afb5862a8a4..6beed9943b3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataTieringManager.java @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hbase.regionserver; +import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_SIZE_KEY; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; @@ -26,14 +28,17 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; import org.apache.hadoop.hbase.client.RegionInfo; @@ -51,6 +56,7 @@ import org.apache.hadoop.hbase.testclassification.RegionServerTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.Pair; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -85,22 +91,26 @@ public class TestDataTieringManager { private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); private static Configuration defaultConf; private static FileSystem fs; + private static BlockCache blockCache; private static CacheConfig cacheConf; private static Path testDir; - private static Map<String, HRegion> testOnlineRegions; + private static final Map<String, HRegion> testOnlineRegions = new HashMap<>(); private static DataTieringManager dataTieringManager; - private static List<HStoreFile> hStoreFiles; + private static final List<HStoreFile> hStoreFiles = new ArrayList<>(); @BeforeClass public static void setupBeforeClass() throws Exception { testDir = TEST_UTIL.getDataTestDir(TestDataTieringManager.class.getSimpleName()); defaultConf = TEST_UTIL.getConfiguration(); + defaultConf.setBoolean(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, true); + defaultConf.setStrings(HConstants.BUCKET_CACHE_IOENGINE_KEY, "offheap"); + defaultConf.setLong(BUCKET_CACHE_SIZE_KEY, 32); fs = HFileSystem.get(defaultConf); - BlockCache blockCache = BlockCacheFactory.createBlockCache(defaultConf); + blockCache = BlockCacheFactory.createBlockCache(defaultConf); cacheConf = new CacheConfig(defaultConf, blockCache); - setupOnlineRegions(); DataTieringManager.instantiate(testOnlineRegions); + setupOnlineRegions(); dataTieringManager = DataTieringManager.getInstance(); } @@ -189,7 +199,24 @@ public class TestDataTieringManager { // Test with a filename where corresponding HStoreFile in not present hFilePath = new Path(hStoreFiles.get(0).getPath().getParent(), "incorrectFileName"); - testDataTieringMethodWithPathNoException(methodCallerWithPath, hFilePath, false); + testDataTieringMethodWithPathNoException(methodCallerWithPath, hFilePath, true); + } + + @Test + public void testPrefetchWhenDataTieringEnabled() throws IOException { + // Evict blocks from cache by closing the files and passing evict on close. + // Then initialize the reader again. Since Prefetch on open is set to true, it should prefetch + // those blocks. + for (HStoreFile file : hStoreFiles) { + file.closeStoreFile(true); + file.initReader(); + } + + // Since we have one cold file among four files, only three should get prefetched. + Optional<Map<String, Pair<String, Long>>> fullyCachedFiles = blockCache.getFullyCachedFiles(); + assertTrue("We should get the fully cached files from the cache", fullyCachedFiles.isPresent()); + Waiter.waitFor(defaultConf, 60000, () -> fullyCachedFiles.get().size() == 3); + assertEquals("Number of fully cached files are incorrect", 3, fullyCachedFiles.get().size()); } @Test @@ -271,21 +298,18 @@ public class TestDataTieringManager { } private static void setupOnlineRegions() throws IOException { - testOnlineRegions = new HashMap<>(); - hStoreFiles = new ArrayList<>(); - long day = 24 * 60 * 60 * 1000; long currentTime = System.currentTimeMillis(); HRegion region1 = createHRegion("table1"); HStore hStore11 = createHStore(region1, "cf1", getConfWithTimeRangeDataTieringEnabled(day)); - hStoreFiles - .add(createHStoreFile(hStore11.getStoreContext().getFamilyStoreDirectoryPath(), currentTime)); + hStoreFiles.add(createHStoreFile(hStore11.getStoreContext().getFamilyStoreDirectoryPath(), + hStore11.getReadOnlyConfiguration(), currentTime)); hStore11.refreshStoreFiles(); HStore hStore12 = createHStore(region1, "cf2"); hStoreFiles.add(createHStoreFile(hStore12.getStoreContext().getFamilyStoreDirectoryPath(), - currentTime - day)); + hStore12.getReadOnlyConfiguration(), currentTime - day)); hStore12.refreshStoreFiles(); region1.stores.put(Bytes.toBytes("cf1"), hStore11); @@ -296,11 +320,11 @@ public class TestDataTieringManager { HStore hStore21 = createHStore(region2, "cf1"); hStoreFiles.add(createHStoreFile(hStore21.getStoreContext().getFamilyStoreDirectoryPath(), - currentTime - 2 * day)); + hStore21.getReadOnlyConfiguration(), currentTime - 2 * day)); hStore21.refreshStoreFiles(); HStore hStore22 = createHStore(region2, "cf2"); hStoreFiles.add(createHStoreFile(hStore22.getStoreContext().getFamilyStoreDirectoryPath(), - currentTime - 3 * day)); + hStore22.getReadOnlyConfiguration(), currentTime - 3 * day)); hStore22.refreshStoreFiles(); region2.stores.put(Bytes.toBytes("cf1"), hStore21); @@ -359,17 +383,17 @@ public class TestDataTieringManager { return conf; } - private static HStoreFile createHStoreFile(Path storeDir, long timestamp) throws IOException { + private static HStoreFile createHStoreFile(Path storeDir, Configuration conf, long timestamp) + throws IOException { String columnFamily = storeDir.getName(); - StoreFileWriter storeFileWriter = new StoreFileWriter.Builder(defaultConf, cacheConf, fs) + StoreFileWriter storeFileWriter = new StoreFileWriter.Builder(conf, cacheConf, fs) .withOutputDir(storeDir).withFileContext(new HFileContextBuilder().build()).build(); writeStoreFileRandomData(storeFileWriter, Bytes.toBytes(columnFamily), Bytes.toBytes("random"), timestamp); - return new HStoreFile(fs, storeFileWriter.getPath(), defaultConf, cacheConf, BloomType.NONE, - true); + return new HStoreFile(fs, storeFileWriter.getPath(), conf, cacheConf, BloomType.NONE, true); } private static void writeStoreFileRandomData(final StoreFileWriter writer, byte[] columnFamily,