[ https://issues.apache.org/jira/browse/HBASE-29299?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Eungsop Yoo updated HBASE-29299:
--------------------------------
    Affects Version/s: 2.5.11
                       2.4.18

> Reopen initialReader of HStoreFile to refresh metadata when read failed
> -----------------------------------------------------------------------
>
>                 Key: HBASE-29299
>                 URL: https://issues.apache.org/jira/browse/HBASE-29299
>             Project: HBase
>          Issue Type: Bug
>    Affects Versions: 2.4.18, 2.5.11
>            Reporter: Eungsop Yoo
>            Assignee: Eungsop Yoo
>            Priority: Major
>
> I discovered an issue while testing Erasure Coding. If more DataNodes go down
> than the number of parity stripes, the Scan naturally fails. However, even
> after restarting the downed DataNodes, the Scan continues to fail. This issue
> does not occur every time, but it happens with high probability. The root
> cause is that the initialReader inside the HStoreFile holds an HDFS metadata
> cache, which does not get refreshed. Therefore, I modified the logic to close
> the initialReader and reopen it when an exception occurs.
> Here is the log captured when the scan fails:
> {code}
> org.apache.hadoop.hbase.client.RetriesExhaustedException: Failed after attempts=8, exceptions:
> 2025-05-07T08:17:57.123Z, RpcRetryingCaller{globalStartTime=2025-05-07T08:17:57.084Z, pause=100, maxAttempts=8}, java.io.IOException: java.io.IOException: Could not seek StoreFileScanner[HFileScanner for reader reader=hdfs://hbase-alpha25/hbase/data/default/test1/8a9fd0285a94ed3a8a16f595842e17fa/c/0ca5ca4cd7d14fe993d19e4632b2fb52, compression=none, cacheConf=cacheDataOnRead=true, cacheDataOnWrite=false, cacheIndexesOnWrite=false, cacheBloomsOnWrite=false, cacheEvictOnClose=false, cacheDataCompressed=false, prefetchOnOpen=false, firstKey=Optional[user00000000000000000000000000000000256006064453599002/c:field0/1745905948161/Put/seqid=0], lastKey=Optional[user00000000000000000000000000000000511999723045682420/c:field3/1745905845638/Put/seqid=0], avgKeyLen=73, avgValueLen=30, entries=134592, length=15040759, cur=null] to key org.apache.hadoop.hbase.PrivateCellUtil$FirstOnRowDeleteFamilyCell@1e25b769
>   at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:232)
>   at org.apache.hadoop.hbase.regionserver.StoreScanner.seekScanners(StoreScanner.java:416)
>   at org.apache.hadoop.hbase.regionserver.StoreScanner.<init>(StoreScanner.java:260)
>   at org.apache.hadoop.hbase.regionserver.HStore.createScanner(HStore.java:1712)
>   at org.apache.hadoop.hbase.regionserver.HStore.getScanner(HStore.java:1703)
>   at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.initializeScanners(RegionScannerImpl.java:166)
>   at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.<init>(RegionScannerImpl.java:146)
>   at org.apache.hadoop.hbase.regionserver.HRegion.instantiateRegionScanner(HRegion.java:3019)
>   at org.apache.hadoop.hbase.regionserver.HRegion.lambda$getScanner$3(HRegion.java:3004)
>   at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216)
>   at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2990)
>   at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2985)
>   at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2979)
>   at org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3203)
>   at org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3580)
>   at org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:45006)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:415)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
>   at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:102)
>   at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:82)
> Caused by: java.io.IOException: Encountered an exception when invoking ByteBuffer positioned read when trying to read 0 bytes from position 0
>   at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:368)
>   at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtra(BlockIOUtils.java:311)
>   at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readAtOffset(HFileBlock.java:1481)
>   at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockDataInternal(HFileBlock.java:1719)
>   at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockData(HFileBlock.java:1519)
>   at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1331)
>   at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1252)
>   at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.readAndUpdateNewBlock(HFileReaderImpl.java:943)
>   at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.seekTo(HFileReaderImpl.java:932)
>   at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seekAtOrAfter(StoreFileScanner.java:311)
>   at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:214)
>   ... 19 more
> Caused by: java.lang.reflect.InvocationTargetException
>   at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118)
>   at java.base/java.lang.reflect.Method.invoke(Method.java:580)
>   at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:363)
>   ... 29 more
> Caused by: java.io.IOException: 3 missing blocks, the stripe is: AlignedStripe(Offset=0, length=33, fetchedChunksNum=0, missingChunksNum=3); locatedBlocks is: LocatedBlocks{; fileLength=15040759; underConstruction=false; blocks=[LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]}]; lastLocatedBlock=LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]}; isLastBlockComplete=true; ecPolicy=ErasureCodingPolicy=[Name=RS-3-2-1024k, Schema=[ECSchema=[Codec=rs, numDataUnits=3, numParityUnits=2]], CellSize=1048576, Id=2]}
>   at org.apache.hadoop.hdfs.StripeReader.checkMissingBlocks(StripeReader.java:180)
>   at org.apache.hadoop.hdfs.StripeReader.readDataForDecoding(StripeReader.java:198)
>   at org.apache.hadoop.hdfs.StripeReader.readStripe(StripeReader.java:344)
>   at org.apache.hadoop.hdfs.DFSStripedInputStream.fetchBlockByteRange(DFSStripedInputStream.java:506)
>   at org.apache.hadoop.hdfs.DFSInputStream.pread(DFSInputStream.java:1499)
>   at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:1708)
>   at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:259)
>   at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
>   ... 31 more
> {code}

--
This message was sent by Atlassian Jira
(v8.20.10#820010)
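The fix described in the issue amounts to: if a read through the cached initialReader fails, discard that reader, open a fresh one so the HDFS block locations are fetched again, and retry the read. The snippet below is only a minimal, self-contained sketch of that close-and-reopen pattern; StoreReader, ReaderFactory, and seekWithReopen are hypothetical names for illustration and are not the actual HBase classes or methods touched by the patch.

{code}
import java.io.IOException;

/**
 * Minimal sketch of "close and reopen the reader when a read fails".
 * StoreReader and ReaderFactory are hypothetical stand-ins, not HBase API;
 * the real change concerns the initialReader held by HStoreFile.
 */
public class ReopenOnFailureSketch {

  /** Stand-in for a store file reader that may hold stale HDFS block locations. */
  interface StoreReader extends AutoCloseable {
    void seekTo(byte[] key) throws IOException;
    @Override
    void close() throws IOException;
  }

  /** Stand-in for whatever opens the file and re-reads HDFS metadata. */
  interface ReaderFactory {
    StoreReader open() throws IOException;
  }

  private final ReaderFactory factory;
  private StoreReader reader;

  public ReopenOnFailureSketch(ReaderFactory factory) throws IOException {
    this.factory = factory;
    this.reader = factory.open();
  }

  /**
   * Seek with a single reopen-and-retry: if the cached reader fails (for example
   * because its block locations point at DataNodes that were down), close it,
   * open a fresh reader so the metadata is refreshed, and try once more.
   */
  public void seekWithReopen(byte[] key) throws IOException {
    try {
      reader.seekTo(key);
    } catch (IOException firstFailure) {
      try {
        reader.close();            // drop the reader holding stale metadata
      } catch (IOException ignored) {
        // best effort; the reader is being replaced anyway
      }
      reader = factory.open();     // reopen: block locations are fetched again
      try {
        reader.seekTo(key);        // retry once with the fresh reader
      } catch (IOException secondFailure) {
        secondFailure.addSuppressed(firstFailure);
        throw secondFailure;
      }
    }
  }
}
{code}

Retrying exactly once keeps the behaviour bounded: if the freshly opened reader still cannot reach enough DataNodes, the failure is surfaced to the caller just as before.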