[ https://issues.apache.org/jira/browse/HBASE-29299?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Eungsop Yoo updated HBASE-29299:
--------------------------------
    Affects Version/s: 2.5.11
                       2.4.18

> Reopen initialReader of HStoreFile to refresh metadata when a read fails
> ------------------------------------------------------------------------
>
>                 Key: HBASE-29299
>                 URL: https://issues.apache.org/jira/browse/HBASE-29299
>             Project: HBase
>          Issue Type: Bug
>    Affects Versions: 2.4.18, 2.5.11
>            Reporter: Eungsop Yoo
>            Assignee: Eungsop Yoo
>            Priority: Major
>
> I discovered an issue while testing Erasure Coding. If more DataNodes go down 
> than the EC policy has parity units (here RS-3-2, so more than two), the Scan 
> fails, as expected. However, even after the downed DataNodes are restarted, 
> the Scan keeps failing. The issue does not occur every time, but it happens 
> with high probability. The root cause is that the initialReader inside the 
> HStoreFile holds cached HDFS block-location metadata (LocatedBlocks) that is 
> never refreshed, so reads keep targeting the DataNodes that went down. 
> Therefore, I modified the logic to close the initialReader and reopen it when 
> a read exception occurs, which forces the metadata to be re-fetched.
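> Conceptually, the fix amounts to something like the sketch below. This is a 
> minimal illustration built on HStoreFile's existing closeStoreFile/initReader/
> getReader methods; the class and helper names are made up, and the actual 
> patch wires this into the read-failure path rather than exposing a standalone 
> helper:
> {code}
> import java.io.IOException;
> import org.apache.hadoop.hbase.regionserver.HStoreFile;
> import org.apache.hadoop.hbase.regionserver.StoreFileReader;
>
> // Hypothetical helper, for illustration only (not the committed patch).
> final class ReopenSketch {
>   // When a positioned read fails, close and re-open the store file's
>   // initialReader so the underlying DFSInputStream re-fetches block
>   // locations from the NameNode instead of reusing stale LocatedBlocks.
>   static StoreFileReader reopenInitialReader(HStoreFile sf) throws IOException {
>     // Drop the current reader; evictOnClose=true also evicts its cached blocks.
>     sf.closeStoreFile(true);
>     // initReader() re-creates initialReader, re-opening the HFile stream.
>     sf.initReader();
>     return sf.getReader();
>   }
> }
> {code}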
> Here is the log captured when the scan fails:
> {code}
> org.apache.hadoop.hbase.client.RetriesExhaustedException: Failed after attempts=8, exceptions:
> 2025-05-07T08:17:57.123Z, RpcRetryingCaller{globalStartTime=2025-05-07T08:17:57.084Z, pause=100, maxAttempts=8}, java.io.IOException: java.io.IOException: Could not seek StoreFileScanner[HFileScanner for reader reader=hdfs://hbase-alpha25/hbase/data/default/test1/8a9fd0285a94ed3a8a16f595842e17fa/c/0ca5ca4cd7d14fe993d19e4632b2fb52, compression=none, cacheConf=cacheDataOnRead=true, cacheDataOnWrite=false, cacheIndexesOnWrite=false, cacheBloomsOnWrite=false, cacheEvictOnClose=false, cacheDataCompressed=false, prefetchOnOpen=false, firstKey=Optional[user00000000000000000000000000000000256006064453599002/c:field0/1745905948161/Put/seqid=0], lastKey=Optional[user00000000000000000000000000000000511999723045682420/c:field3/1745905845638/Put/seqid=0], avgKeyLen=73, avgValueLen=30, entries=134592, length=15040759, cur=null] to key org.apache.hadoop.hbase.PrivateCellUtil$FirstOnRowDeleteFamilyCell@1e25b769
>         at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:232)
>         at org.apache.hadoop.hbase.regionserver.StoreScanner.seekScanners(StoreScanner.java:416)
>         at org.apache.hadoop.hbase.regionserver.StoreScanner.<init>(StoreScanner.java:260)
>         at org.apache.hadoop.hbase.regionserver.HStore.createScanner(HStore.java:1712)
>         at org.apache.hadoop.hbase.regionserver.HStore.getScanner(HStore.java:1703)
>         at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.initializeScanners(RegionScannerImpl.java:166)
>         at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.<init>(RegionScannerImpl.java:146)
>         at org.apache.hadoop.hbase.regionserver.HRegion.instantiateRegionScanner(HRegion.java:3019)
>         at org.apache.hadoop.hbase.regionserver.HRegion.lambda$getScanner$3(HRegion.java:3004)
>         at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216)
>         at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2990)
>         at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2985)
>         at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2979)
>         at org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3203)
>         at org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3580)
>         at org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:45006)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:415)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
>         at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:102)
>         at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:82)
> Caused by: java.io.IOException: Encountered an exception when invoking ByteBuffer positioned read when trying to read 0 bytes from position 0
>         at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:368)
>         at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtra(BlockIOUtils.java:311)
>         at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readAtOffset(HFileBlock.java:1481)
>         at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockDataInternal(HFileBlock.java:1719)
>         at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockData(HFileBlock.java:1519)
>         at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1331)
>         at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1252)
>         at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.readAndUpdateNewBlock(HFileReaderImpl.java:943)
>         at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.seekTo(HFileReaderImpl.java:932)
>         at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seekAtOrAfter(StoreFileScanner.java:311)
>         at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:214)
>         ... 19 more
> Caused by: java.lang.reflect.InvocationTargetException
>         at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118)
>         at java.base/java.lang.reflect.Method.invoke(Method.java:580)
>         at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:363)
>         ... 29 more
> Caused by: java.io.IOException: 3 missing blocks, the stripe is: AlignedStripe(Offset=0, length=33, fetchedChunksNum=0, missingChunksNum=3); locatedBlocks is: LocatedBlocks{;  fileLength=15040759;  underConstruction=false;  blocks=[LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]}];  lastLocatedBlock=LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]};  isLastBlockComplete=true;  ecPolicy=ErasureCodingPolicy=[Name=RS-3-2-1024k, Schema=[ECSchema=[Codec=rs, numDataUnits=3, numParityUnits=2]], CellSize=1048576, Id=2]}
>         at org.apache.hadoop.hdfs.StripeReader.checkMissingBlocks(StripeReader.java:180)
>         at org.apache.hadoop.hdfs.StripeReader.readDataForDecoding(StripeReader.java:198)
>         at org.apache.hadoop.hdfs.StripeReader.readStripe(StripeReader.java:344)
>         at org.apache.hadoop.hdfs.DFSStripedInputStream.fetchBlockByteRange(DFSStripedInputStream.java:506)
>         at org.apache.hadoop.hdfs.DFSInputStream.pread(DFSInputStream.java:1499)
>         at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:1708)
>         at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:259)
>         at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
>         ... 31 more
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)
