hi all, 再向社区求助一个RS宕机隐患,HBase版本是2.2.6,看了源码,理论上也会在最新HBase版本中出现,具体的异常栈如下:
2023-01-24 19:14:45,414 ERROR [RpcServer.default.RWQ.Fifo.read.handler=92,queue=11,port=60020] ipc.RpcServer: Unexpected throwable object java.lang.IllegalArgumentException: In CellChunkMap, cell must be associated with chunk.. We were looking for a cell at index 5 at org.apache.hadoop.hbase.regionserver.CellChunkMap.getCell(CellChunkMap.java:109) at org.apache.hadoop.hbase.regionserver.CellFlatMap.find(CellFlatMap.java:87) at org.apache.hadoop.hbase.regionserver.CellFlatMap.getValidIndex(CellFlatMap.java:114) at org.apache.hadoop.hbase.regionserver.CellFlatMap.tailMap(CellFlatMap.java:184) at org.apache.hadoop.hbase.regionserver.CellFlatMap.tailMap(CellFlatMap.java:45) at org.apache.hadoop.hbase.regionserver.CellSet.tailSet(CellSet.java:150) at org.apache.hadoop.hbase.regionserver.CellSet.tailSet(CellSet.java:145) at org.apache.hadoop.hbase.regionserver.Segment.tailSet(Segment.java:414) at org.apache.hadoop.hbase.regionserver.SegmentScanner.getIterator(SegmentScanner.java:131) at org.apache.hadoop.hbase.regionserver.SegmentScanner.reseek(SegmentScanner.java:156) at org.apache.hadoop.hbase.regionserver.NonLazyKeyValueScanner.doRealSeek(NonLazyKeyValueScanner.java:55) at org.apache.hadoop.hbase.regionserver.KeyValueHeap.generalizedSeek(KeyValueHeap.java:324) at org.apache.hadoop.hbase.regionserver.KeyValueHeap.reseek(KeyValueHeap.java:267) at org.apache.hadoop.hbase.regionserver.StoreScanner.reseek(StoreScanner.java:1099) at org.apache.hadoop.hbase.regionserver.StoreScanner.seekAsDirection(StoreScanner.java:1088) at org.apache.hadoop.hbase.regionserver.StoreScanner.seekOrSkipToNextColumn(StoreScanner.java:823) at org.apache.hadoop.hbase.regionserver.StoreScanner.next(StoreScanner.java:730) at org.apache.hadoop.hbase.regionserver.KeyValueHeap.next(KeyValueHeap.java:157) at org.apache.hadoop.hbase.regionserver.HRegion$RegionScannerImpl.populateResult(HRegion.java:6681) at org.apache.hadoop.hbase.regionserver.HRegion$RegionScannerImpl.nextInternal(HRegion.java:6845) at 
org.apache.hadoop.hbase.regionserver.HRegion$RegionScannerImpl.nextRaw(HRegion.java:6615) at org.apache.hadoop.hbase.regionserver.HRegion$RegionScannerImpl.next(HRegion.java:6592) at org.apache.hadoop.hbase.regionserver.HRegion$RegionScannerImpl.next(HRegion.java:6579) at org.apache.hadoop.hbase.regionserver.RSRpcServices.get(RSRpcServices.java:2645) at org.apache.hadoop.hbase.regionserver.RSRpcServices.get(RSRpcServices.java:2571) at org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:42274) at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:379) at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133) at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338) at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 2023-01-24 19:14:45,419 WARN [main-BucketCacheWriter-1] bucket.BucketCache: Failed allocation for 5740f58a86a14107afaab310bf2444cb_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=4493298; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 2023-01-24 19:14:45,420 WARN [main-BucketCacheWriter-1] bucket.BucketCache: Failed allocation for b1aa2b31cc8a4a23bb5d6bc3d43df51b_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=3260552; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 2023-01-24 19:14:45,424 WARN [main-BucketCacheWriter-1] bucket.BucketCache: Failed allocation for e20825c52020440c9eaa1abc29e3516b_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=3266144; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 
2023-01-24 19:14:45,426 WARN [main-BucketCacheWriter-1] bucket.BucketCache: Failed allocation for 591bb35d28d64eaebc30079f8f83b529_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=3261225; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 2023-01-24 19:14:45,430 WARN [main-BucketCacheWriter-2] bucket.BucketCache: Failed allocation for a94eeb8e9e9f4cc590d3c6cd2118c48d_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=3260054; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 2023-01-24 19:14:45,436 WARN [main-BucketCacheWriter-1] bucket.BucketCache: Failed allocation for 1ce6f0cb59b64345aba5d0c21c1b178f_0; org.apache.hadoop.hbase.io.hfile.bucket.BucketAllocatorException: Allocation too big size=3260278; adjust BucketCache sizes hbase.bucketcache.bucket.sizes to accomodate if size seems reasonable and you want it cached. 2023-01-24 19:14:45,439 ERROR [MemStoreFlusher.7] regionserver.HRegionServer: ***** ABORTING region server node71-130-29.hadoop,60020,1670988984660: Replay of WAL required. Forcing server shutdown ***** org.apache.hadoop.hbase.DroppedSnapshotException: region: boss_nova_tag_vip,,1628813687297.eb382431ecdd2207af9f25159f09386d. 
at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2882) at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2551) at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2523) at org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2409) at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:611) at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:580) at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$1000(MemStoreFlusher.java:68) at org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:360) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.IllegalArgumentException: In CellChunkMap, cell must be associated with chunk.. We were looking for a cell at index 5 at org.apache.hadoop.hbase.regionserver.CellChunkMap.getCell(CellChunkMap.java:109) at org.apache.hadoop.hbase.regionserver.CellFlatMap$CellFlatMapIterator.next(CellFlatMap.java:441) at org.apache.hadoop.hbase.regionserver.CellFlatMap$CellFlatMapIterator.next(CellFlatMap.java:427) at org.apache.hadoop.hbase.regionserver.SnapshotSegmentScanner.reseek(SnapshotSegmentScanner.java:79) at org.apache.hadoop.hbase.regionserver.NonLazyKeyValueScanner.doRealSeek(NonLazyKeyValueScanner.java:55) at org.apache.hadoop.hbase.regionserver.KeyValueHeap.generalizedSeek(KeyValueHeap.java:324) at org.apache.hadoop.hbase.regionserver.KeyValueHeap.reseek(KeyValueHeap.java:267) at org.apache.hadoop.hbase.regionserver.StoreScanner.reseek(StoreScanner.java:1099) at org.apache.hadoop.hbase.regionserver.StoreScanner.seekAsDirection(StoreScanner.java:1088) at org.apache.hadoop.hbase.regionserver.StoreScanner.seekOrSkipToNextColumn(StoreScanner.java:823) at org.apache.hadoop.hbase.regionserver.StoreScanner.next(StoreScanner.java:730) at 
org.apache.hadoop.hbase.regionserver.StoreFlusher.performFlush(StoreFlusher.java:127) at org.apache.hadoop.hbase.regionserver.DefaultStoreFlusher.flushSnapshot(DefaultStoreFlusher.java:69) at org.apache.hadoop.hbase.regionserver.HStore.flushCache(HStore.java:1057) at org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.flushCache(HStore.java:2374) at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2818) ... 8 more 涉及的源码位置:hbase-server模块中的CellChunkMap类的getCell方法内部

protected Cell getCell(int i) {
  // get the index of the relevant chunk inside chunk array
  int chunkIndex = (i / numOfCellRepsInChunk);
  ByteBuffer block = chunks[chunkIndex].getData(); // get the ByteBuffer of the relevant chunk
  int j = i - chunkIndex * numOfCellRepsInChunk; // get the index of the cell-representation
  // find inside the offset inside the chunk holding the index, skip bytes for chunk id
  int offsetInBytes = ChunkCreator.SIZEOF_CHUNK_HEADER + j * ClassSize.CELL_CHUNK_MAP_ENTRY;
  // find the chunk holding the data of the cell, the chunkID is stored first
  int chunkId = ByteBufferUtils.toInt(block, offsetInBytes);
  Chunk chunk = ChunkCreator.getInstance().getChunk(chunkId);
  if (chunk == null) {
    // this should not happen
    throw new IllegalArgumentException("In CellChunkMap, cell must be associated with chunk."
        + ". We were looking for a cell at index " + i);
  }
  // ...省略...
}