We are still observing cascading region server restarts. Our Phoenix version is 4.14-HBase-1.4 at commit https://github.com/apache/phoenix/commit/52893c240e4f24e2bfac0834d35205f866c16ed8
At prod022 got this: Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,678 WARN [hconnection-0x4a616d85-shared--pool8-t10050] client.AsyncProcess: #21, table=KM_IDX1, attempt=1/1 failed=2ops, last exception: org.apache.hadoop.hbase.NotServingRegionException: org.apache.hadoop.hbase.NotServingRegionException: Region KM_IDX1,\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,1537400041091.9fdc7d07edce09b08b8d2750b24961b8. is not online on prod015,60020,1538417657739 Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.HRegionServer.getRegionByEncodedName(HRegionServer.java:3081) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1271) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2365) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277) Oct 02 03:24:03 prod022 hbase[160534]: on prod015,60020,1538417657739, tracking started Tue Oct 02 03:24:03 MSK 2018; not retrying 2 - final failure Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,695 INFO [RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020] index.PhoenixIndexFailurePolicy: Successfully update INDEX_DISABLE_TIMESTAMP for KM_IDX1 due to an exception while writing updates. 
indexState=PENDING_DISABLE Oct 02 03:24:03 prod022 hbase[160534]: org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException: disableIndexOnFailure=true, Failed to write to multiple index tables: [KM_IDX1] Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:236) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044) Oct 02 03:24:03 prod022 hbase[160534]: at 
org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(RSRpcServices.java:916) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(RSRpcServices.java:844) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2405) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297) Oct 02 03:24:03 prod022 hbase[160534]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277) Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,696 INFO [RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020] util.IndexManagementUtil: Rethrowing org.apache.hadoop.hbase.DoNotRetryIOException: ERROR 1121 (XCL21): Write to the index failed. 
disableIndexOnFailure=true, Failed to write to multiple index tables: [KM_IDX1] ,serverTimestamp=1538439843665, Oct 02 03:24:04 prod022 hbase[160534]: 2018-10-02 03:24:04,094 ERROR [prod022,60020,1538417662358-index-writer--pool5-t1605] client.AsyncProcess: Cannot get replica 0 location for {"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)=>n,t_B\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}} Oct 02 03:24:04 prod022 hbase[160534]: 2018-10-02 03:24:04,095 ERROR [prod022,60020,1538417662358-index-writer--pool5-t1605] client.AsyncProcess: Cannot get replica 0 location for {"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)!Tv_UD!\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}} I've already mentioned problem with org.apache.hadoop.hbase.NotServingRegionException in near email in this subscription. ERROR [prod022,60020,1538417662358-index-writer--pool5-t1605] client.AsyncProcess: Cannot get replica 0 location for {"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)=>n,t_B\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}} Is something new for me. 
Later prod022 goes to restart: Oct 02 03:24:52 prod022 hbase[160534]: 2018-10-02 03:24:52,238 ERROR [RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020] write.KillServerOnFailurePolicy: Could not update the index table, killing server region because couldn't write to an index table Next prod002 (and prod004,prod005,prod015, prod021) also goes to restart: Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,669 WARN [hconnection-0x3909810e-shared--pool21-t9595] client.AsyncProcess: #102, table=KM_IDX1, attempt=1/1 failed=6ops, last exception: java.io.IOException: Call to prod022/10.0.0.22:60020 failed on local exception: java.io.EOFException on prod022,60020,1538417662358, tracking started Tue Oct 02 03:25:12 MSK 2018; not retrying 6 - final failure Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,688 WARN [RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020] index.PhoenixIndexFailurePolicy: Attempt to disable index KM_IDX1 failed with code = UNALLOWED_TABLE_MUTATION. Will use default failure policy instead. Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,688 WARN [RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020] index.PhoenixIndexFailurePolicy: handleFailure failed Oct 02 03:25:12 prod002 hbase[195373]: java.io.IOException: org.apache.hadoop.hbase.DoNotRetryIOException: Attempt to disable KM_IDX1 failed. 
Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.security.User.runAsLoginUser(User.java:212) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.index.PhoenixIndexFailurePolicy.handleFailureWithExceptions(PhoenixIndexFailurePolicy.java:244) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.index.PhoenixIndexFailurePolicy.handleFailure(PhoenixIndexFailurePolicy.java:153) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:161) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646) Oct 02 03:25:12 prod002 hbase[195373]: at 
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(RSRpcServices.java:916) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(RSRpcServices.java:844) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2405) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277) Oct 02 03:25:12 prod002 hbase[195373]: Caused by: org.apache.hadoop.hbase.DoNotRetryIOException: Attempt to disable KM_IDX1 failed. 
Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.index.PhoenixIndexFailurePolicy$2.run(PhoenixIndexFailurePolicy.java:280) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.phoenix.index.PhoenixIndexFailurePolicy$2.run(PhoenixIndexFailurePolicy.java:244) Oct 02 03:25:12 prod002 hbase[195373]: at java.security.AccessController.doPrivileged(Native Method) Oct 02 03:25:12 prod002 hbase[195373]: at javax.security.auth.Subject.doAs(Subject.java:422) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1746) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:448) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:429) Oct 02 03:25:12 prod002 hbase[195373]: at sun.reflect.GeneratedMethodAccessor160.invoke(Unknown Source) Oct 02 03:25:12 prod002 hbase[195373]: at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) Oct 02 03:25:12 prod002 hbase[195373]: at java.lang.reflect.Method.invoke(Method.java:498) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.util.Methods.call(Methods.java:39) Oct 02 03:25:12 prod002 hbase[195373]: at org.apache.hadoop.hbase.security.User.runAsLoginUser(User.java:210) Oct 02 03:25:12 prod002 hbase[195373]: ... 23 more Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,689 ERROR [RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020] write.KillServerOnFailurePolicy: Could not update the index table, killing server region because couldn't write to an index table > On 16 Sep 2018, at 07:43, Sergey Soldatov <sergey.solda...@gmail.com> wrote: > > Obviously yes. If it's not configured than default handlers would be used > for index writes and may lead to the distributed deadlock. 
> > Thanks, > Sergey > > On Sat, Sep 15, 2018 at 11:36 AM Batyrshin Alexander <0x62...@gmail.com > <mailto:0x62...@gmail.com>> wrote: > I've found that we still not configured this: > > hbase.region.server.rpc.scheduler.factory.class = > org.apache.hadoop.hbase.ipc.PhoenixRpcSchedulerFactory > > Can this misconfiguration leads to our problems? > >> On 15 Sep 2018, at 02:04, Sergey Soldatov <sergey.solda...@gmail.com >> <mailto:sergey.solda...@gmail.com>> wrote: >> >> That was the real problem quite a long time ago (couple years?). Can't say >> for sure in which version that was fixed, but now indexes has a priority >> over regular tables and their regions open first. So by the moment when we >> replay WALs for tables, all index regions are supposed to be online. If you >> see the problem on recent versions that usually means that cluster is not >> healthy and some of the index regions stuck in RiT state. >> >> Thanks, >> Sergey >> >> On Thu, Sep 13, 2018 at 8:12 PM Jonathan Leech <jonat...@gmail.com >> <mailto:jonat...@gmail.com>> wrote: >> This seems similar to a failure scenario I’ve seen a couple times. I believe >> after multiple restarts you got lucky and tables were brought up by Hbase in >> the correct order. >> >> What happens is some kind of semi-catastrophic failure where 1 or more >> region servers go down with edits that weren’t flushed, and are only in the >> WAL. These edits belong to regions whose tables have secondary indexes. >> Hbase wants to replay the WAL before bringing up the region server. Phoenix >> wants to talk to the index region during this, but can’t. It fails enough >> times then stops. >> >> The more region servers / tables / indexes affected, the more likely that a >> full restart will get stuck in a classic deadlock. A good old-fashioned data >> center outage is a great way to get started with this kind of problem. 
You >> might make some progress and get stuck again, or restart number N might get >> those index regions initialized before the main table. >> >> The sure fire way to recover a cluster in this condition is to strategically >> disable all the tables that are failing to come up. You can do this from the >> Hbase shell as long as the master is running. If I remember right, it’s a >> pain since the disable command will hang. You might need to disable a table, >> kill the shell, disable the next table, etc. Then restart. You’ll eventually >> have a cluster with all the region servers finally started, and a bunch of >> disabled regions. If you disabled index tables, enable one, wait for it to >> become available; eg its WAL edits will be replayed, then enable the >> associated main table and wait for it to come online. If Hbase did it’s job >> without error, and your failure didn’t include losing 4 disks at once, order >> will be restored. Lather, rinse, repeat until everything is enabled and >> online. >> >> <TLDR> A big enough failure sprinkled with a little bit of bad luck and what >> seems to be a Phoenix flaw == deadlock trying to get HBASE to start up. Fix >> by forcing the order that Hbase brings regions online. Finally, never go >> full restart. </TLDR> >> >> > On Sep 10, 2018, at 7:30 PM, Batyrshin Alexander <0x62...@gmail.com >> > <mailto:0x62...@gmail.com>> wrote: >> > >> > After update web interface at Master show that every region server now >> > 1.4.7 and no RITS. >> > >> > Cluster recovered only when we restart all regions servers 4 times... >> > >> >> On 11 Sep 2018, at 04:08, Josh Elser <els...@apache.org >> >> <mailto:els...@apache.org>> wrote: >> >> >> >> Did you update the HBase jars on all RegionServers? >> >> >> >> Make sure that you have all of the Regions assigned (no RITs). There >> >> could be a pretty simple explanation as to why the index can't be written >> >> to. 
>> >> >> >>> On 9/9/18 3:46 PM, Batyrshin Alexander wrote: >> >>> Correct me if im wrong. >> >>> But looks like if you have A and B region server that has index and >> >>> primary table then possible situation like this. >> >>> A and B under writes on table with indexes >> >>> A - crash >> >>> B failed on index update because A is not operating then B starting >> >>> aborting >> >>> A after restart try to rebuild index from WAL but B at this time is >> >>> aborting then A starting aborting too >> >>> From this moment nothing happens (0 requests to region servers) and A >> >>> and B is not responsible from Master-status web interface >> >>>> On 9 Sep 2018, at 04:38, Batyrshin Alexander <0x62...@gmail.com >> >>>> <mailto:0x62...@gmail.com> <mailto:0x62...@gmail.com >> >>>> <mailto:0x62...@gmail.com>>> wrote: >> >>>> >> >>>> After update we still can't recover HBase cluster. Our region servers >> >>>> ABORTING over and over: >> >>>> >> >>>> prod003: >> >>>> Sep 09 02:51:27 prod003 hbase[1440]: 2018-09-09 02:51:27,395 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=92,queue=2,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536446665703: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:51:27 prod003 hbase[1440]: 2018-09-09 02:51:27,395 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=77,queue=7,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536446665703: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:52:19 prod003 hbase[1440]: 2018-09-09 02:52:19,224 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536446665703: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> 
>>>> Sep 09 02:52:28 prod003 hbase[1440]: 2018-09-09 02:52:28,922 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=94,queue=4,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536446665703: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:55:02 prod003 hbase[957]: 2018-09-09 02:55:02,096 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=95,queue=5,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536450772841: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:55:18 prod003 hbase[957]: 2018-09-09 02:55:18,793 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=97,queue=7,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod003,60020,1536450772841: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> >> >>>> prod004: >> >>>> Sep 09 02:52:13 prod004 hbase[4890]: 2018-09-09 02:52:13,541 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=83,queue=3,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod004,60020,1536446387325: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:52:50 prod004 hbase[4890]: 2018-09-09 02:52:50,264 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=75,queue=5,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod004,60020,1536446387325: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:53:40 prod004 hbase[4890]: 2018-09-09 02:53:40,709 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=66,queue=6,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod004,60020,1536446387325: Could not update the 
index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:54:00 prod004 hbase[4890]: 2018-09-09 02:54:00,060 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=89,queue=9,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod004,60020,1536446387325: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> >> >>>> prod005: >> >>>> Sep 09 02:52:50 prod005 hbase[3772]: 2018-09-09 02:52:50,661 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=65,queue=5,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod005,60020,1536446400009: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:53:27 prod005 hbase[3772]: 2018-09-09 02:53:27,542 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=90,queue=0,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod005,60020,1536446400009: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:54:00 prod005 hbase[3772]: 2018-09-09 02:53:59,915 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=7,queue=7,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod005,60020,1536446400009: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: 2018-09-09 02:54:30,058 FATAL >> >>>> [RpcServer.default.FPBQ.Fifo.handler=16,queue=6,port=60020] >> >>>> regionserver.HRegionServer: ABORTING region server >> >>>> prod005,60020,1536446400009: Could not update the index table, killing >> >>>> server region because couldn't write to an index table >> >>>> >> >>>> And so on... 
>> >>>> >> >>>> Trace is the same everywhere: >> >>>> >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: >> >>>> org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException: >> >>>> disableIndexOnFailure=true, Failed to write to multiple index tables: >> >>>> [KM_IDX1, KM_IDX2, KM_HISTORY_IDX1, KM_HISTORY_IDX2, KM_HISTORY_IDX3] >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:235) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> 
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatch(UngroupedAggregateRegionObserver.java:271) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatchWithRetries(UngroupedAggregateRegionObserver.java:241) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.rebuildIndices(UngroupedAggregateRegionObserver.java:1068) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.doPostScannerOpen(UngroupedAggregateRegionObserver.java:386) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.overrideDelegate(BaseScannerRegionObserver.java:239) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.nextRaw(BaseScannerRegionObserver.java:287) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:2843) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> 
org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3080) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36613) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2354) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297) >> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at >> >>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277) >> >>>> >> >>>>> On 9 Sep 2018, at 01:44, Batyrshin Alexander <0x62...@gmail.com >> >>>>> <mailto:0x62...@gmail.com> <mailto:0x62...@gmail.com >> >>>>> <mailto:0x62...@gmail.com>>> wrote: >> >>>>> >> >>>>> Thank you. >> >>>>> We're updating our cluster right now... >> >>>>> >> >>>>> >> >>>>>> On 9 Sep 2018, at 01:39, Ted Yu <yuzhih...@gmail.com >> >>>>>> <mailto:yuzhih...@gmail.com> <mailto:yuzhih...@gmail.com >> >>>>>> <mailto:yuzhih...@gmail.com>>> wrote: >> >>>>>> >> >>>>>> It seems you should deploy hbase with the following fix: >> >>>>>> >> >>>>>> HBASE-21069 NPE in StoreScanner.updateReaders causes RS to crash >> >>>>>> >> >>>>>> 1.4.7 was recently released. >> >>>>>> >> >>>>>> FYI >> >>>>>> >> >>>>>> On Sat, Sep 8, 2018 at 3:32 PM Batyrshin Alexander <0x62...@gmail.com >> >>>>>> <mailto:0x62...@gmail.com> <mailto:0x62...@gmail.com >> >>>>>> <mailto:0x62...@gmail.com>>> wrote: >> >>>>>> >> >>>>>> Hello, >> >>>>>> >> >>>>>> We got this exception from *prod006* server >> >>>>>> >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: 2018-09-09 00:38:02,532 >> >>>>>> FATAL [MemStoreFlusher.1] regionserver.HRegionServer: ABORTING >> >>>>>> region server prod006,60020,1536235102833: Replay of >> >>>>>> WAL required. 
Forcing server shutdown >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: >> >>>>>> org.apache.hadoop.hbase.DroppedSnapshotException: >> >>>>>> region: >> >>>>>> KM,c\xEF\xBF\xBD\x16I7\xEF\xBF\xBD\x0A"A\xEF\xBF\xBDd\xEF\xBF\xBD\xEF\xBF\xBD\x19\x07t,1536178245576.60c121ba50e67f2429b9ca2ba2a11bad. >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2645) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2322) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2284) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2170) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:2095) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:508) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:478) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$900(MemStoreFlusher.java:76) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:264) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> java.lang.Thread.run(Thread.java:748) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: Caused by: >> >>>>>> java.lang.NullPointerException >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> 
java.util.ArrayList.<init>(ArrayList.java:178) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.StoreScanner.updateReaders(StoreScanner.java:863) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.notifyChangedReadersObservers(HStore.java:1172) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.updateStorefiles(HStore.java:1145) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.access$900(HStore.java:122) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2505) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2600) >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: ... 
9 more >> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: 2018-09-09 00:38:02,532 >> >>>>>> FATAL [MemStoreFlusher.1] regionserver.HRegionServer: >> >>>>>> RegionServer abort: loaded coprocessors >> >>>>>> are: >> >>>>>> [org.apache.hadoop.hbase.regionserver.IndexHalfStoreFileReaderGenerator, >> >>>>>> org.apache.phoenix.coprocessor.SequenceRegionObserver, >> >>>>>> org.apache.phoenix.c >> >>>>>> >> >>>>>> After that we got ABORTING on almost every Region Server in the >> >>>>>> cluster, with different reasons: >> >>>>>> >> >>>>>> *prod003* >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: 2018-09-09 01:12:11,799 >> >>>>>> FATAL [PostOpenDeployTasks:88bfac1dfd807c4cd1e9c1f31b4f053f] >> >>>>>> regionserver.HRegionServer: ABORTING region >> >>>>>> server prod003,60020,1536444066291: Exception running >> >>>>>> postOpenDeployTasks; region=88bfac1dfd807c4cd1e9c1f31b4f053f >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: >> >>>>>> java.io.InterruptedIOException: #139, interrupted. >> >>>>>> currentNumberOfTask=8 >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForMaximumCurrentTasks(AsyncProcess.java:1853) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForMaximumCurrentTasks(AsyncProcess.java:1823) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForAllPreviousOpsAndReset(AsyncProcess.java:1899) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.client.BufferedMutatorImpl.backgroundFlushCommits(BufferedMutatorImpl.java:250) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.client.BufferedMutatorImpl.flush(BufferedMutatorImpl.java:213) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> org.apache.hadoop.hbase.client.HTable.flushCommits(HTable.java:1484) >> 
>>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> org.apache.hadoop.hbase.client.HTable.put(HTable.java:1031) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.put(MetaTableAccessor.java:1033) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.putToMetaTable(MetaTableAccessor.java:1023) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.updateLocation(MetaTableAccessor.java:1433) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.updateRegionLocation(MetaTableAccessor.java:1400) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegionServer.postOpenDeployTasks(HRegionServer.java:2041) >> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.handler.OpenRegionHandler$PostOpenDeployTasksThread.run(OpenRegionHandler.java:329) >> >>>>>> >> >>>>>> *prod002* >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: 2018-09-09 01:12:30,144 >> >>>>>> FATAL >> >>>>>> [RpcServer.default.FPBQ.Fifo.handler=36,queue=6,port=60020] >> >>>>>> regionserver.HRegionServer: ABORTING region >> >>>>>> server prod002,60020,1536235138673: Could not update the index >> >>>>>> table, killing server region because couldn't write to an index >> >>>>>> table >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException: >> >>>>>> disableIndexOnFailure=true, Failed to write to multiple index >> >>>>>> tables: [KM_IDX1, KM_IDX2, KM_HISTORY1, KM_HISTORY2, >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:235) >> >>>>>> Sep 09 
01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646) >> >>>>>> Sep 09 01:12:30 prod002 
hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatch(UngroupedAggregateRegionObserver.java:271) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.access$000(UngroupedAggregateRegionObserver.java:164) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver$1.doMutation(UngroupedAggregateRegionObserver.java:246) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.index.PhoenixIndexFailurePolicy.doBatchWithRetries(PhoenixIndexFailurePolicy.java:455) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.handleIndexWriteException(UngroupedAggregateRegionObserver.java:929) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatchWithRetries(UngroupedAggregateRegionObserver.java:243) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.rebuildIndices(UngroupedAggregateRegionObserver.java:1077) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.doPostScannerOpen(UngroupedAggregateRegionObserver.java:386) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> 
org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.overrideDelegate(BaseScannerRegionObserver.java:239) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.nextRaw(BaseScannerRegionObserver.java:287) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:2843) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3080) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36613) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2354) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297) >> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at >> >>>>>> >> >>>>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277) >> >>>>>> >> >>>>>> >> >>>>>> Etc. >> >>>>>> >> >>>>>> The Master-status web interface shows that contact was lost with these >> >>>>>> aborted servers. >> >>>>> >> >>>> >> > >