We are still observing cascading region server restarts.
Our Phoenix version is 4.14-HBase-1.4 at commit
https://github.com/apache/phoenix/commit/52893c240e4f24e2bfac0834d35205f866c16ed8
On prod022 we got this:
Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,678 WARN
[hconnection-0x4a616d85-shared--pool8-t10050] client.AsyncProcess: #21,
table=KM_IDX1, attempt=1/1 failed=2ops, last exception:
org.apache.hadoop.hbase.NotServingRegionException:
org.apache.hadoop.hbase.NotServingRegionException: Region
KM_IDX1,\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,1537400041091.9fdc7d07edce09b08b8d2750b24961b8.
is not online on prod015,60020,1538417657739
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.HRegionServer.getRegionByEncodedName(HRegionServer.java:3081)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1271)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2365)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277)
Oct 02 03:24:03 prod022 hbase[160534]: on prod015,60020,1538417657739,
tracking started Tue Oct 02 03:24:03 MSK 2018; not retrying 2 - final failure
Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,695 INFO
[RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020]
index.PhoenixIndexFailurePolicy: Successfully update INDEX_DISABLE_TIMESTAMP
for KM_IDX1 due to an exception while writing updates.
indexState=PENDING_DISABLE
Oct 02 03:24:03 prod022 hbase[160534]:
org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException:
disableIndexOnFailure=true, Failed to write to multiple index tables: [KM_IDX1]
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:236)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(RSRpcServices.java:916)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(RSRpcServices.java:844)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2405)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297)
Oct 02 03:24:03 prod022 hbase[160534]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277)
Oct 02 03:24:03 prod022 hbase[160534]: 2018-10-02 03:24:03,696 INFO
[RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020]
util.IndexManagementUtil: Rethrowing
org.apache.hadoop.hbase.DoNotRetryIOException: ERROR 1121 (XCL21): Write to the
index failed. disableIndexOnFailure=true, Failed to write to multiple index
tables: [KM_IDX1] ,serverTimestamp=1538439843665,
Oct 02 03:24:04 prod022 hbase[160534]: 2018-10-02 03:24:04,094 ERROR
[prod022,60020,1538417662358-index-writer--pool5-t1605] client.AsyncProcess:
Cannot get replica 0 location for
{"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)=>n,t_B\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}}
Oct 02 03:24:04 prod022 hbase[160534]: 2018-10-02 03:24:04,095 ERROR
[prod022,60020,1538417662358-index-writer--pool5-t1605] client.AsyncProcess:
Cannot get replica 0 location for
{"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)!Tv_UD!\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}}
I've already mentioned the problem with
org.apache.hadoop.hbase.NotServingRegionException in a recent email to this
list.
ERROR [prod022,60020,1538417662358-index-writer--pool5-t1605]
client.AsyncProcess: Cannot get replica 0 location for
{"totalColumns":13,"row":"\\x05\\x00(01)04600266008909(21)=>n,t_B\\x00\\x01\\x80\\x00\\x01f2&*p\\x00\\x00\\x00\\x00","families":{"d":[{"qualifier":"_0","vlen":2,"tag":[],"timestamp":1538439843665},{"qualifier":"d:st","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:pt","vlen":1,"tag":[],"timestamp":1538439843665},{"qualifier":"d:sid","vlen":2,"tag":[],"timestamp":1538439843665}]}}
This one, however, is new to me.
Later, prod022 aborted and restarted:
Oct 02 03:24:52 prod022 hbase[160534]: 2018-10-02 03:24:52,238 ERROR
[RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020]
write.KillServerOnFailurePolicy: Could not update the index table, killing
server region because couldn't write to an index table
Next, prod002 (and prod004, prod005, prod015, prod021) also aborted and restarted:
Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,669 WARN
[hconnection-0x3909810e-shared--pool21-t9595] client.AsyncProcess: #102,
table=KM_IDX1, attempt=1/1 failed=6ops, last exception: java.io.IOException:
Call to prod022/10.0.0.22:60020 failed on local exception: java.io.EOFException
on prod022,60020,1538417662358, tracking started Tue Oct 02 03:25:12 MSK 2018;
not retrying 6 - final failure
Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,688 WARN
[RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020]
index.PhoenixIndexFailurePolicy: Attempt to disable index KM_IDX1 failed with
code = UNALLOWED_TABLE_MUTATION. Will use default failure policy instead.
Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,688 WARN
[RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020]
index.PhoenixIndexFailurePolicy: handleFailure failed
Oct 02 03:25:12 prod002 hbase[195373]: java.io.IOException:
org.apache.hadoop.hbase.DoNotRetryIOException: Attempt to disable KM_IDX1
failed.
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.security.User.runAsLoginUser(User.java:212)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.index.PhoenixIndexFailurePolicy.handleFailureWithExceptions(PhoenixIndexFailurePolicy.java:244)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.index.PhoenixIndexFailurePolicy.handleFailure(PhoenixIndexFailurePolicy.java:153)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:161)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(RSRpcServices.java:916)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(RSRpcServices.java:844)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2405)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36621)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2359)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277)
Oct 02 03:25:12 prod002 hbase[195373]: Caused by:
org.apache.hadoop.hbase.DoNotRetryIOException: Attempt to disable KM_IDX1
failed.
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.index.PhoenixIndexFailurePolicy$2.run(PhoenixIndexFailurePolicy.java:280)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.phoenix.index.PhoenixIndexFailurePolicy$2.run(PhoenixIndexFailurePolicy.java:244)
Oct 02 03:25:12 prod002 hbase[195373]: at
java.security.AccessController.doPrivileged(Native Method)
Oct 02 03:25:12 prod002 hbase[195373]: at
javax.security.auth.Subject.doAs(Subject.java:422)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1746)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:448)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:429)
Oct 02 03:25:12 prod002 hbase[195373]: at
sun.reflect.GeneratedMethodAccessor160.invoke(Unknown Source)
Oct 02 03:25:12 prod002 hbase[195373]: at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
Oct 02 03:25:12 prod002 hbase[195373]: at
java.lang.reflect.Method.invoke(Method.java:498)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.util.Methods.call(Methods.java:39)
Oct 02 03:25:12 prod002 hbase[195373]: at
org.apache.hadoop.hbase.security.User.runAsLoginUser(User.java:210)
Oct 02 03:25:12 prod002 hbase[195373]: ... 23 more
Oct 02 03:25:12 prod002 hbase[195373]: 2018-10-02 03:25:12,689 ERROR
[RpcServer.default.FPBQ.Fifo.handler=58,queue=8,port=60020]
write.KillServerOnFailurePolicy: Could not update the index table, killing
server region because couldn't write to an index table
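
For reference, below is my understanding of the full index-priority RPC
configuration from the Phoenix secondary indexing docs; it relates to the
hbase.region.server.rpc.scheduler.factory.class property discussed in the
quoted thread below. The scheduler factory value is the one quoted there;
pairing it with the controller factory is my reading of the docs for
4.14/HBase 1.4, so treat this as a sketch to verify rather than a known-good
setup:

  <!-- hbase-site.xml on every region server (restart required). Routes index
       and metadata writes to dedicated RPC handler pools so they do not
       compete with regular table writes. -->
  <property>
    <name>hbase.region.server.rpc.scheduler.factory.class</name>
    <value>org.apache.hadoop.hbase.ipc.PhoenixRpcSchedulerFactory</value>
  </property>
  <!-- Assumed companion setting per the Phoenix docs; please correct me if it
       is not needed on this version. -->
  <property>
    <name>hbase.rpc.controllerfactory.class</name>
    <value>org.apache.hadoop.hbase.ipc.controller.ServerRpcControllerFactory</value>
  </property>
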
> On 16 Sep 2018, at 07:43, Sergey Soldatov <[email protected]> wrote:
>
> Obviously yes. If it's not configured, then the default handlers would be used
> for index writes, which may lead to a distributed deadlock.
>
> Thanks,
> Sergey
>
> On Sat, Sep 15, 2018 at 11:36 AM Batyrshin Alexander <[email protected]> wrote:
> I've found that we still have not configured this:
>
> hbase.region.server.rpc.scheduler.factory.class =
> org.apache.hadoop.hbase.ipc.PhoenixRpcSchedulerFactory
>
> Can this misconfiguration lead to our problems?
>
>> On 15 Sep 2018, at 02:04, Sergey Soldatov <[email protected]> wrote:
>>
>> That was the real problem quite a long time ago (a couple of years?). I can't say
>> for sure in which version that was fixed, but now indexes have priority
>> over regular tables and their regions open first. So by the time we
>> replay WALs for tables, all index regions are supposed to be online. If you
>> see the problem on recent versions, that usually means the cluster is not
>> healthy and some of the index regions are stuck in RIT state.
>>
>> Thanks,
>> Sergey
>>
>> On Thu, Sep 13, 2018 at 8:12 PM Jonathan Leech <[email protected]> wrote:
>> This seems similar to a failure scenario I’ve seen a couple times. I believe
>> after multiple restarts you got lucky and tables were brought up by Hbase in
>> the correct order.
>>
>> What happens is some kind of semi-catastrophic failure where 1 or more
>> region servers go down with edits that weren’t flushed, and are only in the
>> WAL. These edits belong to regions whose tables have secondary indexes.
>> Hbase wants to replay the WAL before bringing up the region server. Phoenix
>> wants to talk to the index region during this, but can’t. It fails enough
>> times then stops.
>>
>> The more region servers / tables / indexes affected, the more likely that a
>> full restart will get stuck in a classic deadlock. A good old-fashioned data
>> center outage is a great way to get started with this kind of problem. You
>> might make some progress and get stuck again, or restart number N might get
>> those index regions initialized before the main table.
>>
>> The sure fire way to recover a cluster in this condition is to strategically
>> disable all the tables that are failing to come up. You can do this from the
>> Hbase shell as long as the master is running. If I remember right, it’s a
>> pain since the disable command will hang. You might need to disable a table,
>> kill the shell, disable the next table, etc. Then restart. You’ll eventually
>> have a cluster with all the region servers finally started, and a bunch of
>> disabled regions. If you disabled index tables, enable one, wait for it to
>> become available; eg its WAL edits will be replayed, then enable the
>> associated main table and wait for it to come online. If Hbase did its job
>> without error, and your failure didn’t include losing 4 disks at once, order
>> will be restored. Lather, rinse, repeat until everything is enabled and
>> online.
>>
>> <TLDR> A big enough failure sprinkled with a little bit of bad luck and what
>> seems to be a Phoenix flaw == deadlock trying to get HBASE to start up. Fix
>> by forcing the order that Hbase brings regions online. Finally, never go
>> full restart. </TLDR>
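
To make the recovery procedure above concrete for next time, here is my sketch
of the steps in the HBase shell, using our table names (KM and its index
KM_IDX1). This is only my reading of what Jonathan describes, not something we
have scripted or verified:

  hbase shell
  # Disable the tables that are failing to come up; the command may hang, so
  # it can take several shell sessions (disable, kill the shell, repeat).
  disable 'KM'
  disable 'KM_IDX1'
  # Restart the region servers, then bring the index table back first and wait
  # for its regions to come fully online (its WAL edits get replayed here).
  enable 'KM_IDX1'
  # Only after the index is online, enable the associated data table.
  enable 'KM'
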
>>
>> > On Sep 10, 2018, at 7:30 PM, Batyrshin Alexander <[email protected]> wrote:
>> >
>> > After the update, the Master web interface shows that every region server is now
>> > on 1.4.7 and there are no RITs.
>> >
>> > The cluster recovered only after we restarted all region servers 4 times...
>> >
>> >> On 11 Sep 2018, at 04:08, Josh Elser <[email protected]> wrote:
>> >>
>> >> Did you update the HBase jars on all RegionServers?
>> >>
>> >> Make sure that you have all of the Regions assigned (no RITs). There
>> >> could be a pretty simple explanation as to why the index can't be written
>> >> to.
>> >>
>> >>> On 9/9/18 3:46 PM, Batyrshin Alexander wrote:
>> >>> Correct me if I'm wrong.
>> >>> But it looks like if you have region servers A and B that host both index and
>> >>> primary table regions, then a situation like this is possible:
>> >>> A and B are under writes on a table with indexes
>> >>> A crashes
>> >>> B fails on an index update because A is not operating, so B starts
>> >>> aborting
>> >>> After restart, A tries to rebuild the index from the WAL, but B is aborting at
>> >>> this time, so A starts aborting too
>> >>> From this moment nothing happens (0 requests to region servers), and A
>> >>> and B are shown as not responding in the Master-status web interface
>> >>>> On 9 Sep 2018, at 04:38, Batyrshin Alexander <[email protected]> wrote:
>> >>>>
>> >>>> After the update we still can't recover the HBase cluster. Our region servers
>> >>>> keep ABORTING over and over:
>> >>>>
>> >>>> prod003:
>> >>>> Sep 09 02:51:27 prod003 hbase[1440]: 2018-09-09 02:51:27,395 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=92,queue=2,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536446665703: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:51:27 prod003 hbase[1440]: 2018-09-09 02:51:27,395 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=77,queue=7,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536446665703: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:52:19 prod003 hbase[1440]: 2018-09-09 02:52:19,224 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=82,queue=2,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536446665703: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:52:28 prod003 hbase[1440]: 2018-09-09 02:52:28,922 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=94,queue=4,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536446665703: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:55:02 prod003 hbase[957]: 2018-09-09 02:55:02,096 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=95,queue=5,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536450772841: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:55:18 prod003 hbase[957]: 2018-09-09 02:55:18,793 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=97,queue=7,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod003,60020,1536450772841: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>>
>> >>>> prod004:
>> >>>> Sep 09 02:52:13 prod004 hbase[4890]: 2018-09-09 02:52:13,541 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=83,queue=3,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod004,60020,1536446387325: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:52:50 prod004 hbase[4890]: 2018-09-09 02:52:50,264 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=75,queue=5,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod004,60020,1536446387325: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:53:40 prod004 hbase[4890]: 2018-09-09 02:53:40,709 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=66,queue=6,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod004,60020,1536446387325: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:54:00 prod004 hbase[4890]: 2018-09-09 02:54:00,060 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=89,queue=9,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod004,60020,1536446387325: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>>
>> >>>> prod005:
>> >>>> Sep 09 02:52:50 prod005 hbase[3772]: 2018-09-09 02:52:50,661 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=65,queue=5,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod005,60020,1536446400009: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:53:27 prod005 hbase[3772]: 2018-09-09 02:53:27,542 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=90,queue=0,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod005,60020,1536446400009: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:54:00 prod005 hbase[3772]: 2018-09-09 02:53:59,915 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=7,queue=7,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod005,60020,1536446400009: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: 2018-09-09 02:54:30,058 FATAL
>> >>>> [RpcServer.default.FPBQ.Fifo.handler=16,queue=6,port=60020]
>> >>>> regionserver.HRegionServer: ABORTING region server
>> >>>> prod005,60020,1536446400009: Could not update the index table, killing
>> >>>> server region because couldn't write to an index table
>> >>>>
>> >>>> And so on...
>> >>>>
>> >>>> The trace is the same everywhere:
>> >>>>
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]:
>> >>>> org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException:
>> >>>> disableIndexOnFailure=true, Failed to write to multiple index tables:
>> >>>> [KM_IDX1, KM_IDX2, KM_HISTORY_IDX1, KM_HISTORY_IDX2, KM_HISTORY_IDX3]
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:235)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatch(UngroupedAggregateRegionObserver.java:271)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatchWithRetries(UngroupedAggregateRegionObserver.java:241)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.rebuildIndices(UngroupedAggregateRegionObserver.java:1068)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.doPostScannerOpen(UngroupedAggregateRegionObserver.java:386)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.overrideDelegate(BaseScannerRegionObserver.java:239)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.nextRaw(BaseScannerRegionObserver.java:287)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:2843)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3080)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36613)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2354)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297)
>> >>>> Sep 09 02:54:30 prod005 hbase[3772]: at
>> >>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277)
>> >>>>
>> >>>>> On 9 Sep 2018, at 01:44, Batyrshin Alexander <[email protected]> wrote:
>> >>>>>
>> >>>>> Thank you.
>> >>>>> We're updating our cluster right now...
>> >>>>>
>> >>>>>
>> >>>>>> On 9 Sep 2018, at 01:39, Ted Yu <[email protected]> wrote:
>> >>>>>>
>> >>>>>> It seems you should deploy hbase with the following fix:
>> >>>>>>
>> >>>>>> HBASE-21069 NPE in StoreScanner.updateReaders causes RS to crash
>> >>>>>>
>> >>>>>> 1.4.7 was recently released.
>> >>>>>>
>> >>>>>> FYI
>> >>>>>>
>> >>>>>> On Sat, Sep 8, 2018 at 3:32 PM Batyrshin Alexander <[email protected]> wrote:
>> >>>>>>
>> >>>>>> Hello,
>> >>>>>>
>> >>>>>> We got this exception from the *prod006* server:
>> >>>>>>
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: 2018-09-09 00:38:02,532
>> >>>>>> FATAL [MemStoreFlusher.1] regionserver.HRegionServer: ABORTING
>> >>>>>> region server prod006,60020,1536235102833: Replay of
>> >>>>>> WAL required. Forcing server shutdown
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]:
>> >>>>>> org.apache.hadoop.hbase.DroppedSnapshotException:
>> >>>>>> region:
>> >>>>>> KM,c\xEF\xBF\xBD\x16I7\xEF\xBF\xBD\x0A"A\xEF\xBF\xBDd\xEF\xBF\xBD\xEF\xBF\xBD\x19\x07t,1536178245576.60c121ba50e67f2429b9ca2ba2a11bad.
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2645)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2322)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2284)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2170)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:2095)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:508)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:478)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$900(MemStoreFlusher.java:76)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:264)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>> java.lang.Thread.run(Thread.java:748)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: Caused by:
>> >>>>>> java.lang.NullPointerException
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>> java.util.ArrayList.<init>(ArrayList.java:178)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.StoreScanner.updateReaders(StoreScanner.java:863)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.notifyChangedReadersObservers(HStore.java:1172)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.updateStorefiles(HStore.java:1145)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HStore.access$900(HStore.java:122)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2505)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2600)
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: ... 9 more
>> >>>>>> Sep 09 00:38:02 prod006 hbase[18907]: 2018-09-09 00:38:02,532
>> >>>>>> FATAL [MemStoreFlusher.1] regionserver.HRegionServer:
>> >>>>>> RegionServer abort: loaded coprocessors
>> >>>>>> are:
>> >>>>>> [org.apache.hadoop.hbase.regionserver.IndexHalfStoreFileReaderGenerator,
>> >>>>>> org.apache.phoenix.coprocessor.SequenceRegionObserver,
>> >>>>>> org.apache.phoenix.c
>> >>>>>>
>> >>>>>> After that we got ABORTING on almost every region server in the
>> >>>>>> cluster, with different reasons:
>> >>>>>>
>> >>>>>> *prod003*
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: 2018-09-09 01:12:11,799
>> >>>>>> FATAL [PostOpenDeployTasks:88bfac1dfd807c4cd1e9c1f31b4f053f]
>> >>>>>> regionserver.HRegionServer: ABORTING region
>> >>>>>> server prod003,60020,1536444066291: Exception running
>> >>>>>> postOpenDeployTasks; region=88bfac1dfd807c4cd1e9c1f31b4f053f
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]:
>> >>>>>> java.io.InterruptedIOException: #139, interrupted.
>> >>>>>> currentNumberOfTask=8
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForMaximumCurrentTasks(AsyncProcess.java:1853)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForMaximumCurrentTasks(AsyncProcess.java:1823)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.client.AsyncProcess.waitForAllPreviousOpsAndReset(AsyncProcess.java:1899)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.client.BufferedMutatorImpl.backgroundFlushCommits(BufferedMutatorImpl.java:250)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.client.BufferedMutatorImpl.flush(BufferedMutatorImpl.java:213)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>> org.apache.hadoop.hbase.client.HTable.flushCommits(HTable.java:1484)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>> org.apache.hadoop.hbase.client.HTable.put(HTable.java:1031)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.put(MetaTableAccessor.java:1033)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.putToMetaTable(MetaTableAccessor.java:1023)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.updateLocation(MetaTableAccessor.java:1433)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.MetaTableAccessor.updateRegionLocation(MetaTableAccessor.java:1400)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegionServer.postOpenDeployTasks(HRegionServer.java:2041)
>> >>>>>> Sep 09 01:12:11 prod003 hbase[11552]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.handler.OpenRegionHandler$PostOpenDeployTasksThread.run(OpenRegionHandler.java:329)
>> >>>>>>
>> >>>>>> *prod002*
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: 2018-09-09 01:12:30,144
>> >>>>>> FATAL
>> >>>>>> [RpcServer.default.FPBQ.Fifo.handler=36,queue=6,port=60020]
>> >>>>>> regionserver.HRegionServer: ABORTING region
>> >>>>>> server prod002,60020,1536235138673: Could not update the index
>> >>>>>> table, killing server region because couldn't write to an index
>> >>>>>> table
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]:
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.exception.MultiIndexWriteFailureException:
>> >>>>>> disableIndexOnFailure=true, Failed to write to multiple index
>> >>>>>> tables: [KM_IDX1, KM_IDX2, KM_HISTORY1, KM_HISTORY2,
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.write.TrackingParallelWriterIndexCommitter.write(TrackingParallelWriterIndexCommitter.java:235)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.write(IndexWriter.java:195)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:156)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.write.IndexWriter.writeAndKillYourselfOnFailure(IndexWriter.java:145)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.Indexer.doPostWithExceptions(Indexer.java:620)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>> org.apache.phoenix.hbase.index.Indexer.doPost(Indexer.java:595)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.hbase.index.Indexer.postBatchMutateIndispensably(Indexer.java:578)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$37.call(RegionCoprocessorHost.java:1048)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1711)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1789)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1745)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postBatchMutateIndispensably(RegionCoprocessorHost.java:1044)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3646)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3108)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:3050)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatch(UngroupedAggregateRegionObserver.java:271)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.access$000(UngroupedAggregateRegionObserver.java:164)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver$1.doMutation(UngroupedAggregateRegionObserver.java:246)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.index.PhoenixIndexFailurePolicy.doBatchWithRetries(PhoenixIndexFailurePolicy.java:455)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.handleIndexWriteException(UngroupedAggregateRegionObserver.java:929)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.commitBatchWithRetries(UngroupedAggregateRegionObserver.java:243)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.rebuildIndices(UngroupedAggregateRegionObserver.java:1077)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver.doPostScannerOpen(UngroupedAggregateRegionObserver.java:386)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.overrideDelegate(BaseScannerRegionObserver.java:239)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.phoenix.coprocessor.BaseScannerRegionObserver$RegionScannerHolder.nextRaw(BaseScannerRegionObserver.java:287)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:2843)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3080)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:36613)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>> org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2354)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>> org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:297)
>> >>>>>> Sep 09 01:12:30 prod002 hbase[29056]: at
>> >>>>>>
>> >>>>>> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:277)
>> >>>>>>
>> >>>>>>
>> >>>>>> And so on...
>> >>>>>>
>> >>>>>> The Master-status web interface shows that contact was lost with these
>> >>>>>> aborted servers.
>> >>>>>
>> >>>>
>> >
>