[ https://issues.apache.org/jira/browse/HBASE-19287?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16421416#comment-16421416 ]
Abhishek Kulkarni commented on HBASE-19287: ------------------------------------------- 2018-03-31 05:00:27,274 WARN [RSProcedureDispatcher-pool3-t10] assignment.RegionTransitionProcedure: Remote call failed pid=12, ppid=11, state=RUNNABLE:REGION_TRANSITION_DISPATCH; AssignProcedure table=hbase:meta, region=1588230740; rit=OPENING, location=abhishekk3.pne.ven.veritas.com,16020,1522486816915; exception=Call to abhishekk3.pne.ven.veritas.com/10.210.62.30:16020 failed on local exception: java.io.IOException: Can not send request because relogin is in progress. 2018-03-31 05:00:27,274 INFO [RSProcedureDispatcher-pool3-t10] assignment.AssignProcedure: Retry=10 of max=10; pid=12, ppid=11, state=RUNNABLE:REGION_TRANSITION_DISPATCH; AssignProcedure table=hbase:meta, region=1588230740; rit=OPENING, location=abhishekk3.pne.ven.veritas.com,16020,1522486816915 2018-03-31 05:00:27,275 INFO [PEWorker-7] assignment.AssignProcedure: Retry=11 of max=10; pid=12, ppid=11, state=RUNNABLE:REGION_TRANSITION_QUEUE; AssignProcedure table=hbase:meta, region=1588230740; rit=OFFLINE, location=null 2018-03-31 05:00:27,408 ERROR [PEWorker-7] procedure2.ProcedureExecutor: CODE-BUG: Uncaught runtime exception for pid=11, state=FAILED:RECOVER_META_ASSIGN_REGIONS, exception=org.apache.hadoop.hbase.client.RetriesExhaustedException via AssignProcedure:org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts exceeded; RecoverMetaProcedure failedMetaServer=null, splitWal=true java.lang.UnsupportedOperationException: unhandled state=RECOVER_META_ASSIGN_REGIONS at org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure.rollbackState(RecoverMetaProcedure.java:188) at org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure.rollbackState(RecoverMetaProcedure.java:53) at org.apache.hadoop.hbase.procedure2.StateMachineProcedure.rollback(StateMachineProcedure.java:199) at org.apache.hadoop.hbase.procedure2.Procedure.doRollback(Procedure.java:859) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeRollback(ProcedureExecutor.java:1353) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeRollback(ProcedureExecutor.java:1309) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1178) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$800(ProcedureExecutor.java:75) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:1740) 2018-03-31 05:00:27,413 ERROR [PEWorker-7] procedure2.ProcedureExecutor: CODE-BUG: Uncaught runtime exception for pid=11, state=FAILED:RECOVER_META_ASSIGN_REGIONS, exception=org.apache.hadoop.hbase.client.RetriesExhaustedException via AssignProcedure:org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts exceeded; RecoverMetaProcedure failedMetaServer=null, splitWal=true java.lang.UnsupportedOperationException: unhandled state=RECOVER_META_ASSIGN_REGIONS at org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure.rollbackState(RecoverMetaProcedure.java:188) at org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure.rollbackState(RecoverMetaProcedure.java:53) at org.apache.hadoop.hbase.procedure2.StateMachineProcedure.rollback(StateMachineProcedure.java:199) at org.apache.hadoop.hbase.procedure2.Procedure.doRollback(Procedure.java:859) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeRollback(ProcedureExecutor.java:1353) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeRollback(ProcedureExecutor.java:1309) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1178) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$800(ProcedureExecutor.java:75) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:1740) 2018-03-31 05:00:27,423 INFO [PEWorker-7] procedure2.ProcedureExecutor: Rolled back pid=11, state=ROLLEDBACK, exception=org.apache.hadoop.hbase.client.RetriesExhaustedException via AssignProcedure:org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts exceeded; RecoverMetaProcedure failedMetaServer=null, splitWal=true exec-time=5.1550sec 2018-03-31 05:00:27,423 ERROR [master/abhishekk1:16000] master.HMaster: Failed to become active master org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts exceeded at org.apache.hadoop.hbase.master.assignment.AssignProcedure.startTransition(AssignProcedure.java:181) at org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure.execute(RegionTransitionProcedure.java:295) at org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure.execute(RegionTransitionProcedure.java:85) at org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:845) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1452) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1221) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$800(ProcedureExecutor.java:75) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:1740) 2018-03-31 05:00:27,424 ERROR [master/abhishekk1:16000] master.HMaster: Master server abort: loaded coprocessors are: [org.apache.hadoop.hbase.security.access.AccessController] 2018-03-31 05:00:27,424 ERROR [master/abhishekk1:16000] master.HMaster: ***** ABORTING master abhishekk1.pne.ven.veritas.com,16000,1522486813076: Unhandled exception. Starting shutdown. ***** org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts exceeded at org.apache.hadoop.hbase.master.assignment.AssignProcedure.startTransition(AssignProcedure.java:181) at org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure.execute(RegionTransitionProcedure.java:295) at org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure.execute(RegionTransitionProcedure.java:85) at org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:845) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1452) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1221) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$800(ProcedureExecutor.java:75) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:1740) 2018-03-31 05:00:27,426 INFO [master/abhishekk1:16000] regionserver.HRegionServer: ***** STOPPING region server 'abhishekk1.pne.ven.veritas.com,16000,1522486813076' ***** 2018-03-31 05:00:27,426 INFO [master/abhishekk1:16000] regionserver.HRegionServer: STOPPED: Stopped by master/abhishekk1:16000 2018-03-31 05:00:27,431 DEBUG [main-EventThread] zookeeper.ZKWatcher: master:16000-0x2000003436b0000, quorum=abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181, baseZNode=/hbase-secure Received ZooKeeper Event, type=NodeDeleted, state=SyncConnected, path=/hbase-secure/master 2018-03-31 05:00:27,431 DEBUG [main-EventThread] zookeeper.ZKUtil: master:16000-0x2000003436b0000, quorum=abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181, baseZNode=/hbase-secure Set watcher on znode that does not yet exist, /hbase-secure/master 2018-03-31 05:00:27,890 INFO [master/abhishekk1:16000.splitLogManager..Chore.1] hbase.ScheduledChore: Chore: SplitLogManager Timeout Monitor was stopped 2018-03-31 05:00:30,434 INFO [master/abhishekk1:16000] ipc.NettyRpcServer: Stopping server on /10.210.62.28:16000 2018-03-31 05:00:30,434 INFO [master/abhishekk1:16000] token.AuthenticationTokenSecretManager: Stopping leader election, because: SecretManager stopping 2018-03-31 05:00:30,441 INFO [master/abhishekk1:16000] regionserver.HRegionServer: Stopping infoServer 2018-03-31 05:00:30,464 INFO [master/abhishekk1:16000] handler.ContextHandler: Stopped o.e.j.w.WebAppContext@1f03fba0\{/,null,UNAVAILABLE}{file:/home/hbase2/hbase-2.0.0-beta-2/hbase-webapps/master} 2018-03-31 05:00:30,469 INFO [master/abhishekk1:16000] server.AbstractConnector: Stopped ServerConnector@2abe9173\{HTTP/1.1,[http/1.1]}{0.0.0.0:16010} 2018-03-31 05:00:30,469 INFO [master/abhishekk1:16000] handler.ContextHandler: Stopped o.e.j.s.ServletContextHandler@20ab3e3a\{/static,file:///home/hbase2/hbase-2.0.0-beta-2/hbase-webapps/static/,UNAVAILABLE} 2018-03-31 05:00:30,469 INFO [master/abhishekk1:16000] handler.ContextHandler: Stopped o.e.j.s.ServletContextHandler@4e1a46fb\{/logs,file:///home/hbase2/hbase-2.0.0-beta-2/logs/,UNAVAILABLE} 2018-03-31 05:00:30,470 INFO [master/abhishekk1:16000] regionserver.HRegionServer: stopping server abhishekk1.pne.ven.veritas.com,16000,1522486813076 2018-03-31 05:00:30,471 INFO [master/abhishekk1:16000] zookeeper.ReadOnlyZKClient: Close zookeeper connection 0x293d3749 to abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181 2018-03-31 05:00:30,471 INFO [master/abhishekk1:16000] regionserver.HRegionServer: stopping server abhishekk1.pne.ven.veritas.com,16000,1522486813076; all regions closed. 2018-03-31 05:00:30,472 INFO [master/abhishekk1:16000] hbase.ChoreService: Chore service for: master/abhishekk1:16000 had [[ScheduledChore: Name: ReplicationZKNodeCleanerChore Period: 600000 Unit: MILLISECONDS], [ScheduledChore: Name: HFileCleaner Period: 600000 Unit: MILLISECONDS], [ScheduledChore: Name: LogsCleaner Period: 600000 Unit: MILLISECONDS]] on shutdown 2018-03-31 05:00:30,474 DEBUG [master/abhishekk1:16000] zookeeper.ZKUtil: master:16000-0x2000003436b0000, quorum=abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181, baseZNode=/hbase-secure Unable to get data of znode /hbase-secure/master because node does not exist (not an error) 2018-03-31 05:00:30,475 WARN [master/abhishekk1:16000] master.ActiveMasterManager: Failed get of master address: java.io.IOException: Can't get master address from ZooKeeper; znode data == null 2018-03-31 05:00:30,475 INFO [master/abhishekk1:16000] assignment.AssignmentManager: Stopping assignment manager 2018-03-31 05:00:30,475 INFO [master/abhishekk1:16000] procedure2.RemoteProcedureDispatcher: Stopping procedure remote dispatcher 2018-03-31 05:00:30,475 INFO [ReadOnlyZKClient-abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181@0x293d3749] zookeeper.ZooKeeper: Session: 0x1000004c8490001 closed 2018-03-31 05:00:30,476 INFO [ReadOnlyZKClient-abhishekk1.pne.ven.veritas.com:2181,abhishekk2.pne.ven.veritas.com:2181,abhishekk3.pne.ven.veritas.com:2181@0x293d3749-EventThread] zookeeper.ClientCnxn: EventThread shut down for session: 0x1000004c8490001 2018-03-31 05:00:30,476 INFO [master/abhishekk1:16000] procedure2.ProcedureExecutor: Stopping 2018-03-31 05:00:30,482 INFO [master/abhishekk1:16000] wal.WALProcedureStore: Stopping the WAL Procedure Store, isAbort=false 2018-03-31 05:00:30,504 INFO [master/abhishekk1:16000] hbase.ChoreService: Chore service for: master/abhishekk1:16000.splitLogManager. had [] on shutdown 2018-03-31 05:00:30,505 INFO [master/abhishekk1:16000] flush.MasterFlushTableProcedureManager: stop: server shutting down. 2018-03-31 05:00:30,505 ERROR [master/abhishekk1:16000] access.TableAuthManager: Something wrong with the TableAuthManager reference counting: org.apache.hadoop.hbase.security.access.TableAuthManager@10afe71a whose count is null 2018-03-31 05:00:30,512 INFO [master/abhishekk1:16000] zookeeper.ZooKeeper: Session: 0x2000003436b0000 closed 2018-03-31 05:00:30,512 INFO [master/abhishekk1:16000] regionserver.HRegionServer: Exiting; stopping=abhishekk1.pne.ven.veritas.com,16000,1522486813076; zookeeper connection closed. 2018-03-31 05:00:30,512 INFO [main-EventThread] zookeeper.ClientCnxn: EventThread shut down for session: 0x2000003436b0000 > master hangs forever if RecoverMeta send assign meta region request to target > server fail > ----------------------------------------------------------------------------------------- > > Key: HBASE-19287 > URL: https://issues.apache.org/jira/browse/HBASE-19287 > Project: HBase > Issue Type: Bug > Components: proc-v2 > Affects Versions: 2.0.0 > Reporter: Yi Liang > Assignee: Yi Liang > Priority: Major > Fix For: 2.0.0-beta-1, 2.0.0 > > Attachments: HBASE-19287-master-v3.patch, > HBASE-19287-master-v3.patch, HBASE-19287-master-v4.patch, > hbase-19287-master-v2.patch, master.patch > > > 2017-11-10 19:26:56,019 INFO [ProcExecWrkr-1] > procedure.RecoverMetaProcedure: pid=138, > state=RUNNABLE:RECOVER_META_ASSIGN_REGIONS; RecoverMetaProcedure > failedMetaServer=null, splitWal=true; Retaining meta assignment to > server=hadoop-slave1.hadoop,16020,1510341981454 > 2017-11-10 19:26:56,029 INFO [ProcExecWrkr-1] procedure2.ProcedureExecutor: > Initialized subprocedures=[{pid=139, ppid=138, > state=RUNNABLE:REGION_TRANSITION_QUEUE; AssignProcedure table=hbase:meta, > region=1588230740, target=hadoop-slave1.hadoop,16020,1510341981454}] > 2017-11-10 19:26:56,067 INFO [ProcExecWrkr-2] > procedure.MasterProcedureScheduler: pid=139, ppid=138, > state=RUNNABLE:REGION_TRANSITION_QUEUE; AssignProcedure table=hbase:meta, > region=1588230740, target=hadoop-slave1.hadoop,16020,1510341981454 hbase:meta > hbase:meta,,1.1588230740 > 2017-11-10 19:26:56,071 INFO [ProcExecWrkr-2] assignment.AssignProcedure: > Start pid=139, ppid=138, state=RUNNABLE:REGION_TRANSITION_QUEUE; > AssignProcedure table=hbase:meta, region=1588230740, > target=hadoop-slave1.hadoop,16020,1510341981454; rit=OFFLINE, > location=hadoop-slave1.hadoop,16020,1510341981454; forceNewPlan=false, > retain=false > 2017-11-10 19:26:56,224 INFO [ProcExecWrkr-4] zookeeper.MetaTableLocator: > Setting hbase:meta (replicaId=0) location in ZooKeeper as > hadoop-slave2.hadoop,16020,1510341988652 > 2017-11-10 19:26:56,230 INFO [ProcExecWrkr-4] > assignment.RegionTransitionProcedure: Dispatch pid=139, ppid=138, > state=RUNNABLE:REGION_TRANSITION_DISPATCH; AssignProcedure table=hbase:meta, > region=1588230740, target=hadoop-slave1.hadoop,16020,1510341981454; > rit=OPENING, location=hadoop-slave2.hadoop,16020,1510341988652 > 2017-11-10 19:26:56,382 INFO [ProcedureDispatcherTimeoutThread] > procedure.RSProcedureDispatcher: Using procedure batch rpc execution for > serverName=hadoop-slave2.hadoop,16020,1510341988652 version=2097152 > 2017-11-10 19:26:57,542 INFO [main-EventThread] > zookeeper.RegionServerTracker: RegionServer ephemeral node deleted, > processing expiration [hadoop-slave2.hadoop,16020,1510341988652] > 2017-11-10 19:26:57,543 INFO [main-EventThread] master.ServerManager: Master > doesn't enable ServerShutdownHandler during initialization, delay expiring > server hadoop-slave2.hadoop,16020,1510341988652 > 2017-11-10 19:26:58,875 INFO > [RpcServer.default.FPBQ.Fifo.handler=29,queue=2,port=16000] > master.ServerManager: Registering > server=hadoop-slave1.hadoop,16020,1510342016106 > 2017-11-10 19:27:05,832 INFO > [RpcServer.default.FPBQ.Fifo.handler=29,queue=2,port=16000] > master.ServerManager: Registering > server=hadoop-slave2.hadoop,16020,1510342023184 > 2017-11-10 19:27:05,832 INFO > [RpcServer.default.FPBQ.Fifo.handler=29,queue=2,port=16000] > master.ServerManager: Triggering server recovery; existingServer > hadoop-slave2.hadoop,16020,1510341988652 looks stale, new > server:hadoop-slave2.hadoop,16020,1510342023184 > 2017-11-10 19:27:05,832 INFO > [RpcServer.default.FPBQ.Fifo.handler=29,queue=2,port=16000] > master.ServerManager: Master doesn't enable ServerShutdownHandler during > initialization, delay expiring server hadoop-slave2.hadoop,16020,1510341988652 > 2017-11-10 19:27:49,815 INFO > [RpcServer.default.FPBQ.Fifo.handler=29,queue=2,port=16000] > client.RpcRetryingCallerImpl: tarted=38594 ms ago, cancelled=false, > msg=org.apache.hadoop.hbase.NotServingRegionException: hbase:meta,,1 is not > online on hadoop-slave2.hadoop,16020,1510342023184 > at > org.apache.hadoop.hbase.regionserver.HRegionServer.getRegionByEncodedName(HRegionServer.java:3290) > at > org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1370) > at > org.apache.hadoop.hbase.regionserver.RSRpcServices.get(RSRpcServices.java:2401) > at > org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:41544) > at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:406) > at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133) > at > org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:278) > at > org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:258) > row 'hbase:namespace' on table 'hbase:meta' at > region=hbase:meta,,1.1588230740, > hostname=hadoop-slave2.hadoop,16020,1510341988652, seqNum=0 -- This message was sent by Atlassian JIRA (v7.6.3#76005)