[ 
https://issues.apache.org/jira/browse/HBASE-9746?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14105843#comment-14105843
 ] 

Andrew Purtell commented on HBASE-9746:
---------------------------------------

I think we should allow the source RS to come up, with continued attempts to 
start replication and WARN-level logging at each failed attempt. This means we 
need to handle UnknownHostException and related exceptions. We could make 
the retry interval configurable; using the WAL roll period as its default 
value could be a reasonable thing to do. 

> RegionServer can't start when replication tries to replicate to an unknown host
> -------------------------------------------------------------------------------
>
>                 Key: HBASE-9746
>                 URL: https://issues.apache.org/jira/browse/HBASE-9746
>             Project: HBase
>          Issue Type: Bug
>    Affects Versions: 0.94.12
>            Reporter: Lars Hofhansl
>            Priority: Minor
>             Fix For: 0.99.0, 2.0.0, 0.98.7, 0.94.24
>
>
> Just ran into this:
> {code}
> 13/10/11 00:37:02 [regionserver60020] WARN  zookeeper.ZKConfig(204): 
> java.net.UnknownHostException: <old-host>: Name or service not known
>       at java.net.Inet6AddressImpl.lookupAllHostAddr(Native Method)
>       at java.net.InetAddress$1.lookupAllHostAddr(InetAddress.java:894)
>       at 
> java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1286)
>       at java.net.InetAddress.getAllByName0(InetAddress.java:1239)
>       at java.net.InetAddress.getAllByName(InetAddress.java:1155)
>       at java.net.InetAddress.getAllByName(InetAddress.java:1091)
>       at java.net.InetAddress.getByName(InetAddress.java:1041)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZKConfig.getZKQuorumServersString(ZKConfig.java:201)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZKConfig.getZKQuorumServersString(ZKConfig.java:245)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:147)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:127)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.reloadZkWatcher(ReplicationPeer.java:170)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.<init>(ReplicationPeer.java:69)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.getPeer(ReplicationZookeeper.java:343)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectToPeer(ReplicationZookeeper.java:308)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectExistingPeers(ReplicationZookeeper.java:189)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.<init>(ReplicationZookeeper.java:156)
>       at 
> org.apache.hadoop.hbase.replication.regionserver.Replication.initialize(Replication.java:89)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.newReplicationInstance(HRegionServer.java:3986)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.createNewReplicationInstance(HRegionServer.java:3955)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.setupWALAndReplication(HRegionServer.java:1412)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.handleReportForDutyResponse(HRegionServer.java:1096)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:749)
>       at java.lang.Thread.run(Thread.java:722)
> 13/10/11 00:37:02 [regionserver60020] ERROR zookeeper.ZKConfig(210): no valid 
> quorum servers found in zoo.cfg
> 13/10/11 00:37:02 [regionserver60020] WARN  regionserver.HRegionServer(1108): 
> Exception in region server : 
> java.io.IOException: Unable to determine ZooKeeper ensemble
>       at org.apache.hadoop.hbase.zookeeper.ZKUtil.connect(ZKUtil.java:116)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:153)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:127)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.reloadZkWatcher(ReplicationPeer.java:170)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.<init>(ReplicationPeer.java:69)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.getPeer(ReplicationZookeeper.java:343)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectToPeer(ReplicationZookeeper.java:308)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectExistingPeers(ReplicationZookeeper.java:189)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.<init>(ReplicationZookeeper.java:156)
>       at 
> org.apache.hadoop.hbase.replication.regionserver.Replication.initialize(Replication.java:89)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.newReplicationInstance(HRegionServer.java:3986)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.createNewReplicationInstance(HRegionServer.java:3955)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.setupWALAndReplication(HRegionServer.java:1412)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.handleReportForDutyResponse(HRegionServer.java:1096)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:749)
>       at java.lang.Thread.run(Thread.java:722)
> 13/10/11 00:37:02 [regionserver60020] INFO  regionserver.HRegionServer(1823): 
> STOPPED: Failed initialization
> 13/10/11 00:37:02 [regionserver60020] ERROR regionserver.HRegionServer(1228): 
> Failed init
> java.io.IOException: Unable to determine ZooKeeper ensemble
>       at org.apache.hadoop.hbase.zookeeper.ZKUtil.connect(ZKUtil.java:116)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:153)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:127)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.reloadZkWatcher(ReplicationPeer.java:170)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.<init>(ReplicationPeer.java:69)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.getPeer(ReplicationZookeeper.java:343)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectToPeer(ReplicationZookeeper.java:308)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectExistingPeers(ReplicationZookeeper.java:189)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.<init>(ReplicationZookeeper.java:156)
>       at 
> org.apache.hadoop.hbase.replication.regionserver.Replication.initialize(Replication.java:89)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.newReplicationInstance(HRegionServer.java:3986)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.createNewReplicationInstance(HRegionServer.java:3955)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.setupWALAndReplication(HRegionServer.java:1412)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.handleReportForDutyResponse(HRegionServer.java:1096)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:749)
>       at java.lang.Thread.run(Thread.java:722)
> 13/10/11 00:37:02 [regionserver60020] FATAL regionserver.HRegionServer(1898): 
> ABORTING region server XXXXXXXX,60020,1381451821216: Unhandled exception: 
> Unable to determine ZooKeeper ensemble
> java.io.IOException: Unable to determine ZooKeeper ensemble
>       at org.apache.hadoop.hbase.zookeeper.ZKUtil.connect(ZKUtil.java:116)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:153)
>       at 
> org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.<init>(ZooKeeperWatcher.java:127)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.reloadZkWatcher(ReplicationPeer.java:170)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationPeer.<init>(ReplicationPeer.java:69)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.getPeer(ReplicationZookeeper.java:343)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectToPeer(ReplicationZookeeper.java:308)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.connectExistingPeers(ReplicationZookeeper.java:189)
>       at 
> org.apache.hadoop.hbase.replication.ReplicationZookeeper.<init>(ReplicationZookeeper.java:156)
>       at 
> org.apache.hadoop.hbase.replication.regionserver.Replication.initialize(Replication.java:89)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.newReplicationInstance(HRegionServer.java:3986)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.createNewReplicationInstance(HRegionServer.java:3955)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.setupWALAndReplication(HRegionServer.java:1412)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.handleReportForDutyResponse(HRegionServer.java:1096)
>       at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:749)
>       at java.lang.Thread.run(Thread.java:722)
> {code}



--
This message was sent by Atlassian JIRA
(v6.2#6252)

Reply via email to