This is an automated email from the ASF dual-hosted git repository. andor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/zookeeper.git
The following commit(s) were added to refs/heads/master by this push: new a89c094 Revert "ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time" a89c094 is described below commit a89c0942e45bb16e5282eee9d3a56ebbddbaae15 Author: Andor Molnar <an...@apache.org> AuthorDate: Wed Jul 31 11:02:00 2019 +0200 Revert "ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time" This reverts commit 05ee9413e7a31703395b81fb8d72baf1cb09a46d. --- .../src/main/resources/markdown/zookeeperAdmin.md | 12 ------ .../zookeeper/server/quorum/QuorumCnxManager.java | 46 ++++++---------------- .../zookeeper/server/quorum/CnxManagerTest.java | 29 -------------- 3 files changed, 13 insertions(+), 74 deletions(-) diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md index 835e357..5c71832 100644 --- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md +++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md @@ -1076,18 +1076,6 @@ As an example, this will enable all four letter word commands: properly, check your operating system's options regarding TCP keepalive for more information. Defaults to **false**. - -* *zookeeper.electionPortBindRetry* : - (Java system property only: **zookeeper.electionPortBindRetry**) - Property set max retry count when Zookeeper server fails to bind - leader election port. Such errors can be temporary and recoverable, - such as DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320), - or non-retryable, such as port already in use. - In case of transient errors, this property can improve availability - of Zookeeper server and help it to self recover. - Default value 3. In container environment, especially in Kubernetes, - this value should be increased to overcome issues related to DNS name resolving. - * *observer.reconnectDelayMs* : (Java system property: **zookeeper.observer.reconnectDelayMs**) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java index 4be8fa6..d97da2a 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java @@ -18,8 +18,6 @@ package org.apache.zookeeper.server.quorum; -import static org.apache.zookeeper.common.NetUtils.formatInetAddr; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; @@ -38,7 +36,6 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashSet; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ConcurrentHashMap; @@ -46,20 +43,24 @@ import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import javax.net.ssl.SSLSocket; + import org.apache.zookeeper.common.X509Exception; import org.apache.zookeeper.server.ExitCode; -import org.apache.zookeeper.server.ZooKeeperThread; import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException; +import org.apache.zookeeper.server.util.ConfigUtils; +import org.apache.zookeeper.server.ZooKeeperThread; import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner; import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer; import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier; -import org.apache.zookeeper.server.util.ConfigUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.net.ssl.SSLSocket; +import static org.apache.zookeeper.common.NetUtils.formatInetAddr; + /** * This class implements a connection manager for leader election using TCP. It * maintains one connection for every pair of servers. The tricky part is to @@ -847,30 +848,12 @@ public class QuorumCnxManager { */ public class Listener extends ZooKeeperThread { - private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry"; - private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3; - - private final int portBindMaxRetry; volatile ServerSocket ss = null; public Listener() { // During startup of thread, thread name will be overridden to // specific election address super("ListenerThread"); - - // maximum retry count while trying to bind to election port - // see ZOOKEEPER-3320 for more details - final Integer maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY, - DEFAULT_PORT_BIND_MAX_RETRY); - if (maxRetry >= 0) { - LOG.info("Election port bind maximum retries is {}", maxRetry); - portBindMaxRetry = maxRetry; - } else { - LOG.info("'{}' contains invalid value: {}(must be >= 0). " - + "Use default value of {} instead.", - ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY); - portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY; - } } /** @@ -882,7 +865,7 @@ public class QuorumCnxManager { InetSocketAddress addr; Socket client = null; Exception exitException = null; - while((!shutdown) && (numRetries < portBindMaxRetry)){ + while((!shutdown) && (numRetries < 3)){ try { if (self.shouldUsePortUnification()) { LOG.info("Creating TLS-enabled quorum server socket"); @@ -952,14 +935,11 @@ public class QuorumCnxManager { } LOG.info("Leaving listener"); if (!shutdown) { - LOG.error("As I'm leaving the listener thread after " - + numRetries + " errors. " - + "I won't be able to participate in leader " - + "election any longer: " - + formatInetAddr(self.getElectionAddress()) - + ". Use " + ELECTION_PORT_BIND_RETRY + " property to " - + "increase retry count."); - if (exitException instanceof SocketException) { + LOG.error("As I'm leaving the listener thread, " + + "I won't be able to participate in leader " + + "election any longer: " + + formatInetAddr(self.getElectionAddress())); + if (exitException instanceof BindException) { // After leaving listener thread, the host cannot join the // quorum anymore, this is a severe error that we cannot // recover from, so we need to exit diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java index 200ed99..878e41b 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java @@ -291,35 +291,6 @@ public class CnxManagerTest extends ZKTestCase { } /** - * Test for bug described in {@link https://issues.apache.org/jira/browse/ZOOKEEPER-3320}. - * Test create peer with address which contains unresolvable DNS name, - * leader election listener thread should stop after N errors. - * - * @throws Exception - */ - @Test - public void testCnxManagerListenerThreadConfigurableRetry() throws Exception { - final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>(); - final long myid = 1L; - unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181")); - final QuorumPeer peer = new QuorumPeer(unresolvablePeers, - ClientBase.createTmpDir(), - ClientBase.createTmpDir(), - 2181, 3, myid, 1000, 2, 2, 2); - final QuorumCnxManager cnxManager = peer.createCnxnManager(); - QuorumCnxManager.Listener listener = cnxManager.listener; - listener.start(); - // listener thread should stop and throws error which notify QuorumPeer about error. - // QuorumPeer should start shutdown process - listener.join(15000); // set wait time, if listener contains bug and thread not stops. - Assert.assertFalse(listener.isAlive()); - Assert.assertFalse(peer.isRunning()); - peer.join(15000); - Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped after " - + "listener thread death", listener.isAlive()); - } - - /** * Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6 * observer connects to a 3.5.0 server. * see https://issues.apache.org/jira/browse/ZOOKEEPER-1789