[ https://issues.apache.org/jira/browse/ZOOKEEPER-2355?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16085153#comment-16085153 ]
ASF GitHub Bot commented on ZOOKEEPER-2355: ------------------------------------------- Github user hanm commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/304#discussion_r127123221 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/EphemeralNodeDeletionTest.java --- @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.zookeeper.server.quorum; + +import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import java.io.IOException; +import java.net.SocketTimeoutException; + +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.PortAssignment; +import org.apache.zookeeper.ZooDefs.Ids; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.data.Stat; +import org.apache.zookeeper.server.persistence.FileTxnSnapLog; +import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState; +import org.apache.zookeeper.test.ClientBase; +import org.apache.zookeeper.test.ClientBase.CountdownWatcher; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import javax.security.sasl.SaslException; + +public class EphemeralNodeDeletionTest extends QuorumPeerTestBase { + private static int SERVER_COUNT = 3; + private MainThread[] mt = new MainThread[SERVER_COUNT]; + + /** + * Test case for https://issues.apache.org/jira/browse/ZOOKEEPER-2355. + * ZooKeeper ephemeral node is never deleted if follower fail while reading + * the proposal packet. + */ + + @Test(timeout = 120000) + public void testEphemeralNodeDeletion() throws Exception { + final int clientPorts[] = new int[SERVER_COUNT]; + StringBuilder sb = new StringBuilder(); + String server; + + for (int i = 0; i < SERVER_COUNT; i++) { + clientPorts[i] = PortAssignment.unique(); + server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + + ":" + PortAssignment.unique(); + sb.append(server + "\n"); + } + String currentQuorumCfgSection = sb.toString(); + System.out.println(currentQuorumCfgSection); + // start all the servers + for (int i = 0; i < SERVER_COUNT; i++) { + mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection) { + @Override + public TestQPMain getTestQPMain() { + return new MockTestQPMain(); + } + }; + mt[i].start(); + } + + // ensure all servers started + for (int i = 0; i < SERVER_COUNT; i++) { + Assert.assertTrue("waiting for server " + i + " being up", + ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[i], + CONNECTION_TIMEOUT)); + } + + CountdownWatcher watch = new CountdownWatcher(); + ZooKeeper zk = new ZooKeeper("127.0.0.1:" + clientPorts[1], + ClientBase.CONNECTION_TIMEOUT, watch); + watch.waitForConnected(ClientBase.CONNECTION_TIMEOUT); + + /** + * now the problem scenario starts + */ + + // 1: create ephemeral node + String nodePath = "/e1"; + zk.create(nodePath, "1".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); + + // 2: inject network problem in one of the follower + CustomQuorumPeer follower = (CustomQuorumPeer) getByServerState(mt, + ServerState.FOLLOWING); + follower.setInjectError(true); + + // 3: close the session so that ephemeral node is deleted + zk.close(); + + // remove the error + follower.setInjectError(false); + + Assert.assertTrue("Faulted Follower should have joined quorum by now", + ClientBase.waitForServerUp( + "127.0.0.1:" + follower.getClientPort(), + CONNECTION_TIMEOUT)); + + QuorumPeer leader = getByServerState(mt, ServerState.LEADING); + assertNotNull("Leader should not be null", leader); + Assert.assertTrue("Leader must be running", ClientBase.waitForServerUp( + "127.0.0.1:" + leader.getClientPort(), CONNECTION_TIMEOUT)); + + watch = new CountdownWatcher(); + zk = new ZooKeeper("127.0.0.1:" + leader.getClientPort(), + ClientBase.CONNECTION_TIMEOUT, watch); + watch.waitForConnected(ClientBase.CONNECTION_TIMEOUT); + + Stat exists = zk.exists(nodePath, false); + assertNull("Node must have been deleted from leader", exists); + + CountdownWatcher followerWatch = new CountdownWatcher(); + ZooKeeper followerZK = new ZooKeeper( + "127.0.0.1:" + follower.getClientPort(), + ClientBase.CONNECTION_TIMEOUT, followerWatch); + followerWatch.waitForConnected(ClientBase.CONNECTION_TIMEOUT); + Stat nodeAtFollower = followerZK.exists(nodePath, false); + + // Problem 1: Follower had one extra ephemeral node /e1 + assertNull("ephemeral node must not exist", nodeAtFollower); + + // Create the node with another session + zk.create(nodePath, "2".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); + + // close the session and newly created ephemeral node should be deleted + zk.close(); + + nodeAtFollower = followerZK.exists(nodePath, false); + + // Problem 2: Before fix, after session close the ephemeral node + // was not getting deleted. But now after the fix after session close + // ephemeral node is getting deleted. + assertNull("After session close ephemeral node must be deleted", + nodeAtFollower); + followerZK.close(); + } + + @After + public void tearDown() { + // stop all severs + for (int i = 0; i < mt.length; i++) { + try { + mt[i].shutdown(); + } catch (InterruptedException e) { + LOG.warn("Quorum Peer interrupted while shutting it down", e); + } + } + } + + private QuorumPeer getByServerState(MainThread[] mt, ServerState state) { + for (int i = mt.length - 1; i >= 0; i--) { + QuorumPeer quorumPeer = mt[i].getQuorumPeer(); + if (null != quorumPeer && state == quorumPeer.getPeerState()) { + return quorumPeer; + } + } + return null; + } + + static class CustomQuorumPeer extends QuorumPeer { + private boolean injectError = false; + + public CustomQuorumPeer() throws SaslException { + } + + @Override + protected Follower makeFollower(FileTxnSnapLog logFactory) + throws IOException { + return new Follower(this, new FollowerZooKeeperServer(logFactory, + this, null /*DataTreeBuilder is never used*/, + this.getZkDb())) { + + @Override + void readPacket(QuorumPacket pp) throws IOException { + /** + * In real scenario got SocketTimeoutException while reading + * the packet from leader because of network problem, but + * here throwing SocketTimeoutException based on whether + * error is injected or not + */ + super.readPacket(pp); + if (injectError && pp.getType() == Leader.PROPOSAL) { + String type = LearnerHandler.packetToString(pp); + throw new SocketTimeoutException( + "Socket timeout while reading the packet for operation " + + type); + } + } + + }; + } + + public void setInjectError(boolean injectError) { + this.injectError = injectError; + } + + } + + static class MockTestQPMain extends TestQPMain { + @Override + protected QuorumPeer getQuorumPeer() throws SaslException { + return new CustomQuorumPeer(); + } + } +} --- End diff -- I'll update once I collect all feedbacks. > Ephemeral node is never deleted if follower fails while reading the proposal > packet > ----------------------------------------------------------------------------------- > > Key: ZOOKEEPER-2355 > URL: https://issues.apache.org/jira/browse/ZOOKEEPER-2355 > Project: ZooKeeper > Issue Type: Bug > Components: quorum, server > Affects Versions: 3.4.8, 3.4.9, 3.4.10, 3.5.1, 3.5.2, 3.5.3 > Reporter: Mohammad Arshad > Assignee: Mohammad Arshad > Priority: Critical > Attachments: ZOOKEEPER-2355-01.patch, ZOOKEEPER-2355-02.patch, > ZOOKEEPER-2355-03.patch, ZOOKEEPER-2355-04.patch, ZOOKEEPER-2355-05.patch > > > ZooKeeper ephemeral node is never deleted if follower fail while reading the > proposal packet > The scenario is as follows: > # Configure three node ZooKeeper cluster, lets say nodes are A, B and C, > start all, assume A is leader, B and C are follower > # Connect to any of the server and create ephemeral node /e1 > # Close the session, ephemeral node /e1 will go for deletion > # While receiving delete proposal make Follower B to fail with > {{SocketTimeoutException}}. This we need to do to reproduce the scenario > otherwise in production environment it happens because of network fault. > # Remove the fault, just check that faulted Follower is now connected with > quorum > # Connect to any of the server, create the same ephemeral node /e1, created > is success. > # Close the session, ephemeral node /e1 will go for deletion > # {color:red}/e1 is not deleted from the faulted Follower B, It should have > been deleted as it was again created with another session{color} > # {color:green}/e1 is deleted from Leader A and other Follower C{color} -- This message was sent by Atlassian JIRA (v6.4.14#64029)