Repository: activemq-artemis
Updated Branches:
  refs/heads/master d1c9bc0f2 -> a076c50e5


https://issues.apache.org/jira/browse/ARTEMIS-1565

ARTEMIS-1565 - replica should retry quorum vote

https://issues.apache.org/jira/browse/ARTEMIS-1565


Project: http://git-wip-us.apache.org/repos/asf/activemq-artemis/repo
Commit: http://git-wip-us.apache.org/repos/asf/activemq-artemis/commit/6067a285
Tree: http://git-wip-us.apache.org/repos/asf/activemq-artemis/tree/6067a285
Diff: http://git-wip-us.apache.org/repos/asf/activemq-artemis/diff/6067a285

Branch: refs/heads/master
Commit: 6067a285bdbb7e33ad427bf7ea911cc547b1717c
Parents: d1c9bc0
Author: Andy Taylor <andy.tayl...@gmail.com>
Authored: Tue Dec 19 11:14:50 2017 +0000
Committer: Andy Taylor <andy.tayl...@gmail.com>
Committed: Tue Dec 19 11:30:20 2017 +0000

----------------------------------------------------------------------
 .../config/ActiveMQDefaultConfiguration.java    | 13 +++++
 .../artemis/core/config/ConfigurationUtils.java |  4 +-
 .../config/ha/ReplicaPolicyConfiguration.java   | 20 +++++++
 .../ha/ReplicatedPolicyConfiguration.java       | 21 +++++++
 .../deployers/impl/FileConfigurationParser.java |  8 +++
 .../core/server/cluster/ha/ReplicaPolicy.java   | 28 +++++++++-
 .../server/cluster/ha/ReplicatedPolicy.java     | 16 +++++-
 .../core/server/cluster/impl/BridgeImpl.java    |  3 +-
 .../server/cluster/qourum/QuorumManager.java    | 16 ++++++
 .../qourum/SharedNothingBackupQuorum.java       | 58 ++++++++++++++++----
 .../impl/SharedNothingBackupActivation.java     |  2 +-
 .../resources/schema/artemis-configuration.xsd  | 30 ++++++++++
 docs/user-manual/en/network-isolation.md        | 17 +++++-
 13 files changed, 216 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java
----------------------------------------------------------------------
diff --git 
a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java
 
b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java
index a409ffb..9b5d75d 100644
--- 
a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java
+++ 
b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java
@@ -485,6 +485,12 @@ public final class ActiveMQDefaultConfiguration {
 
    public static boolean DEFAULT_VOTE_ON_REPLICATION_FAILURE = false;
 
+   //how many times we retry a vote before restarting as a backup
+   private static int DEFAULT_VOTE_RETRIES = 12;
+
+   //how long we wait between votes, 5 secs
+   private static long DEFAULT_VOTE_RETRY_WAIT = 5000;
+
    public static int DEFAULT_QUORUM_SIZE = -1;
 
    public static final boolean DEFAULT_ANALYZE_CRITICAL = true;
@@ -1334,4 +1340,11 @@ public final class ActiveMQDefaultConfiguration {
    }
 
 
+   public static int getDefaultVoteRetries() {
+      return DEFAULT_VOTE_RETRIES;
+   }
+
+   public static long getDefaultVoteRetryWait() {
+      return DEFAULT_VOTE_RETRY_WAIT;
+   }
 }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java
index 95f524f..7ea26ae 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java
@@ -72,11 +72,11 @@ public final class ConfigurationUtils {
          }
          case REPLICATED: {
             ReplicatedPolicyConfiguration pc = (ReplicatedPolicyConfiguration) 
conf;
-            return new ReplicatedPolicy(pc.isCheckForLiveServer(), 
pc.getGroupName(), pc.getClusterName(), pc.getInitialReplicationSyncTimeout(), 
server.getNetworkHealthCheck(), pc.getVoteOnReplicationFailure(), 
pc.getQuorumSize());
+            return new ReplicatedPolicy(pc.isCheckForLiveServer(), 
pc.getGroupName(), pc.getClusterName(), pc.getInitialReplicationSyncTimeout(), 
server.getNetworkHealthCheck(), pc.getVoteOnReplicationFailure(), 
pc.getQuorumSize(), pc.getVoteRetries(), pc.getVoteRetryWait());
          }
          case REPLICA: {
             ReplicaPolicyConfiguration pc = (ReplicaPolicyConfiguration) conf;
-            return new ReplicaPolicy(pc.getClusterName(), 
pc.getMaxSavedReplicatedJournalsSize(), pc.getGroupName(), 
pc.isRestartBackup(), pc.isAllowFailBack(), 
pc.getInitialReplicationSyncTimeout(), 
getScaleDownPolicy(pc.getScaleDownConfiguration()), 
server.getNetworkHealthCheck(), pc.getVoteOnReplicationFailure(), 
pc.getQuorumSize());
+            return new ReplicaPolicy(pc.getClusterName(), 
pc.getMaxSavedReplicatedJournalsSize(), pc.getGroupName(), 
pc.isRestartBackup(), pc.isAllowFailBack(), 
pc.getInitialReplicationSyncTimeout(), 
getScaleDownPolicy(pc.getScaleDownConfiguration()), 
server.getNetworkHealthCheck(), pc.getVoteOnReplicationFailure(), 
pc.getQuorumSize(), pc.getVoteRetries(), pc.getVoteRetryWait());
          }
          case SHARED_STORE_MASTER: {
             SharedStoreMasterPolicyConfiguration pc = 
(SharedStoreMasterPolicyConfiguration) conf;

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicaPolicyConfiguration.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicaPolicyConfiguration.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicaPolicyConfiguration.java
index 0b50882..0e6c82d 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicaPolicyConfiguration.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicaPolicyConfiguration.java
@@ -43,6 +43,10 @@ public class ReplicaPolicyConfiguration implements 
HAPolicyConfiguration {
 
    private int quorumSize = 
ActiveMQDefaultConfiguration.getDefaultQuorumSize();
 
+   private int voteRetries = 
ActiveMQDefaultConfiguration.getDefaultVoteRetries();
+
+   private long voteRetryWait = 
ActiveMQDefaultConfiguration.getDefaultVoteRetryWait();
+
    public ReplicaPolicyConfiguration() {
    }
 
@@ -139,4 +143,20 @@ public class ReplicaPolicyConfiguration implements 
HAPolicyConfiguration {
    public void setQuorumSize(int quorumSize) {
       this.quorumSize = quorumSize;
    }
+
+   public int getVoteRetries() {
+      return voteRetries;
+   }
+
+   public void setVoteRetries(int voteRetries) {
+      this.voteRetries = voteRetries;
+   }
+
+   public void setVoteRetryWait(long voteRetryWait) {
+      this.voteRetryWait = voteRetryWait;
+   }
+
+   public long getVoteRetryWait() {
+      return voteRetryWait;
+   }
 }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicatedPolicyConfiguration.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicatedPolicyConfiguration.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicatedPolicyConfiguration.java
index 9072822..68d69bb 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicatedPolicyConfiguration.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicatedPolicyConfiguration.java
@@ -33,6 +33,10 @@ public class ReplicatedPolicyConfiguration implements 
HAPolicyConfiguration {
 
    private int quorumSize = 
ActiveMQDefaultConfiguration.getDefaultQuorumSize();
 
+   private int voteRetries = 
ActiveMQDefaultConfiguration.getDefaultVoteRetries();
+
+   private long voteRetryWait = 
ActiveMQDefaultConfiguration.getDefaultVoteRetryWait();
+
    public ReplicatedPolicyConfiguration() {
    }
 
@@ -91,4 +95,21 @@ public class ReplicatedPolicyConfiguration implements 
HAPolicyConfiguration {
    public void setQuorumSize(int quorumSize) {
       this.quorumSize = quorumSize;
    }
+
+
+   public int getVoteRetries() {
+      return voteRetries;
+   }
+
+   public void setVoteRetries(int voteRetries) {
+      this.voteRetries = voteRetries;
+   }
+
+   public void setVoteRetryWait(long voteRetryWait) {
+      this.voteRetryWait = voteRetryWait;
+   }
+
+   public long getVoteRetryWait() {
+      return voteRetryWait;
+   }
 }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java
index 6c64a3b..7f71c86 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java
@@ -1284,6 +1284,10 @@ public final class FileConfigurationParser extends 
XMLConfigurationUtil {
 
       configuration.setVoteOnReplicationFailure(getBoolean(policyNode, 
"vote-on-replication-failure", configuration.getVoteOnReplicationFailure()));
 
+      configuration.setVoteRetries(getInteger(policyNode, "vote-retries", 
configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
+
+      configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", 
configuration.getVoteRetryWait(), Validators.GT_ZERO));
+
       configuration.setQuorumSize(getInteger(policyNode, "quorum-size", 
configuration.getQuorumSize(), Validators.MINUS_ONE_OR_GT_ZERO));
 
       return configuration;
@@ -1308,6 +1312,10 @@ public final class FileConfigurationParser extends 
XMLConfigurationUtil {
 
       configuration.setVoteOnReplicationFailure(getBoolean(policyNode, 
"vote-on-replication-failure", configuration.getVoteOnReplicationFailure()));
 
+      configuration.setVoteRetries(getInteger(policyNode, "vote-retries", 
configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
+
+      configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", 
configuration.getVoteRetryWait(), Validators.GT_ZERO));
+
       configuration.setQuorumSize(getInteger(policyNode, "quorum-size", 
configuration.getQuorumSize(), Validators.MINUS_ONE_OR_GT_ZERO));
 
       return configuration;

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicaPolicy.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicaPolicy.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicaPolicy.java
index 2339610..40559cf 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicaPolicy.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicaPolicy.java
@@ -53,6 +53,10 @@ public class ReplicaPolicy extends BackupPolicy {
 
    private final NetworkHealthCheck networkHealthCheck;
 
+   private int voteRetries;
+
+   private long voteRetryWait;
+
    public ReplicaPolicy(final NetworkHealthCheck networkHealthCheck) {
       this.networkHealthCheck = networkHealthCheck;
    }
@@ -72,7 +76,9 @@ public class ReplicaPolicy extends BackupPolicy {
                         ScaleDownPolicy scaleDownPolicy,
                         NetworkHealthCheck networkHealthCheck,
                         boolean voteOnReplicationFailure,
-                        int quorumSize) {
+                        int quorumSize,
+                        int voteRetries,
+                        long voteRetryWait) {
       this.clusterName = clusterName;
       this.maxSavedReplicatedJournalsSize = maxSavedReplicatedJournalsSize;
       this.groupName = groupName;
@@ -80,6 +86,8 @@ public class ReplicaPolicy extends BackupPolicy {
       this.allowFailback = allowFailback;
       this.initialReplicationSyncTimeout = initialReplicationSyncTimeout;
       this.quorumSize = quorumSize;
+      this.voteRetries = voteRetries;
+      this.voteRetryWait = voteRetryWait;
       this.scaleDownPolicy = scaleDownPolicy;
       this.networkHealthCheck = networkHealthCheck;
       this.voteOnReplicationFailure = voteOnReplicationFailure;
@@ -115,7 +123,7 @@ public class ReplicaPolicy extends BackupPolicy {
 
    public ReplicatedPolicy getReplicatedPolicy() {
       if (replicatedPolicy == null) {
-         replicatedPolicy = new ReplicatedPolicy(false, allowFailback, 
initialReplicationSyncTimeout, groupName, clusterName, this, 
networkHealthCheck, voteOnReplicationFailure, quorumSize);
+         replicatedPolicy = new ReplicatedPolicy(false, allowFailback, 
initialReplicationSyncTimeout, groupName, clusterName, this, 
networkHealthCheck, voteOnReplicationFailure, quorumSize, voteRetries, 
voteRetryWait);
       }
       return replicatedPolicy;
    }
@@ -210,4 +218,20 @@ public class ReplicaPolicy extends BackupPolicy {
    public boolean isVoteOnReplicationFailure() {
       return voteOnReplicationFailure;
    }
+
+   public void setVoteRetries(int voteRetries) {
+      this.voteRetries = voteRetries;
+   }
+
+   public void setVoteRetryWait(long voteRetryWait) {
+      this.voteRetryWait = voteRetryWait;
+   }
+
+   public int getVoteRetries() {
+      return voteRetries;
+   }
+
+   public long getVoteRetryWait() {
+      return voteRetryWait;
+   }
 }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicatedPolicy.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicatedPolicy.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicatedPolicy.java
index f8892af..135e8d0 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicatedPolicy.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicatedPolicy.java
@@ -50,6 +50,10 @@ public class ReplicatedPolicy implements 
HAPolicy<LiveActivation> {
    * */
    private int quorumSize;
 
+   private int voteRetries;
+
+   private long voteRetryWait;
+
    /*
    * this are only used as the policy when the server is started as a live 
after a failover
    * */
@@ -68,7 +72,9 @@ public class ReplicatedPolicy implements 
HAPolicy<LiveActivation> {
                            long initialReplicationSyncTimeout,
                            NetworkHealthCheck networkHealthCheck,
                            boolean voteOnReplicationFailure,
-                           int quorumSize) {
+                           int quorumSize,
+                           int voteRetries,
+                           long voteRetryWait) {
       this.checkForLiveServer = checkForLiveServer;
       this.groupName = groupName;
       this.clusterName = clusterName;
@@ -76,6 +82,8 @@ public class ReplicatedPolicy implements 
HAPolicy<LiveActivation> {
       this.networkHealthCheck = networkHealthCheck;
       this.voteOnReplicationFailure = voteOnReplicationFailure;
       this.quorumSize = quorumSize;
+      this.voteRetries = voteRetries;
+      this.voteRetryWait = voteRetryWait;
    }
 
    public ReplicatedPolicy(boolean checkForLiveServer,
@@ -86,7 +94,9 @@ public class ReplicatedPolicy implements 
HAPolicy<LiveActivation> {
                            ReplicaPolicy replicaPolicy,
                            NetworkHealthCheck networkHealthCheck,
                            boolean voteOnReplicationFailure,
-                           int quorumSize) {
+                           int quorumSize,
+                           int voteRetries,
+                           long voteRetryWait) {
       this.checkForLiveServer = checkForLiveServer;
       this.clusterName = clusterName;
       this.groupName = groupName;
@@ -140,6 +150,8 @@ public class ReplicatedPolicy implements 
HAPolicy<LiveActivation> {
          replicaPolicy = new ReplicaPolicy(networkHealthCheck, this);
          replicaPolicy.setQuorumSize(quorumSize);
          replicaPolicy.setVoteOnReplicationFailure(voteOnReplicationFailure);
+         replicaPolicy.setVoteRetries(voteRetries);
+         replicaPolicy.setVoteRetryWait(voteRetryWait);
          if (clusterName != null && clusterName.length() > 0) {
             replicaPolicy.setClusterName(clusterName);
          }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/impl/BridgeImpl.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/impl/BridgeImpl.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/impl/BridgeImpl.java
index 3aa82a1..4790fda 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/impl/BridgeImpl.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/impl/BridgeImpl.java
@@ -778,7 +778,8 @@ public class BridgeImpl implements Bridge, 
SessionFailureListener, SendAcknowled
 
    protected void fail(final boolean permanently) {
       logger.debug(this + "\n\t::fail being called, permanently=" + 
permanently);
-
+      //we need to make sure we remove the node from the topology so any 
incoming quorum requests are voted correctly
+      serverLocator.notifyNodeDown(System.currentTimeMillis(), targetNodeID);
       if (queue != null) {
          try {
             if (logger.isTraceEnabled()) {

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/QuorumManager.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/QuorumManager.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/QuorumManager.java
index 77a7d18..9b8b647 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/QuorumManager.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/QuorumManager.java
@@ -252,6 +252,22 @@ public final class QuorumManager implements 
ClusterTopologyListener, ActiveMQCom
       return handlers.get(handler);
    }
 
+   public TransportConfiguration getLiveTransportConfiguration(String 
targetServerID) {
+      TopologyMemberImpl member = 
clusterController.getDefaultClusterTopology().getMember(targetServerID);
+      return member != null ? member.getLive() : null;
+   }
+
+   public boolean checkLive(TransportConfiguration liveTransportConfiguration) 
{
+      try {
+         ClusterControl control = 
clusterController.connectToNode(liveTransportConfiguration);
+         control.close();
+         return true;
+      } catch (Throwable t) {
+         return false;
+      }
+   }
+
+
    private final class VoteRunnableHolder {
 
       private final QuorumVote quorumVote;

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java
index 330b53a..029767b 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java
@@ -21,6 +21,7 @@ import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.activemq.artemis.api.core.ActiveMQException;
+import org.apache.activemq.artemis.api.core.TransportConfiguration;
 import org.apache.activemq.artemis.api.core.client.SessionFailureListener;
 import 
org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal;
 import org.apache.activemq.artemis.core.client.impl.Topology;
@@ -33,6 +34,8 @@ import org.apache.activemq.artemis.core.server.NodeManager;
 
 public class SharedNothingBackupQuorum implements Quorum, 
SessionFailureListener {
 
+   private TransportConfiguration liveTransportConfiguration;
+
    public enum BACKUP_ACTIVATION {
       FAIL_OVER, FAILURE_REPLICATING, ALREADY_REPLICATING, STOP;
    }
@@ -47,6 +50,12 @@ public class SharedNothingBackupQuorum implements Quorum, 
SessionFailureListener
    private final ScheduledExecutorService scheduledPool;
    private final int quorumSize;
 
+   private final int voteRetries;
+
+   private final long voteRetryWait;
+
+   private final Object voteGuard = new Object();
+
    private CountDownLatch latch;
 
    private ClientSessionFactoryInternal sessionFactory;
@@ -68,13 +77,17 @@ public class SharedNothingBackupQuorum implements Quorum, 
SessionFailureListener
                                     NodeManager nodeManager,
                                     ScheduledExecutorService scheduledPool,
                                     NetworkHealthCheck networkHealthCheck,
-                                    int quorumSize) {
+                                    int quorumSize,
+                                    int voteRetries,
+                                    long voteRetryWait) {
       this.storageManager = storageManager;
       this.scheduledPool = scheduledPool;
       this.quorumSize = quorumSize;
       this.latch = new CountDownLatch(1);
       this.nodeManager = nodeManager;
       this.networkHealthCheck = networkHealthCheck;
+      this.voteRetries = voteRetries;
+      this.voteRetryWait = voteRetryWait;
    }
 
    private volatile BACKUP_ACTIVATION signal;
@@ -129,6 +142,7 @@ public class SharedNothingBackupQuorum implements Quorum, 
SessionFailureListener
    public void liveIDSet(String liveID) {
       targetServerID = liveID;
       nodeManager.setNodeID(liveID);
+      liveTransportConfiguration = 
quorumManager.getLiveTransportConfiguration(targetServerID);
       //now we are replicating we can start waiting for disconnect 
notifications so we can fail over
       // sessionFactory.addFailureListener(this);
    }
@@ -267,20 +281,44 @@ public class SharedNothingBackupQuorum implements Quorum, 
SessionFailureListener
     * @return the voting decision
     */
    private boolean isLiveDown() {
+      //lets assume live is not down
+      Boolean decision = false;
+      int voteAttempts = 0;
       int size = quorumSize == -1 ? quorumManager.getMaxClusterSize() : 
quorumSize;
 
-      QuorumVoteServerConnect quorumVote = new QuorumVoteServerConnect(size, 
targetServerID);
+      synchronized (voteGuard) {
+         while (!decision && voteAttempts++ < voteRetries) {
+            // a quick check to see if the live actually is dead
+            if (quorumManager.checkLive(liveTransportConfiguration)) {
+               //the live is still alive so we best not failover
+               return false;
+            }
+            //the live is dead so lets vote for quorum
+            QuorumVoteServerConnect quorumVote = new 
QuorumVoteServerConnect(size, targetServerID);
 
-      quorumManager.vote(quorumVote);
+            quorumManager.vote(quorumVote);
 
-      try {
-         quorumVote.await(LATCH_TIMEOUT, TimeUnit.SECONDS);
-      } catch (InterruptedException interruption) {
-         // No-op. The best the quorum can do now is to return the latest 
number it has
-      }
+            try {
+               quorumVote.await(LATCH_TIMEOUT, TimeUnit.SECONDS);
+            } catch (InterruptedException interruption) {
+               // No-op. The best the quorum can do now is to return the 
latest number it has
+            }
+
+            quorumManager.voteComplete(quorumVote);
 
-      quorumManager.voteComplete(quorumVote);
+            decision = quorumVote.getDecision();
+
+            if (decision) {
+               return decision;
+            }
+            try {
+               voteGuard.wait(voteRetryWait);
+            } catch (InterruptedException e) {
+               //nothing to do here
+            }
+         }
+      }
 
-      return quorumVote.getDecision();
+      return decision;
    }
 }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java
----------------------------------------------------------------------
diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java
 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java
index 06a3afb..58742de 100644
--- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java
+++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java
@@ -131,7 +131,7 @@ public final class SharedNothingBackupActivation extends 
Activation {
             logger.trace("Entered a synchronized");
             if (closed)
                return;
-            backupQuorum = new 
SharedNothingBackupQuorum(activeMQServer.getStorageManager(), 
activeMQServer.getNodeManager(), activeMQServer.getScheduledPool(), 
networkHealthCheck, replicaPolicy.getQuorumSize());
+            backupQuorum = new 
SharedNothingBackupQuorum(activeMQServer.getStorageManager(), 
activeMQServer.getNodeManager(), activeMQServer.getScheduledPool(), 
networkHealthCheck, replicaPolicy.getQuorumSize(), 
replicaPolicy.getVoteRetries(), replicaPolicy.getVoteRetryWait());
             
activeMQServer.getClusterManager().getQuorumManager().registerQuorum(backupQuorum);
             
activeMQServer.getClusterManager().getQuorumManager().registerQuorumHandler(new 
ServerConnectVoteHandler(activeMQServer));
          }

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/artemis-server/src/main/resources/schema/artemis-configuration.xsd
----------------------------------------------------------------------
diff --git a/artemis-server/src/main/resources/schema/artemis-configuration.xsd 
b/artemis-server/src/main/resources/schema/artemis-configuration.xsd
index a615554..e76478c 100644
--- a/artemis-server/src/main/resources/schema/artemis-configuration.xsd
+++ b/artemis-server/src/main/resources/schema/artemis-configuration.xsd
@@ -2171,6 +2171,22 @@
                </xsd:documentation>
             </xsd:annotation>
          </xsd:element>
+
+         <xsd:element name="vote-retries" type="xsd:integer" default="12" 
minOccurs="0" maxOccurs="1">
+            <xsd:annotation>
+               <xsd:documentation>
+                  If we start as a replica and lose connection to the master, 
how many times should we attempt to vote
+                  for quorum before restarting
+               </xsd:documentation>
+            </xsd:annotation>
+         </xsd:element>
+         <xsd:element name="vote-retry-wait" type="xsd:long" default="5000" 
minOccurs="0" maxOccurs="1">
+            <xsd:annotation>
+               <xsd:documentation>
+                  How long to wait (in milliseconds) between each vote
+               </xsd:documentation>
+            </xsd:annotation>
+         </xsd:element>
       </xsd:all>
    </xsd:complexType>
    <xsd:complexType name="replicaPolicyType">
@@ -2259,6 +2275,20 @@
                </xsd:documentation>
             </xsd:annotation>
          </xsd:element>
+         <xsd:element name="vote-retries" type="xsd:integer" default="12" 
minOccurs="0" maxOccurs="1">
+            <xsd:annotation>
+               <xsd:documentation>
+                  If we lose connection to the master, how many times should 
we attempt to vote for quorum before restarting
+               </xsd:documentation>
+            </xsd:annotation>
+         </xsd:element>
+         <xsd:element name="vote-retry-wait" type="xsd:long" default="5000" 
minOccurs="0" maxOccurs="1">
+            <xsd:annotation>
+               <xsd:documentation>
+                  How long to wait (in milliseconds) between each vote
+               </xsd:documentation>
+            </xsd:annotation>
+         </xsd:element>
       </xsd:all>
    </xsd:complexType>
    <xsd:complexType name="colocatedReplicaPolicyType">

http://git-wip-us.apache.org/repos/asf/activemq-artemis/blob/6067a285/docs/user-manual/en/network-isolation.md
----------------------------------------------------------------------
diff --git a/docs/user-manual/en/network-isolation.md 
b/docs/user-manual/en/network-isolation.md
index 06eda31..ded89b7 100644
--- a/docs/user-manual/en/network-isolation.md
+++ b/docs/user-manual/en/network-isolation.md
@@ -17,9 +17,22 @@ react which the following details:
 By default if a replica loses its replication connection to the live broker it 
makes a decision as to whether to start or not
 with a quorum vote. This of course requires that there be at least 3 pairs of 
live/backup nodes in the cluster. For a 3 node 
 cluster it will start if it gets 2 votes back saying that its live server is 
no longer available, for 4 nodes this would be 
-3 votes and so on.
+3 votes and so on. When a backup loses connection to the master it will keep 
voting for a quorum until it either receives a vote 
+allowing it to start or it detects that the master is still live. In the 
latter case it will then restart as a backup. How many times the backup votes 
+and how long it waits between each vote is configured like so:
 
-It's also possible to statically set the quorum size that should be used fotr 
the case where the cluster size is known up front,
+```xml
+<ha-policy>
+  <replication>
+    <slave>
+       <vote-retries>12</vote-retries>
+       <vote-retry-wait>5000</vote-retry-wait>
+    </slave>
+  </replication>
+</ha-policy>
+```
+
+It's also possible to statically set the quorum size that should be used for 
the case where the cluster size is known up front,
 this is done on the Replica Policy like so:
 
 ```xml

Reply via email to