When a server becomes unstable due to overloading or intermittent
partitioning, it may miss some heartbeats and then starts election with
a new term, which would disrupt the otherwise healthy cluster formed by
the rest of the healthy nodes.  Such situation may exist for a long time
until the "flapping" server is shutdown or recovered completely, which
can severely impact the availability of the cluster. The pre-vote
mechanism introduced in the raft paper section 9.6 can prevent such
problems. This patch implements the pre-vote mechanism, in a way that is
backward compatible and safe for upgrades.

Signed-off-by: Han Zhou <hz...@ovn.org>
---
 ovsdb/raft-rpc.c       | 19 ++++++++-
 ovsdb/raft-rpc.h       |  3 ++
 ovsdb/raft.c           | 88 ++++++++++++++++++++++++++++++------------
 tests/ovsdb-cluster.at | 42 ++++++++++++++++++++
 4 files changed, 127 insertions(+), 25 deletions(-)

diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
index dd14d81091fc..f750ba897046 100644
--- a/ovsdb/raft-rpc.c
+++ b/ovsdb/raft-rpc.c
@@ -283,6 +283,10 @@ raft_vote_request_to_jsonrpc(const struct 
raft_vote_request *rq,
         json_object_put(args, "leadership_transfer",
                         json_boolean_create(true));
     }
+    if (rq->is_prevote) {
+        json_object_put(args, "is_prevote",
+                        json_boolean_create(true));
+    }
 }
 
 static void
@@ -294,6 +298,8 @@ raft_vote_request_from_jsonrpc(struct ovsdb_parser *p,
     rq->last_log_term = raft_parse_required_uint64(p, "last_log_term");
     rq->leadership_transfer
         = raft_parse_optional_boolean(p, "leadership_transfer") == 1;
+    rq->is_prevote
+        = raft_parse_optional_boolean(p, "is_prevote") == 1;
 }
 
 static void
@@ -305,6 +311,9 @@ raft_format_vote_request(const struct raft_vote_request 
*rq, struct ds *s)
     if (rq->leadership_transfer) {
         ds_put_cstr(s, " leadership_transfer=true");
     }
+    if (rq->is_prevote) {
+        ds_put_cstr(s, " is_prevote=true");
+    }
 }
 
 /* raft_vote_reply. */
@@ -326,6 +335,9 @@ raft_vote_reply_to_jsonrpc(const struct raft_vote_reply 
*rpy,
 {
     raft_put_uint64(args, "term", rpy->term);
     json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote));
+    if (rpy->is_prevote) {
+        json_object_put(args, "is_prevote", json_boolean_create(true));
+    }
 }
 
 static void
@@ -334,6 +346,7 @@ raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p,
 {
     rpy->term = raft_parse_required_uint64(p, "term");
     rpy->vote = raft_parse_required_uuid(p, "vote");
+    rpy->is_prevote = raft_parse_optional_boolean(p, "is_prevote") == 1;
 }
 
 static void
@@ -341,6 +354,9 @@ raft_format_vote_reply(const struct raft_vote_reply *rpy, 
struct ds *s)
 {
     ds_put_format(s, " term=%"PRIu64, rpy->term);
     ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote));
+    if (rpy->is_prevote) {
+        ds_put_cstr(s, " is_prevote=true");
+    }
 }
 
 /* raft_add_server_request */
@@ -1008,7 +1024,8 @@ raft_rpc_get_vote(const union raft_rpc *rpc)
         return NULL;
 
     case RAFT_RPC_VOTE_REPLY:
-        return &raft_vote_reply_cast(rpc)->vote;
+        return raft_vote_reply_cast(rpc)->is_prevote ? NULL :
+            &raft_vote_reply_cast(rpc)->vote;
 
     default:
         OVS_NOT_REACHED();
diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h
index 221f24d00128..10f30618e3e0 100644
--- a/ovsdb/raft-rpc.h
+++ b/ovsdb/raft-rpc.h
@@ -125,12 +125,15 @@ struct raft_vote_request {
     uint64_t last_log_index; /* Index of candidate's last log entry. */
     uint64_t last_log_term;  /* Term of candidate's last log entry. */
     bool leadership_transfer;  /* True to override minimum election timeout. */
+    bool is_prevote;         /* True: pre-vote; False: real vote (default). */
 };
 
 struct raft_vote_reply {
     struct raft_rpc_common common;
     uint64_t term;          /* Current term, for candidate to update itself. */
     struct uuid vote;       /* Server ID of vote. */
+    bool is_prevote;        /* Copy the is_prevote from the request, primarily
+                               for validation. */
 };
 
 struct raft_add_server_request {
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index b2361b1737a2..4d7a3f112cad 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -305,6 +305,11 @@ struct raft {
 
     /* Candidates only.  Reinitialized at start of election. */
     int n_votes;                /* Number of votes for me. */
+    bool prevote_passed;        /* Indicates if it passed pre-vote phase.
+                                   Pre-vote mechanism is introduced in raft
+                                   paper section 9.6. We implement it as a
+                                   sub-state of candidate to minize the change
+                                   and keep backward compatibility. */
 
     /* Followers and candidates only. */
     bool candidate_retrying;    /* The earlier election timed-out and we are
@@ -372,7 +377,8 @@ static void raft_become_follower(struct raft *);
 static void raft_reset_election_timer(struct raft *);
 static void raft_reset_ping_timer(struct raft *);
 static void raft_send_heartbeats(struct raft *);
-static void raft_start_election(struct raft *, bool leadership_transfer);
+static void raft_start_election(struct raft *, bool is_prevote,
+                                bool leadership_transfer);
 static bool raft_truncate(struct raft *, uint64_t new_end);
 static void raft_get_servers_from_log(struct raft *, enum vlog_level);
 static void raft_get_election_timer_from_log(struct raft *);
@@ -1069,7 +1075,8 @@ raft_open(struct ovsdb_log *log, struct raft **raftp)
         /* If there's only one server, start an election right away so that the
          * cluster bootstraps quickly. */
         if (hmap_count(&raft->servers) == 1) {
-            raft_start_election(raft, false);
+            /* No pre-vote needed since we are the only one. */
+            raft_start_election(raft, false, false);
         }
     } else {
         raft->join_timeout = time_msec() + 1000;
@@ -1360,7 +1367,7 @@ void
 raft_take_leadership(struct raft *raft)
 {
     if (raft->role != RAFT_LEADER) {
-        raft_start_election(raft, true);
+        raft_start_election(raft, false, true);
     }
 }
 
@@ -1766,12 +1773,12 @@ raft_set_term(struct raft *raft, uint64_t term, const 
struct uuid *vote)
     return true;
 }
 
-static void
+static bool
 raft_accept_vote(struct raft *raft, struct raft_server *s,
                  const struct uuid *vote)
 {
     if (uuid_equals(&s->vote, vote)) {
-        return;
+        return false;
     }
     if (!uuid_is_zero(&s->vote)) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
@@ -1785,13 +1792,18 @@ raft_accept_vote(struct raft *raft, struct raft_server 
*s,
     s->vote = *vote;
     if (uuid_equals(vote, &raft->sid)
         && ++raft->n_votes > hmap_count(&raft->servers) / 2) {
-        raft_become_leader(raft);
+        return true;
     }
+    return false;
 }
 
 static void
-raft_start_election(struct raft *raft, bool leadership_transfer)
+raft_start_election(struct raft *raft, bool is_prevote,
+                    bool leadership_transfer)
 {
+    /* Leadership transfer doesn't use prevote. */
+    ovs_assert(!is_prevote || !leadership_transfer);
+
     if (raft->leaving) {
         return;
     }
@@ -1801,7 +1813,7 @@ raft_start_election(struct raft *raft, bool 
leadership_transfer)
         return;
     }
 
-    if (!raft_set_term(raft, raft->term + 1, &raft->sid)) {
+    if (!is_prevote && !raft_set_term(raft, raft->term + 1, &raft->sid)) {
         return;
     }
 
@@ -1809,26 +1821,33 @@ raft_start_election(struct raft *raft, bool 
leadership_transfer)
 
     raft->leader_sid = UUID_ZERO;
     raft->role = RAFT_CANDIDATE;
-    /* If there was no leader elected since last election, we know we are
-     * retrying now. */
-    raft->candidate_retrying = !raft->had_leader;
-    raft->had_leader = false;
+    raft->prevote_passed = !is_prevote;
+
+    if (is_prevote || leadership_transfer) {
+        /* If there was no leader elected since last election, we know we are
+         * retrying now. */
+        raft->candidate_retrying = !raft->had_leader;
+        raft->had_leader = false;
+
+        raft->election_start = time_msec();
+        raft->election_won = 0;
+    }
 
     raft->n_votes = 0;
 
-    raft->election_start = time_msec();
-    raft->election_won = 0;
     raft->leadership_transfer = leadership_transfer;
 
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
     if (!VLOG_DROP_INFO(&rl)) {
         long long int now = time_msec();
+        char *comment = is_prevote ? "prevote" : "vote";
         if (now >= raft->election_timeout) {
             VLOG_INFO("term %"PRIu64": %lld ms timeout expired, "
-                      "starting election",
-                      raft->term, now - raft->election_base);
+                      "starting election (%s)",
+                      raft->term, now - raft->election_base, comment);
         } else {
-            VLOG_INFO("term %"PRIu64": starting election", raft->term);
+            VLOG_INFO("term %"PRIu64": starting election (%s)",
+                      raft->term, comment);
         }
     }
     raft_reset_election_timer(raft);
@@ -1853,6 +1872,7 @@ raft_start_election(struct raft *raft, bool 
leadership_transfer)
                     ? raft->entries[raft->log_end - raft->log_start - 1].term
                     : raft->snap.term),
                 .leadership_transfer = leadership_transfer,
+                .is_prevote = is_prevote,
             },
         };
         if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
@@ -1861,7 +1881,13 @@ raft_start_election(struct raft *raft, bool 
leadership_transfer)
     }
 
     /* Vote for ourselves. */
-    raft_accept_vote(raft, me, &raft->sid);
+    if (raft_accept_vote(raft, me, &raft->sid)) {
+        /* We just started vote, so it shouldn't be accepted yet unless this is
+         * a one-node cluster. In such case we don't do pre-vote, and become
+         * leader immediately. */
+        ovs_assert(!is_prevote);
+        raft_become_leader(raft);
+    }
 }
 
 static void
@@ -2041,10 +2067,10 @@ raft_run(struct raft *raft)
                 raft_reset_election_timer(raft);
             } else {
                 raft_become_follower(raft);
-                raft_start_election(raft, false);
+                raft_start_election(raft, true, false);
             }
         } else {
-            raft_start_election(raft, false);
+            raft_start_election(raft, true, false);
         }
 
     }
@@ -3673,6 +3699,10 @@ raft_handle_vote_request__(struct raft *raft,
         return false;
     }
 
+    if (rq->is_prevote) {
+        return true;
+    }
+
     /* Record a vote for the peer. */
     if (!raft_set_term(raft, raft->term, &rq->common.sid)) {
         return false;
@@ -3685,7 +3715,7 @@ raft_handle_vote_request__(struct raft *raft,
 
 static void
 raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
-                     const struct uuid *vote)
+                     const struct uuid *vote, bool is_prevote)
 {
     union raft_rpc rpy = {
         .vote_reply = {
@@ -3695,6 +3725,7 @@ raft_send_vote_reply(struct raft *raft, const struct uuid 
*dst,
             },
             .term = raft->term,
             .vote = *vote,
+            .is_prevote = is_prevote,
         },
     };
     raft_send(raft, &rpy);
@@ -3705,7 +3736,9 @@ raft_handle_vote_request(struct raft *raft,
                          const struct raft_vote_request *rq)
 {
     if (raft_handle_vote_request__(raft, rq)) {
-        raft_send_vote_reply(raft, &rq->common.sid, &raft->vote);
+        raft_send_vote_reply(raft, &rq->common.sid,
+                             rq->is_prevote ? &rq->common.sid : &raft->vote,
+                             rq->is_prevote);
     }
 }
 
@@ -3723,7 +3756,14 @@ raft_handle_vote_reply(struct raft *raft,
 
     struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
     if (s) {
-        raft_accept_vote(raft, s, &rpy->vote);
+        if (raft_accept_vote(raft, s, &rpy->vote)) {
+            if (raft->prevote_passed) {
+                raft_become_leader(raft);
+            } else {
+                /* Start the real election. */
+                raft_start_election(raft, false, false);
+            }
+        }
     }
 }
 
@@ -4357,7 +4397,7 @@ raft_handle_become_leader(struct raft *raft,
         VLOG_INFO("received leadership transfer from %s in term %"PRIu64,
                   raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf),
                   rq->term);
-        raft_start_election(raft, true);
+        raft_start_election(raft, false, true);
     }
 }
 
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 9fbf5dc897f2..8a81136999e0 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -715,6 +715,48 @@ done
 
 AT_CLEANUP
 
+
+AT_SETUP([OVSDB cluster - disruptive server])
+AT_KEYWORDS([ovsdb server negative unix cluster disruptive])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db 
$abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft 
unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+for i in `seq $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir 
--log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb 
s$i.db])
+done
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+# An unstable follower shouldn't disrupt the healthy cluster - shouldn't
+# trigger term change.
+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test stop-raft-rpc], [0], 
[ignore])
+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s2 cluster/status $schema_name | grep 
"Role: candidate"])
+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test clear], [0], [ignore])
+
+# Should step back to follower.
+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s2 cluster/status $schema_name | grep 
"Role: follower"])
+
+# No term change.
+for i in `seq $n`; do
+    AT_CHECK([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep 
"Term: 1"], [0], [ignore])
+done
+
+for i in `seq $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
+
 
 AT_BANNER([OVSDB - cluster tests])
 
-- 
2.30.2

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to