On Fri, Mar 15, 2024 at 1:15 PM Ilya Maximets <i.maxim...@ovn.org> wrote:
>
> Consider the following sequence of events:
>
>  1. Cluster with 2 nodes - A and B.  A is a leader.
>  2. C connects to A and sends a join request.
>  3. A sends an append request to C.  C is in CATCHUP phase for A.
>  4. A loses leadership to B.  Sends a join failure notification to C.
>  5. C sends append reply to A.
>  6. A discards append reply (not leader).
>  7. B loses leadership back to A.
>  8. C sends a new join request to A.
>  9. A replies with failure (already in progress).
>  10. GoTo step 8.
>
> At this point A is waiting for an append reply that it already
> discarded at step 6 and rejects all new attempts of C to join with
> an 'already in progress' verdict.  C stays forever in a joining state
> and, from A's perspective, in a CATCHUP phase.
>
> This is similar to the case of a sudden disconnect from a leader, fixed
> in commit 999ba294fb4f ("ovsdb: raft: Fix inability to join the cluster
> after interrupted attempt."), but since we're not disconnecting, the
> servers are not getting destroyed.
>
> Fix that by destroying all the servers that are not yet part of the
> configuration after leadership is lost.  This way, server C will be
> able to simply restart the joining process from scratch.
>
> A new failure test command is added in order to simulate a leadership
> change before we receive the append reply, so that the reply gets
> discarded.  A new cluster test is added to exercise this scenario.
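>
> For illustration, the new mode can be engaged on a running server via
> unixctl before the next join attempt; a minimal sketch (the target
> path "s1" here is hypothetical and follows the naming used in the new
> test, and the command acknowledges the mode with an "engaged" reply):
>
>     ovs-appctl -t "$(pwd)"/s1 cluster/failure-test \
>         transfer-leadership-after-starting-to-add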
>
> Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
> Reported-at: https://github.com/ovn-org/ovn/issues/235
> Signed-off-by: Ilya Maximets <i.maxim...@ovn.org>
> ---
>  ovsdb/raft.c           | 16 ++++++++++++-
>  tests/ovsdb-cluster.at | 53 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 68 insertions(+), 1 deletion(-)
>
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index c41419052..f9e760a08 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -81,6 +81,7 @@ enum raft_failure_test {
>      FT_STOP_RAFT_RPC,
>      FT_TRANSFER_LEADERSHIP,
>      FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ,
> +    FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD,
>  };
>  static enum raft_failure_test failure_test;
>
> @@ -2702,15 +2703,22 @@ raft_become_follower(struct raft *raft)
>       * new configuration.  Our AppendEntries processing will properly update
>       * the server configuration later, if necessary.
>       *
> +     * However, since we're sending replies about a failure to add, those
> +     * new servers have to be cleaned up.  Otherwise, they will be stuck in
> +     * a 'CATCHUP' phase in case this server regains leadership before they
> +     * join through the current new leader.  They are not yet in
> +     * 'raft->servers', so they are not part of the shared configuration.
> +     *
>       * Also we do not complete commands here, as they can still be completed
>       * if their log entries have already been replicated to other servers.
>       * If the entries were actually committed according to the new leader, our
>       * AppendEntries processing will complete the corresponding commands.
>       */
>      struct raft_server *s;
> -    HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
> +    HMAP_FOR_EACH_POP (s, hmap_node, &raft->add_servers) {
>          raft_send_add_server_reply__(raft, &s->sid, s->address, false,
>                                       RAFT_SERVER_LOST_LEADERSHIP);
> +        raft_server_destroy(s);
>      }
>      if (raft->remove_server) {
>          raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
> @@ -3985,6 +3993,10 @@ raft_handle_add_server_request(struct raft *raft,
>                   "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid),
>                   rq->address, CID_ARGS(&raft->cid));
>      raft_send_append_request(raft, s, 0, "initialize new server");
> +
> +    if (failure_test == FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD) {
> +        failure_test = FT_TRANSFER_LEADERSHIP;
> +    }
>  }
>
>  static void
> @@ -5110,6 +5122,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
>      } else if (!strcmp(test,
>                         "transfer-leadership-after-sending-append-request")) {
>          failure_test = FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ;
> +    } else if (!strcmp(test, "transfer-leadership-after-starting-to-add")) {
> +        failure_test = FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD;
>      } else if (!strcmp(test, "transfer-leadership")) {
>          failure_test = FT_TRANSFER_LEADERSHIP;
>      } else if (!strcmp(test, "clear")) {
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index 482e4e02d..9d8b4d06a 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -525,6 +525,59 @@ for i in $(seq $n); do
>      OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
>  done
>
> +AT_CLEANUP
> +
> +AT_SETUP([OVSDB cluster - leadership change before replication while joining])
> +AT_KEYWORDS([ovsdb server negative unix cluster join])
> +
> +n=5
> +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl
> +              $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> +cid=$(ovsdb-tool db-cid s1.db)
> +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema)
> +for i in $(seq 2 $n); do
> +    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
> +done
> +
> +on_exit 'kill $(cat *.pid)'
> +on_exit "
> +  for i in \$(ls $(pwd)/s[[0-$n]]); do
> +    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
> +  done
> +"
> +
> +dnl Starting servers one by one, asking all existing servers to transfer
> +dnl leadership right after starting to add a server.  The joining server
> +dnl will need to find a new leader that will also transfer leadership.
> +dnl This will continue until the same server becomes a leader for the
> +dnl second time and is able to finish adding the new server.
> +for i in $(seq $n); do
> +    dnl Make sure that all already started servers joined the cluster.
> +    for j in $(seq $((i - 1)) ); do
> +        AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected])
> +    done
> +    for j in $(seq $((i - 1)) ); do
> +        OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \
> +                          cluster/failure-test \
> +                            transfer-leadership-after-starting-to-add \
> +                        | grep -q "engaged"])
> +    done
> +
> +    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
> +                           --detach --no-chdir --log-file=s$i.log \
> +                           --pidfile=s$i.pid --unixctl=s$i \
> +                           --remote=punix:s$i.ovsdb s$i.db])
> +done
> +
> +dnl Make sure that all servers joined the cluster.
> +for i in $(seq $n); do
> +    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
> +done
> +
> +for i in $(seq $n); do
> +    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
> +done
> +
>  AT_CLEANUP
>
>
> --
> 2.43.0
>

Thanks Ilya. Looks good to me.

Acked-by: Han Zhou <hz...@ovn.org>