Here is the case:
The coordinator IMMND on PL-3 crashed. The active IMMD then tried to elect a new
coordinator on the standby node SC-2, but the election failed because the IMMND
on SC-2 had also been restarted. As a result, the active IMMD exited and a
failover happened. After that, SC-2 took the active role but found no candidate
for a new IMMND coordinator, so the cluster was rebooted.

We can prevent this from happening if the active IMMD gives priority to electing
a coordinator that is located on the same site as itself, provided that the
IMMND database there is up to date.
---
 src/imm/immd/immd_proc.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/imm/immd/immd_proc.c b/src/imm/immd/immd_proc.c
index 1882eef..e80f2db 100644
--- a/src/imm/immd/immd_proc.c
+++ b/src/imm/immd/immd_proc.c
@@ -331,16 +331,22 @@ bool immd_proc_elect_coord(IMMD_CB *cb, bool new_active)
                 */
        } else {
                /* Try to elect a new coord. */
+               bool has_coord_candidate = false;
                cb->payload_coord_dest = 0LL;
                memset(&key, 0, sizeof(MDS_DEST));
                immd_immnd_info_node_getnext(&cb->immnd_tree, &key,
                                             &immnd_info_node);
+
+               // Prioritize to elect the new coordinator which is
+               // located at the active node (same site with the active IMMD).
                while (immnd_info_node) {
                        key = immnd_info_node->immnd_dest;
                        if ((immnd_info_node->isOnController) &&
+                           (immnd_info_node->immnd_key == cb->node_id) &&
                            (immnd_info_node->epoch == cb->mRulingEpoch)) {
                                /*We found a new candidate for cordinator */
                                immnd_info_node->isCoord = true;
+                               has_coord_candidate = true;
                                break;
                        }
 
@@ -348,7 +354,26 @@ bool immd_proc_elect_coord(IMMD_CB *cb, bool new_active)
                                                     &immnd_info_node);
                }
 
-               if (!immnd_info_node && cb->mScAbsenceAllowed) {
+                if (!has_coord_candidate) {
+                  memset(&key, 0, sizeof(MDS_DEST));
+                  immd_immnd_info_node_getnext(&cb->immnd_tree, &key,
+                                               &immnd_info_node);
+
+                  while (immnd_info_node) {
+                    key = immnd_info_node->immnd_dest;
+                    if ((immnd_info_node->isOnController) &&
+                        (immnd_info_node->epoch == cb->mRulingEpoch)) {
+                      /*We found a new candidate for cordinator */
+                      immnd_info_node->isCoord = true;
+                      break;
+                    }
+
+                    immd_immnd_info_node_getnext(&cb->immnd_tree, &key,
+                                                 &immnd_info_node);
+                  }
+                }
+
+                if (!immnd_info_node && cb->mScAbsenceAllowed) {
                        /* If SC absence is allowed and no SC based IMMND is
                           available then elect an IMMND coord at a payload.
                           Note this means that an IMMND at a payload may be
-- 
1.9.1


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to