---
src/fm/fmd/fm_main.cc | 26 +++++---------------------
src/fm/fmd/fm_mds.cc | 2 ++
src/fm/fmd/fm_rda.cc | 15 +++++++++++++--
3 files changed, 20 insertions(+), 23 deletions(-)
diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc
index 73c9b9ccd..3371ec5e8 100644
--- a/src/fm/fmd/fm_main.cc
+++ b/src/fm/fmd/fm_main.cc
@@ -551,21 +551,12 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt) {
* trigerred quicker than the node_down event
* has been received.
*/
- if (fm_cb->role == PCS_RDA_STANDBY) {
- const std::string current_active =
- consensus_service.CurrentActive();
- if (current_active.compare(osaf_extended_name_borrow(
- &fm_cb->peer_clm_node_name)) == 0) {
- // update consensus service, before fencing old active controller
- consensus_service.DemoteCurrentActive();
- }
- }
if (fm_cb->use_remote_fencing) {
if (fm_cb->peer_node_terminated == false) {
// if peer_sc_up is true then
// the node has come up already
- if (fm_cb->peer_sc_up == false && fm_cb->immnd_down == true) {
+ if (consensus_service.IsEnabled() == false) {
opensaf_reboot(fm_cb->peer_node_id,
(char *)fm_cb->peer_clm_node_name.value,
"Received Node Down for peer controller");
@@ -580,8 +571,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt) {
fm_cb->mutex_.Lock();
peer_node_name = fm_cb->peer_node_name;
fm_cb->mutex_.Unlock();
- opensaf_reboot(fm_cb->peer_node_id,
- peer_node_name.c_str(),
+ opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(),
"Received Node Down for peer controller");
}
if (!((fm_cb->role == PCS_RDA_ACTIVE) &&
@@ -632,12 +622,6 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt) {
}
Consensus consensus_service;
- const std::string current_active = consensus_service.CurrentActive();
- if (current_active.compare(
- osaf_extended_name_borrow(&fm_cb->peer_clm_node_name)) == 0) {
- // update consensus service, before fencing old active controller
- consensus_service.DemoteCurrentActive();
- }
/* Now. Try resetting other blade */
fm_cb->role = PCS_RDA_ACTIVE;
@@ -645,7 +629,8 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt) {
LOG_NO("Reseting peer controller node id: %x",
unsigned(fm_cb->peer_node_id));
if (fm_cb->use_remote_fencing) {
- if (fm_cb->peer_node_terminated == false) {
+ if (fm_cb->peer_node_terminated == false &&
+ consensus_service.IsEnabled() == false) {
opensaf_reboot(fm_cb->peer_node_id,
(char *)fm_cb->peer_clm_node_name.value,
"Received Node Down for peer controller");
@@ -658,8 +643,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt) {
fm_cb->mutex_.Lock();
peer_node_name = fm_cb->peer_node_name;
fm_cb->mutex_.Unlock();
- opensaf_reboot(fm_cb->peer_node_id,
- peer_node_name.c_str(),
+ opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(),
"Received Node Down for Active peer");
}
fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE);
diff --git a/src/fm/fmd/fm_mds.cc b/src/fm/fmd/fm_mds.cc
index 277a357d2..be25a5610 100644
--- a/src/fm/fmd/fm_mds.cc
+++ b/src/fm/fmd/fm_mds.cc
@@ -373,6 +373,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb,
case NCSMDS_NODE_DOWN:
if (cb->cluster_size != 0) {
--cb->cluster_size;
+ TRACE("cluster_size %" PRIu64, cb->cluster_size);
TRACE("Node down event for node id %x, cluster size is now: %llu",
node_evt->node_id, (unsigned long long)cb->cluster_size);
check_for_node_isolation(cb);
@@ -397,6 +398,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb,
case NCSMDS_NODE_UP:
++cb->cluster_size;
+ TRACE("cluster_size %" PRIu64, cb->cluster_size);
TRACE("Node up event for node id %x, cluster size is now: %llu",
node_evt->node_id, (unsigned long long)cb->cluster_size);
check_for_node_isolation(cb);
diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc
index 47e1f1d32..1bbf2369d 100644
--- a/src/fm/fmd/fm_rda.cc
+++ b/src/fm/fmd/fm_rda.cc
@@ -87,13 +87,24 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role) {
osafassert(role == PCS_RDA_ACTIVE);
Consensus consensus_service;
- rc = consensus_service.PromoteThisNode();
- if (rc != SA_AIS_OK) {
+ rc = consensus_service.PromoteThisNode(true, fm_cb->cluster_size);
+ if (rc != SA_AIS_OK && rc != SA_AIS_ERR_EXIST) {
LOG_ER("Unable to set active controller in consensus service");
opensaf_reboot(0, nullptr,
"Unable to set active controller in consensus service");
}
+ // @todo Without this reboot we do not seem to recover from
+ // SA_AIS_ERR_EXIST. Can a less drastic recovery be found?
+ if (rc == SA_AIS_ERR_EXIST) {
+ LOG_ER(
+ "A controller is already active. We were separated from the "
+ "cluster?");
+ opensaf_reboot(0, nullptr,
+ "A controller is already active. We were separated "
+ "from the cluster?");
+ }
+
rc = pcs_rda_request(&rda_req);
if (rc != PCSRDA_RC_SUCCESS) {
syslog(LOG_INFO,
--
2.14.1
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel