osaf/services/saf/amf/amfd/include/node.h |   3 +
 osaf/services/saf/amf/amfd/include/sg.h   |   1 +
 osaf/services/saf/amf/amfd/nodegroup.cc   |  83 +++++++++++++++++++++++++++++++
 osaf/services/saf/amf/amfd/role.cc        |  20 +++---
 osaf/services/saf/amf/amfd/sg.cc          |  15 ++--
 osaf/services/saf/amf/amfd/sgproc.cc      |   2 +-
 osaf/services/saf/amf/amfd/siass.cc       |   4 +-
 7 files changed, 109 insertions(+), 19 deletions(-)


The SG becomes unstable because some variables used in nodegroup operation are 
not
restored after headless if this admin operation on nodegroup was interrupted 
just
before cluster goes into headless stage.

In order to restore nodegroup operation, AMF needs to know exactly whether 
nodegroup
operation was running during headless up on @susi assignment. If susi is in 
QUIESCED,
QUIESCING or being removed while its related entities su, si, sg are not in 
LOCKED
and SHUTTING_DOWN, that means either node or nodegroup MUST be in LOCKED or 
SHUTTING
DOWN. In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know a 
nodegroup
operation was running. However, if saAmfNGAdminState is in LOCKED, this case is 
an
ambiguity of locking a node. The reason of differentiation of locking a node or 
node
group is because 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER for 
node
group operation while AVD_SG_FSM_SU_OPER is only used for node operation. When 
2N SG
uses AVD_SG_FSM_SG_ADMIN for nodegroup, the saAmfSGAdminState is borrowed (but 
not
updated to IMM) to run the admin operation sequence. Therefore, after headless 
if
AVD_SG_FSM_SG_ADMIN was being used for nodegroup then saAmfSGAdminState also 
needs to
be set.

Because SG FSM state is used to restore nodegroup during restoring susi 
assignment,
the osaAmfSGFsmState (RTA) needs to be read earlier than reading susi 
assignment. This
needs active AMFD become implementer earlier than reading sg object. There was 
a known
ticket reported in 1720, if only make active AMFD as early implementer than it 
will
cause the standby AMFD missing ccb apply callback.This patch also needs to set 
both
active and standby AMFD become implementer and applier earlier so that AMFD can 
read
osaAmfSGFsmState and do not cause regression of 1720.

diff --git a/osaf/services/saf/amf/amfd/include/node.h 
b/osaf/services/saf/amf/amfd/include/node.h
--- a/osaf/services/saf/amf/amfd/include/node.h
+++ b/osaf/services/saf/amf/amfd/include/node.h
@@ -45,6 +45,7 @@
 #include <set>
 #include <vector>
 #include <string>
+#include <susi.h>
 
 class AVD_SU;
 struct avd_cluster_tag;
@@ -232,4 +233,6 @@ extern void ng_complete_admin_op(AVD_AMF
 extern void avd_ng_admin_state_set(AVD_AMF_NG* ng, SaAmfAdminStateT state);
 extern bool are_all_ngs_in_unlocked_state(const AVD_AVND *node);
 extern bool any_ng_in_locked_in_state(const AVD_AVND *node);
+void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
susi);
+
 #endif
diff --git a/osaf/services/saf/amf/amfd/include/sg.h 
b/osaf/services/saf/amf/amfd/include/sg.h
--- a/osaf/services/saf/amf/amfd/include/sg.h
+++ b/osaf/services/saf/amf/amfd/include/sg.h
@@ -46,6 +46,7 @@
 class AVD_SU;
 class AVD_SI;
 class AVD_APP;
+class AVD_AMF_NG;
 
 /* The valid SG FSM states. */
 typedef enum {
diff --git a/osaf/services/saf/amf/amfd/nodegroup.cc 
b/osaf/services/saf/amf/amfd/nodegroup.cc
--- a/osaf/services/saf/amf/amfd/nodegroup.cc
+++ b/osaf/services/saf/amf/amfd/nodegroup.cc
@@ -1356,3 +1356,86 @@ void avd_ng_constructor(void)
                        ng_ccb_apply_cb);
 }
 
+/*
+ * @brief: Restore nodegroup related variables which are used in nodegroup
+ *         operations before headless. The main variables are @admin_ng,
+ *         @ng_using_saAmfSGAdminState, and the saAmfSGAdminState which
+ *         is borrowed by nodegroup operation on 2N SG
+ * @param: control block, and susi is being read after headless
+ * @return: void
+ */
+
+void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
susi) {
+
+       TRACE_ENTER();
+       osafassert(cb->scs_absence_max_duration > 0);
+       // In order to restore nodegroup operation, AMF needs to know exactly
+       // whether nodegroup operation was running during headless up on @susi
+       // If susi is in QUIESCED/QUIESCING or being removed while its related
+       // entities su, si, sg are not in LOCKED and SHUTTING_DOWN, that means
+       // either node or nodegroup MUST be in LOCKED or SHUTTING_DOWN.
+       // In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know
+       // a nodegroup operation was running. However, if saAmfNGAdminState is
+       // in LOCKED, this case is an ambiguity of locking a node.
+       // The reason of differentiation of locking a node or nodegroup is 
because
+       // 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER for 
nodegroup
+       // operation while AVD_SG_FSM_SU_OPER is only used for node operation.
+       // When 2N SG uses AVD_SG_FSM_SG_ADMIN for nodegroup, the 
saAmfSGAdminState
+       // is borrowed (but not updated to IMM) to run the admin operation 
sequence.
+       // Therefore, after headless if AVD_SG_FSM_SG_ADMIN was being used for 
nodegroup
+       // then saAmfSGAdminState also needs to be set.
+
+       // Make sure su, si, sg are not in LOCKED, SHUTTING_DOWN
+       if (susi->su->saAmfSUAdminState != SA_AMF_ADMIN_LOCKED &&
+                       susi->su->saAmfSUAdminState != 
SA_AMF_ADMIN_SHUTTING_DOWN &&
+                       susi->si->saAmfSIAdminState != SA_AMF_ADMIN_LOCKED &&
+                       susi->si->saAmfSIAdminState != 
SA_AMF_ADMIN_SHUTTING_DOWN &&
+                       susi->su->sg_of_su->saAmfSGAdminState != 
SA_AMF_ADMIN_LOCKED &&
+                       susi->su->sg_of_su->saAmfSGAdminState != 
SA_AMF_ADMIN_SHUTTING_DOWN) {
+
+               // susi are in state transition
+               if (susi->state == SA_AMF_HA_QUIESCED || susi->state == 
SA_AMF_HA_QUIESCING ||
+                               susi->fsm == AVD_SU_SI_STATE_UNASGN) {
+
+               for (std::map<std::string, AVD_AMF_NG*>::const_iterator it = 
nodegroup_db->begin();
+                       it != nodegroup_db->end(); it++) {
+                       AVD_AMF_NG *ng = it->second;
+                       for (std::set<std::string>::const_iterator iter = 
ng->saAmfNGNodeList.begin();
+                                       iter != ng->saAmfNGNodeList.end(); 
++iter) {
+                               AVD_AVND *node = avd_node_get(*iter);
+
+                               if (susi->su->su_on_node == node) {
+                                       // Nodegroup was shutting down, also 
needs to check whether nodegroup
+                                       // operation was borrowing SG ADMIN FSM 
code (for 2N only)
+                                       if (ng->saAmfNGAdminState == 
SA_AMF_ADMIN_SHUTTING_DOWN) {
+                                               node->admin_ng = ng;
+                                               if 
(susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
+                                                       
osafassert(susi->su->sg_of_su->sg_redundancy_model == 
SA_AMF_2N_REDUNDANCY_MODEL);
+                                                               
susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
+                                                               
susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
+                                                               
m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
AVSV_CKPT_SG_ADMIN_STATE);
+                                               }
+                                       }
+                                       // Node group was LOCKED, AMFD treat 
nodegroup and node operation in the same view,
+                                       // except nodegroup was borrowing SG 
ADMIN FSM. This is because the other case that
+                                       // nodegroup operation was only SG 
SU_OPER FSM, which is the same as of node operation
+                                       if (ng->saAmfNGAdminState == 
SA_AMF_ADMIN_LOCKED) {
+                                               
if(susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
+                                                       
osafassert(susi->su->sg_of_su->sg_redundancy_model == 
SA_AMF_2N_REDUNDANCY_MODEL);
+                                                               node->admin_ng 
= ng;
+                                                               
susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
+                                                               
susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
+                                                               
m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
AVSV_CKPT_SG_ADMIN_STATE);
+                                               }
+                                       }
+                                               TRACE("Restore nodegroup 
operation, SG(%s), ng_using_saAmfSGAdminState: %u, saAmfSGAdminState:%u",
+                                                               
susi->su->sg_of_su->name.c_str(), 
susi->su->sg_of_su->ng_using_saAmfSGAdminState,
+                                                               
susi->su->sg_of_su->saAmfSGAdminState);
+
+                               }
+                       }
+               }
+               }
+       }
+       TRACE_LEAVE();
+}
diff --git a/osaf/services/saf/amf/amfd/role.cc 
b/osaf/services/saf/amf/amfd/role.cc
--- a/osaf/services/saf/amf/amfd/role.cc
+++ b/osaf/services/saf/amf/amfd/role.cc
@@ -260,6 +260,11 @@ uint32_t avd_active_role_initialization(
 
        TRACE_ENTER();
 
+       if (avd_imm_impl_set() != SA_AIS_OK) {
+               LOG_ER("avd_imm_impl_set FAILED");
+               goto done;
+       }
+
        if (avd_imm_config_get() != NCSCC_RC_SUCCESS) {
                LOG_ER("avd_imm_config_get FAILED");
                goto done;
@@ -267,11 +272,6 @@ uint32_t avd_active_role_initialization(
 
        cb->init_state = AVD_CFG_DONE;
 
-       if (avd_imm_impl_set() != SA_AIS_OK) {
-               LOG_ER("avd_imm_impl_set FAILED");
-               goto done;
-       }
-
        avd_imm_update_runtime_attrs();
 
        status = NCSCC_RC_SUCCESS;
@@ -294,6 +294,11 @@ uint32_t avd_standby_role_initialization
 
        TRACE_ENTER();
 
+       if (avd_imm_applier_set() != SA_AIS_OK) {
+               LOG_ER("avd_imm_applier_set FAILED");
+               goto done;
+       }
+
        if (avd_imm_config_get() != NCSCC_RC_SUCCESS) {
                LOG_ER("avd_imm_config_get FAILED");
                goto done;
@@ -301,11 +306,6 @@ uint32_t avd_standby_role_initialization
 
        cb->init_state = AVD_CFG_DONE;
 
-       if (avd_imm_applier_set() != SA_AIS_OK) {
-               LOG_ER("avd_imm_applier_set FAILED");
-               goto done;
-       }
-
        status = NCSCC_RC_SUCCESS;
 done:
        TRACE_LEAVE();
diff --git a/osaf/services/saf/amf/amfd/sg.cc b/osaf/services/saf/amf/amfd/sg.cc
--- a/osaf/services/saf/amf/amfd/sg.cc
+++ b/osaf/services/saf/amf/amfd/sg.cc
@@ -374,6 +374,13 @@ static AVD_SG *sg_create(const std::stri
        if (immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfSGAdminState"), 
attributes, 0, &sg->saAmfSGAdminState) != SA_AIS_OK) {
                sg->saAmfSGAdminState = SA_AMF_ADMIN_UNLOCKED;
        }
+       // only read SG FSM State for non-ncs SG
+       if (sg_name.find("safApp=OpenSAF") == std::string::npos) {
+               if 
(immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"), attributes, 
0, &sg->sg_fsm_state) != SA_AIS_OK) {
+                       sg->sg_fsm_state = AVD_SG_FSM_STABLE;
+               }
+               TRACE("sg_fsm_state(%u) read from osafAmfSGFsmState", 
sg->sg_fsm_state);
+       }
 
        /*  TODO use value in type instead? */
        sg->sg_redundancy_model = sgt->saAmfSgtRedundancyModel;
@@ -422,6 +429,7 @@ SaAisErrorT avd_sg_config_get(const std:
                const_cast<SaImmAttrNameT>("saAmfSGSuRestartProb"),
                const_cast<SaImmAttrNameT>("saAmfSGSuRestartMax"),
                const_cast<SaImmAttrNameT>("saAmfSGAdminState"),
+               const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
                nullptr
        };
 
@@ -2094,10 +2102,8 @@ void avd_sg_read_headless_cached_rta(AVD
        AVD_SG *sg;
        unsigned int num_of_values = 0;
        const SaImmAttrValuesT_2 **attributes;
-       AVD_SG_FSM_STATE imm_sg_fsm_state;
        const char *className = "SaAmfSG";
        const SaImmAttrNameT searchAttributes[] = {
-               const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
                const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
                NULL
        };
@@ -2122,11 +2128,6 @@ void avd_sg_read_headless_cached_rta(AVD
                                        (SaImmAttrValuesT_2 ***)&attributes)) 
== SA_AIS_OK) {
                sg = sg_db->find(Amf::to_string(&sg_dn));
                if (sg && sg->sg_ncs_spec == false) {
-                       // Read sg fsm state
-                       rc = 
immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
-                                       attributes, 0, &imm_sg_fsm_state);
-                       osafassert(rc == SA_AIS_OK);
-                       sg->set_fsm_state(imm_sg_fsm_state, false);
                        // Read sg operation list
                        if 
(immutil_getAttrValuesNumber(const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
 attributes, &num_of_values) == SA_AIS_OK) {
                                unsigned int i;
diff --git a/osaf/services/saf/amf/amfd/sgproc.cc 
b/osaf/services/saf/amf/amfd/sgproc.cc
--- a/osaf/services/saf/amf/amfd/sgproc.cc
+++ b/osaf/services/saf/amf/amfd/sgproc.cc
@@ -1112,7 +1112,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
                }
 
                if (su->list_of_susi == AVD_SU_SI_REL_NULL) {
-                       LOG_ER("%s: no susis", __FUNCTION__);
+                       LOG_WA("%s: no susis", __FUNCTION__);
                        goto done;
                }
 
diff --git a/osaf/services/saf/amf/amfd/siass.cc 
b/osaf/services/saf/amf/amfd/siass.cc
--- a/osaf/services/saf/amf/amfd/siass.cc
+++ b/osaf/services/saf/amf/amfd/siass.cc
@@ -261,7 +261,9 @@ void avd_susi_read_headless_cached_rta(A
                                                m_AVD_SET_SG_ADMIN_SI(cb, si);
                                }
                        }
-
+                       // only restore if not done
+                       if (susi->su->su_on_node->admin_ng == nullptr)
+                               avd_ng_restore_headless_states(cb, susi);
                } else {
                        // This susi does not exist after headless, but it's 
still in IMM
                        // delete it for now

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to