osaf/services/saf/clmsv/clms/clms_cb.h | 6 ++++
osaf/services/saf/clmsv/clms/clms_evt.c | 48 ++++++++++++++++++++++++++++----
2 files changed, 47 insertions(+), 7 deletions(-)
There is a possiblity that the checkpointing message for a NODE_DOWN reaches
the STANDBY first, i.e.
before the MDS delivers the NODE_DOWN event the the standby.
This can result in stale node_down record getting stored in the node_down list
which is a designated list
for processing of node downs that occur during role change from standby to
active.
The patch introduces a variable that checks whether the checkpoint event for
node_down has
arrived first, followed by a check during role change to ignore such stale
events.
diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h
b/osaf/services/saf/clmsv/clms/clms_cb.h
--- a/osaf/services/saf/clmsv/clms/clms_cb.h
+++ b/osaf/services/saf/clmsv/clms/clms_cb.h
@@ -37,6 +37,11 @@ typedef enum {
IMM_RECONFIGURED = 5
} ADMIN_OP;
+typedef enum {
+ CHECKPOINT_PROCESSED = 1,
+ MDS_DOWN_PROCESSED
+} NODE_DOWN_STATUS;
+
/* Cluster Properties */
typedef struct cluster_db_t {
SaNameT name;
@@ -124,6 +129,7 @@ typedef struct clma_down_list_tag {
typedef struct node_down_list_tag {
SaClmNodeIdT node_id;
+ NODE_DOWN_STATUS ndown_status;
struct node_down_list_tag *next;
} NODE_DOWN_LIST;
diff --git a/osaf/services/saf/clmsv/clms/clms_evt.c
b/osaf/services/saf/clmsv/clms/clms_evt.c
--- a/osaf/services/saf/clmsv/clms/clms_evt.c
+++ b/osaf/services/saf/clmsv/clms/clms_evt.c
@@ -471,6 +471,13 @@ void clms_track_send_node_down(CLMS_CLUS
node->stat_change = SA_TRUE;
node->change = SA_CLM_NODE_LEFT;
++(clms_cb->cluster_view_num);
+
+ if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
+ ckpt_node_rec(node);
+ ckpt_node_down_rec(node);
+ ckpt_cluster_rec();
+ }
+
clms_send_track(clms_cb, node, SA_CLM_CHANGE_COMPLETED);
/* Clear node->stat_change after sending the callback to its clients */
node->stat_change = SA_FALSE;
@@ -480,12 +487,6 @@ void clms_track_send_node_down(CLMS_CLUS
clms_node_update_rattr(node);
clms_cluster_update_rattr(osaf_cluster);
- if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
- ckpt_node_rec(node);
- ckpt_node_down_rec(node);
- ckpt_cluster_rec();
- }
-
/*For the NODE DOWN, boottimestamp will not be updated */
/* Delete the node reference from the nodeid database */
@@ -592,6 +593,7 @@ static uint32_t proc_mds_node_evt(CLMSV_
clms_cb->node_down_list_tail->next =
node_down_rec;
}
clms_cb->node_down_list_tail = node_down_rec;
+ node_down_rec->ndown_status = MDS_DOWN_PROCESSED;
}
done:
@@ -1613,6 +1615,7 @@ void clms_remove_node_down_rec(SaClmNode
{
NODE_DOWN_LIST *node_down_rec = clms_cb->node_down_list_head;
NODE_DOWN_LIST *prev_rec = NULL;
+ bool record_found = false;
while (node_down_rec) {
if (node_down_rec->node_id == node_id) {
@@ -1638,11 +1641,36 @@ void clms_remove_node_down_rec(SaClmNode
/* Free the NODE_DOWN_REC */
free(node_down_rec);
node_down_rec = NULL;
+ record_found = true;
break;
}
prev_rec = node_down_rec; /* Remember address of this
entry */
node_down_rec = node_down_rec->next; /* Go to next entry */
}
+
+ if (!record_found) {
+ /* MDS node_down has not yet reached the STANDBY,
+ * Just add this checkupdate record to the list. MDS_DOWN
processing will delete it.
+ * If role change happens before MDS_DOWN is recieved,
+ * then role change processing just ignores the record and
removes it
+ * from the list.
+ */
+ node_down_rec = NULL;
+ if ((node_down_rec = (NODE_DOWN_LIST *)
malloc(sizeof(NODE_DOWN_LIST))) == NULL) {
+ LOG_CR("Memory Allocation for NODE_DOWN_LIST failed");
+ return;
+ }
+ memset(node_down_rec, 0, sizeof(NODE_DOWN_LIST));
+ node_down_rec->node_id = node_id;
+ if (clms_cb->node_down_list_head == NULL) {
+ clms_cb->node_down_list_head = node_down_rec;
+ } else {
+ if (clms_cb->node_down_list_tail)
+ clms_cb->node_down_list_tail->next =
node_down_rec;
+ }
+ clms_cb->node_down_list_tail = node_down_rec;
+ node_down_rec->ndown_status = CHECKPOINT_PROCESSED;
+ }
}
/**
@@ -1696,7 +1724,13 @@ void proc_downs_during_rolechange (void)
/*Remove NODE_DOWN_REC from the NODE_DOWN_LIST */
node = clms_node_get_by_id(node_down_rec->node_id);
temp_node_down_rec = node_down_rec;
- if (node != NULL)
+ /* If nodedown status is CHECKPOINT_PROCESSED, it means that
+ * a checkpoint update was received when this node was STANDBY,
but
+ * the MDS node_down did not reach the STANDBY. An extremely
rare chance,
+ * but good to have protection for it, by ignoring the record
+ * if the record is in CHECKPOINT_PROCESSED state.
+ */
+ if ((node != NULL) && (temp_node_down_rec->ndown_status !=
CHECKPOINT_PROCESSED))
clms_track_send_node_down(node);
node_down_rec = node_down_rec->next;
/*Free the NODE_DOWN_REC */
------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel