I added the following:
void clms_track_send_node_down(CLMS_CLUSTER_NODE *node)
:
if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
if (!(rand() % 7)) {
reboot(0x4321fedc /*LINUX_REBOOT_CMD_POWER_OFF*/);
}
:
to simulate a real case where the SC-1 was powered off, and the pc
probably at this address.
./opensaf nodestop 4
./opensaf nodestart 4
./opensaf nodestop 3
./opensaf nodestart 3
/HansN
On 09/23/14 08:13, Hans Feldt wrote:
> What node did originally leave the cluster since you ended up in
> clms_track_send_node_down?
>
> You must have killed first one node (PLx?) and then the active SC was
> rebooted in the middle of handling this.
>
> /HansF
>
>> -----Original Message-----
>> From: Hans Nordebäck [mailto:[email protected]]
>> Sent: den 23 september 2014 08:06
>> To: [email protected]
>> Cc: [email protected]
>> Subject: Re: [devel] [PATCH 1 of 1] clm: avoid stale node down processing
>> and unexpected track callback [#1120]
>>
>> Hi Mathi, I tested the patch and SA_CLM_NODE_LEFT are sent to active node:
>>
>> ./rootfs/var/PL-4/log/messages:Sep 23 07:31:23 PL-4 local0.notice
>> osafamfnd[391]: NO This node has exited the cluster
>>
>> if run in UML and power off is done in clms_track_send_node_down before
>> sending checkpoint data.
>>
>> /Regards HansN
>>
>> On 09/23/14 02:12, [email protected] wrote:
>>> osaf/services/saf/clmsv/clms/clms_cb.h | 6 ++++
>>> osaf/services/saf/clmsv/clms/clms_evt.c | 48
>>> ++++++++++++++++++++++++++++----
>>> 2 files changed, 47 insertions(+), 7 deletions(-)
>>>
>>>
>>> There is a possiblity that the checkpointing message for a NODE_DOWN
>>> reaches the STANDBY first, i.e.
>>> before the MDS delivers the NODE_DOWN event the the standby.
>>> This can result in stale node_down record getting stored in the
>>> node_down list which is a designated list
>>> for processing of node downs that occur during role change from standby to
>>> active.
>>> The patch introduces a variable that checks whether the checkpoint event
>>> for node_down has
>>> arrived first, followed by a check during role change to ignore such stale
>>> events.
>>>
>>> diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h
>>> b/osaf/services/saf/clmsv/clms/clms_cb.h
>>> --- a/osaf/services/saf/clmsv/clms/clms_cb.h
>>> +++ b/osaf/services/saf/clmsv/clms/clms_cb.h
>>> @@ -37,6 +37,11 @@ typedef enum {
>>> IMM_RECONFIGURED = 5
>>> } ADMIN_OP;
>>>
>>> +typedef enum {
>>> + CHECKPOINT_PROCESSED = 1,
>>> + MDS_DOWN_PROCESSED
>>> +} NODE_DOWN_STATUS;
>>> +
>>> /* Cluster Properties */
>>> typedef struct cluster_db_t {
>>> SaNameT name;
>>> @@ -124,6 +129,7 @@ typedef struct clma_down_list_tag {
>>>
>>> typedef struct node_down_list_tag {
>>> SaClmNodeIdT node_id;
>>> + NODE_DOWN_STATUS ndown_status;
>>> struct node_down_list_tag *next;
>>> } NODE_DOWN_LIST;
>>>
>>> diff --git a/osaf/services/saf/clmsv/clms/clms_evt.c
>>> b/osaf/services/saf/clmsv/clms/clms_evt.c
>>> --- a/osaf/services/saf/clmsv/clms/clms_evt.c
>>> +++ b/osaf/services/saf/clmsv/clms/clms_evt.c
>>> @@ -471,6 +471,13 @@ void clms_track_send_node_down(CLMS_CLUS
>>> node->stat_change = SA_TRUE;
>>> node->change = SA_CLM_NODE_LEFT;
>>> ++(clms_cb->cluster_view_num);
>>> +
>>> + if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
>>> + ckpt_node_rec(node);
>>> + ckpt_node_down_rec(node);
>>> + ckpt_cluster_rec();
>>> + }
>>> +
>>> clms_send_track(clms_cb, node, SA_CLM_CHANGE_COMPLETED);
>>> /* Clear node->stat_change after sending the callback to its clients */
>>> node->stat_change = SA_FALSE;
>>> @@ -480,12 +487,6 @@ void clms_track_send_node_down(CLMS_CLUS
>>> clms_node_update_rattr(node);
>>> clms_cluster_update_rattr(osaf_cluster);
>>>
>>> - if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
>>> - ckpt_node_rec(node);
>>> - ckpt_node_down_rec(node);
>>> - ckpt_cluster_rec();
>>> - }
>>> -
>>> /*For the NODE DOWN, boottimestamp will not be updated */
>>>
>>> /* Delete the node reference from the nodeid database */
>>> @@ -592,6 +593,7 @@ static uint32_t proc_mds_node_evt(CLMSV_
>>> clms_cb->node_down_list_tail->next =
>>> node_down_rec;
>>> }
>>> clms_cb->node_down_list_tail = node_down_rec;
>>> + node_down_rec->ndown_status = MDS_DOWN_PROCESSED;
>>> }
>>>
>>> done:
>>> @@ -1613,6 +1615,7 @@ void clms_remove_node_down_rec(SaClmNode
>>> {
>>> NODE_DOWN_LIST *node_down_rec = clms_cb->node_down_list_head;
>>> NODE_DOWN_LIST *prev_rec = NULL;
>>> + bool record_found = false;
>>>
>>> while (node_down_rec) {
>>> if (node_down_rec->node_id == node_id) {
>>> @@ -1638,11 +1641,36 @@ void clms_remove_node_down_rec(SaClmNode
>>> /* Free the NODE_DOWN_REC */
>>> free(node_down_rec);
>>> node_down_rec = NULL;
>>> + record_found = true;
>>> break;
>>> }
>>> prev_rec = node_down_rec; /* Remember address of this
>>> entry */
>>> node_down_rec = node_down_rec->next; /* Go to next entry */
>>> }
>>> +
>>> + if (!record_found) {
>>> + /* MDS node_down has not yet reached the STANDBY,
>>> + * Just add this checkupdate record to the list. MDS_DOWN
>>> processing will delete it.
>>> + * If role change happens before MDS_DOWN is recieved,
>>> + * then role change processing just ignores the record and
>>> removes it
>>> + * from the list.
>>> + */
>>> + node_down_rec = NULL;
>>> + if ((node_down_rec = (NODE_DOWN_LIST *)
>>> malloc(sizeof(NODE_DOWN_LIST))) == NULL) {
>>> + LOG_CR("Memory Allocation for NODE_DOWN_LIST failed");
>>> + return;
>>> + }
>>> + memset(node_down_rec, 0, sizeof(NODE_DOWN_LIST));
>>> + node_down_rec->node_id = node_id;
>>> + if (clms_cb->node_down_list_head == NULL) {
>>> + clms_cb->node_down_list_head = node_down_rec;
>>> + } else {
>>> + if (clms_cb->node_down_list_tail)
>>> + clms_cb->node_down_list_tail->next =
>>> node_down_rec;
>>> + }
>>> + clms_cb->node_down_list_tail = node_down_rec;
>>> + node_down_rec->ndown_status = CHECKPOINT_PROCESSED;
>>> + }
>>> }
>>>
>>> /**
>>> @@ -1696,7 +1724,13 @@ void proc_downs_during_rolechange (void)
>>> /*Remove NODE_DOWN_REC from the NODE_DOWN_LIST */
>>> node = clms_node_get_by_id(node_down_rec->node_id);
>>> temp_node_down_rec = node_down_rec;
>>> - if (node != NULL)
>>> + /* If nodedown status is CHECKPOINT_PROCESSED, it means that
>>> + * a checkpoint update was received when this node was STANDBY,
>>> but
>>> + * the MDS node_down did not reach the STANDBY. An extremely
>>> rare chance,
>>> + * but good to have protection for it, by ignoring the record
>>> + * if the record is in CHECKPOINT_PROCESSED state.
>>> + */
>>> + if ((node != NULL) && (temp_node_down_rec->ndown_status !=
>>> CHECKPOINT_PROCESSED))
>>> clms_track_send_node_down(node);
>>> node_down_rec = node_down_rec->next;
>>> /*Free the NODE_DOWN_REC */
>>
>> ------------------------------------------------------------------------------
>> Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
>> Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
>> Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
>> Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
>> http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
>> _______________________________________________
>> Opensaf-devel mailing list
>> [email protected]
>> https://lists.sourceforge.net/lists/listinfo/opensaf-devel
------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel