Re: [devel] [PATCH 1 of 1] clm: avoid stale node down processing and unexpected track callback [#1120]

Hans Nordebäck Mon, 22 Sep 2014 23:06:07 -0700

Hi Mathi, I tested the patch and SA_CLM_NODE_LEFT is sent to the active node:

./rootfs/var/PL-4/log/messages:Sep 23 07:31:23 PL-4 local0.notice 
osafamfnd[391]: NO This node has exited the cluster


This happens when run in UML and power off is done in clms_track_send_node_down before 
sending checkpoint data.

/Regards HansN

On 09/23/14 02:12, mathi.naic...@oracle.com wrote:
>   osaf/services/saf/clmsv/clms/clms_cb.h  |   6 ++++
>   osaf/services/saf/clmsv/clms/clms_evt.c |  48 
> ++++++++++++++++++++++++++++----
>   2 files changed, 47 insertions(+), 7 deletions(-)
>
>
> There is a possibility that the checkpointing message for a NODE_DOWN reaches 
> the STANDBY first, i.e.
> before the MDS delivers the NODE_DOWN event to the standby.
>   This can result in a stale node_down record getting stored in the node_down 
> list which is a designated list
> for processing of node downs that occur during role change from standby to 
> active.
> The patch introduces a variable that checks whether the checkpoint event for 
> node_down has
> arrived first, followed by a check during role change to ignore such stale 
> events.
>
> diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h 
> b/osaf/services/saf/clmsv/clms/clms_cb.h
> --- a/osaf/services/saf/clmsv/clms/clms_cb.h
> +++ b/osaf/services/saf/clmsv/clms/clms_cb.h
> @@ -37,6 +37,11 @@ typedef enum {
>       IMM_RECONFIGURED = 5
>   } ADMIN_OP;
>   
> +typedef enum {
> +     CHECKPOINT_PROCESSED = 1,
> +     MDS_DOWN_PROCESSED
> +} NODE_DOWN_STATUS;
> +
>   /* Cluster Properties */
>   typedef struct cluster_db_t {
>       SaNameT name;
> @@ -124,6 +129,7 @@ typedef struct clma_down_list_tag {
>   
>   typedef struct node_down_list_tag {
>       SaClmNodeIdT node_id;
> +     NODE_DOWN_STATUS ndown_status;
>       struct node_down_list_tag *next;
>   } NODE_DOWN_LIST;
>   
> diff --git a/osaf/services/saf/clmsv/clms/clms_evt.c 
> b/osaf/services/saf/clmsv/clms/clms_evt.c
> --- a/osaf/services/saf/clmsv/clms/clms_evt.c
> +++ b/osaf/services/saf/clmsv/clms/clms_evt.c
> @@ -471,6 +471,13 @@ void clms_track_send_node_down(CLMS_CLUS
>       node->stat_change = SA_TRUE;
>       node->change = SA_CLM_NODE_LEFT;
>       ++(clms_cb->cluster_view_num);
> +
> +     if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> +             ckpt_node_rec(node);
> +             ckpt_node_down_rec(node);
> +             ckpt_cluster_rec();
> +     }
> +
>       clms_send_track(clms_cb, node, SA_CLM_CHANGE_COMPLETED);
>       /* Clear node->stat_change after sending the callback to its clients */
>       node->stat_change = SA_FALSE;
> @@ -480,12 +487,6 @@ void clms_track_send_node_down(CLMS_CLUS
>       clms_node_update_rattr(node);
>       clms_cluster_update_rattr(osaf_cluster);
>   
> -     if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> -             ckpt_node_rec(node);
> -             ckpt_node_down_rec(node);
> -             ckpt_cluster_rec();
> -     }
> -
>       /*For the NODE DOWN, boottimestamp will not be updated */
>   
>       /* Delete the node reference from the nodeid database */
> @@ -592,6 +593,7 @@ static uint32_t proc_mds_node_evt(CLMSV_
>                               clms_cb->node_down_list_tail->next = 
> node_down_rec;
>               }
>               clms_cb->node_down_list_tail = node_down_rec;
> +             node_down_rec->ndown_status = MDS_DOWN_PROCESSED;
>       }
>   
>    done:
> @@ -1613,6 +1615,7 @@ void clms_remove_node_down_rec(SaClmNode
>   {
>       NODE_DOWN_LIST *node_down_rec = clms_cb->node_down_list_head;
>       NODE_DOWN_LIST *prev_rec = NULL;
> +     bool record_found = false;
>   
>       while (node_down_rec) {
>               if (node_down_rec->node_id == node_id) {
> @@ -1638,11 +1641,36 @@ void clms_remove_node_down_rec(SaClmNode
>                       /* Free the NODE_DOWN_REC */
>                       free(node_down_rec);
>                       node_down_rec = NULL;
> +                     record_found = true;
>                       break;
>               }
>               prev_rec = node_down_rec;       /* Remember address of this 
> entry */
>               node_down_rec = node_down_rec->next;    /* Go to next entry */
>       }
> +
> +     if (!record_found) {
> +             /* MDS node_down has not yet reached the STANDBY,
> +              * Just add this checkupdate record to the list. MDS_DOWN 
> processing will delete it.
> +              * If role change happens before MDS_DOWN is recieved,
> +              * then role change processing just ignores the record and 
> removes it
> +              * from the list.
> +              */
> +             node_down_rec = NULL;
> +             if ((node_down_rec = (NODE_DOWN_LIST *) 
> malloc(sizeof(NODE_DOWN_LIST))) == NULL) {
> +                     LOG_CR("Memory Allocation for NODE_DOWN_LIST failed");
> +                     return;
> +             }
> +             memset(node_down_rec, 0, sizeof(NODE_DOWN_LIST));
> +             node_down_rec->node_id = node_id;
> +             if (clms_cb->node_down_list_head == NULL) {
> +                     clms_cb->node_down_list_head = node_down_rec;
> +             } else {
> +                     if (clms_cb->node_down_list_tail)
> +                             clms_cb->node_down_list_tail->next = 
> node_down_rec;
> +             }
> +             clms_cb->node_down_list_tail = node_down_rec;
> +             node_down_rec->ndown_status = CHECKPOINT_PROCESSED;
> +     }
>   }
>   
>   /**
> @@ -1696,7 +1724,13 @@ void proc_downs_during_rolechange (void)
>               /*Remove NODE_DOWN_REC from the NODE_DOWN_LIST */
>               node = clms_node_get_by_id(node_down_rec->node_id);
>               temp_node_down_rec = node_down_rec;
> -             if (node != NULL)
> +             /* If nodedown status is CHECKPOINT_PROCESSED, it means that
> +              * a checkpoint update was received when this node was STANDBY, 
> but
> +              * the MDS node_down did not reach the STANDBY. An extremely 
> rare chance,
> +              * but good to have protection for it, by ignoring the record
> +              * if the record is in CHECKPOINT_PROCESSED state.
> +              */
> +             if ((node != NULL) && (temp_node_down_rec->ndown_status != 
> CHECKPOINT_PROCESSED))
>                       clms_track_send_node_down(node);
>               node_down_rec = node_down_rec->next;
>               /*Free the NODE_DOWN_REC */


------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1 of 1] clm: avoid stale node down processing and unexpected track callback [#1120]

Reply via email to