Re: [devel] [PATCH 1 of 1] clm: avoid stale node down processing and unexpected track callback [#1120]

Hans Feldt Mon, 22 Sep 2014 08:06:01 -0700

I think this looks good. Just one small comment. In the commit message nothing 
is mentioned about the moved checkpointing from after to before sending track 
callbacks.


Please comment this or even preferably move it to its own changeset.

Thanks,
Hans

> -----Original Message-----
> From: [email protected] [mailto:[email protected]]
> Sent: den 23 september 2014 02:12
> To: Hans Nordebäck; [email protected]
> Cc: [email protected]
> Subject: [devel] [PATCH 1 of 1] clm: avoid stale node down processing and 
> unexpected track callback [#1120]
> 
>  osaf/services/saf/clmsv/clms/clms_cb.h  |   6 ++++
>  osaf/services/saf/clmsv/clms/clms_evt.c |  48 
> ++++++++++++++++++++++++++++----
>  2 files changed, 47 insertions(+), 7 deletions(-)
> 
> 
> There is a possiblity that the checkpointing message for a NODE_DOWN reaches 
> the STANDBY first, i.e.
> before the MDS delivers the NODE_DOWN event the the standby.
>  This can result in stale node_down record getting stored in the node_down 
> list which is a designated list
> for processing of node downs that occur during role change from standby to 
> active.
> The patch introduces a variable that checks whether the checkpoint event for 
> node_down has
> arrived first, followed by a check during role change to ignore such stale 
> events.
> 
> diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h 
> b/osaf/services/saf/clmsv/clms/clms_cb.h
> --- a/osaf/services/saf/clmsv/clms/clms_cb.h
> +++ b/osaf/services/saf/clmsv/clms/clms_cb.h
> @@ -37,6 +37,11 @@ typedef enum {
>       IMM_RECONFIGURED = 5
>  } ADMIN_OP;
> 
> +typedef enum {
> +     CHECKPOINT_PROCESSED = 1,
> +     MDS_DOWN_PROCESSED
> +} NODE_DOWN_STATUS;
> +
>  /* Cluster Properties */
>  typedef struct cluster_db_t {
>       SaNameT name;
> @@ -124,6 +129,7 @@ typedef struct clma_down_list_tag {
> 
>  typedef struct node_down_list_tag {
>       SaClmNodeIdT node_id;
> +     NODE_DOWN_STATUS ndown_status;
>       struct node_down_list_tag *next;
>  } NODE_DOWN_LIST;
> 
> diff --git a/osaf/services/saf/clmsv/clms/clms_evt.c 
> b/osaf/services/saf/clmsv/clms/clms_evt.c
> --- a/osaf/services/saf/clmsv/clms/clms_evt.c
> +++ b/osaf/services/saf/clmsv/clms/clms_evt.c
> @@ -471,6 +471,13 @@ void clms_track_send_node_down(CLMS_CLUS
>       node->stat_change = SA_TRUE;
>       node->change = SA_CLM_NODE_LEFT;
>       ++(clms_cb->cluster_view_num);
> +
> +     if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> +             ckpt_node_rec(node);
> +             ckpt_node_down_rec(node);
> +             ckpt_cluster_rec();
> +     }
> +
>       clms_send_track(clms_cb, node, SA_CLM_CHANGE_COMPLETED);
>       /* Clear node->stat_change after sending the callback to its clients */
>       node->stat_change = SA_FALSE;
> @@ -480,12 +487,6 @@ void clms_track_send_node_down(CLMS_CLUS
>       clms_node_update_rattr(node);
>       clms_cluster_update_rattr(osaf_cluster);
> 
> -     if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> -             ckpt_node_rec(node);
> -             ckpt_node_down_rec(node);
> -             ckpt_cluster_rec();
> -     }
> -
>       /*For the NODE DOWN, boottimestamp will not be updated */
> 
>       /* Delete the node reference from the nodeid database */
> @@ -592,6 +593,7 @@ static uint32_t proc_mds_node_evt(CLMSV_
>                               clms_cb->node_down_list_tail->next = 
> node_down_rec;
>               }
>               clms_cb->node_down_list_tail = node_down_rec;
> +             node_down_rec->ndown_status = MDS_DOWN_PROCESSED;
>       }
> 
>   done:
> @@ -1613,6 +1615,7 @@ void clms_remove_node_down_rec(SaClmNode
>  {
>       NODE_DOWN_LIST *node_down_rec = clms_cb->node_down_list_head;
>       NODE_DOWN_LIST *prev_rec = NULL;
> +     bool record_found = false;
> 
>       while (node_down_rec) {
>               if (node_down_rec->node_id == node_id) {
> @@ -1638,11 +1641,36 @@ void clms_remove_node_down_rec(SaClmNode
>                       /* Free the NODE_DOWN_REC */
>                       free(node_down_rec);
>                       node_down_rec = NULL;
> +                     record_found = true;
>                       break;
>               }
>               prev_rec = node_down_rec;       /* Remember address of this 
> entry */
>               node_down_rec = node_down_rec->next;    /* Go to next entry */
>       }
> +
> +     if (!record_found) {
> +             /* MDS node_down has not yet reached the STANDBY,
> +              * Just add this checkupdate record to the list. MDS_DOWN 
> processing will delete it.
> +              * If role change happens before MDS_DOWN is recieved,
> +              * then role change processing just ignores the record and 
> removes it
> +              * from the list.
> +              */
> +             node_down_rec = NULL;
> +             if ((node_down_rec = (NODE_DOWN_LIST *) 
> malloc(sizeof(NODE_DOWN_LIST))) == NULL) {
> +                     LOG_CR("Memory Allocation for NODE_DOWN_LIST failed");
> +                     return;
> +             }
> +             memset(node_down_rec, 0, sizeof(NODE_DOWN_LIST));
> +             node_down_rec->node_id = node_id;
> +             if (clms_cb->node_down_list_head == NULL) {
> +                     clms_cb->node_down_list_head = node_down_rec;
> +             } else {
> +                     if (clms_cb->node_down_list_tail)
> +                             clms_cb->node_down_list_tail->next = 
> node_down_rec;
> +             }
> +             clms_cb->node_down_list_tail = node_down_rec;
> +             node_down_rec->ndown_status = CHECKPOINT_PROCESSED;
> +     }
>  }
> 
>  /**
> @@ -1696,7 +1724,13 @@ void proc_downs_during_rolechange (void)
>               /*Remove NODE_DOWN_REC from the NODE_DOWN_LIST */
>               node = clms_node_get_by_id(node_down_rec->node_id);
>               temp_node_down_rec = node_down_rec;
> -             if (node != NULL)
> +             /* If nodedown status is CHECKPOINT_PROCESSED, it means that
> +              * a checkpoint update was received when this node was STANDBY, 
> but
> +              * the MDS node_down did not reach the STANDBY. An extremely 
> rare chance,
> +              * but good to have protection for it, by ignoring the record
> +              * if the record is in CHECKPOINT_PROCESSED state.
> +              */
> +             if ((node != NULL) && (temp_node_down_rec->ndown_status != 
> CHECKPOINT_PROCESSED))
>                       clms_track_send_node_down(node);
>               node_down_rec = node_down_rec->next;
>               /*Free the NODE_DOWN_REC */
> 
> ------------------------------------------------------------------------------
> Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
> Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
> Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
> Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
> http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
> _______________________________________________
> Opensaf-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel

------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1 of 1] clm: avoid stale node down processing and unexpected track callback [#1120]

Reply via email to