I'll move it to a separate changeset.

Thanks,
Mathi.

----- [email protected] wrote:

> I think this looks good. Just one small comment. In the commit message
> nothing is mentioned about the moved checkpointing from after to
> before sending track callbacks.
> 
> Please comment this or even preferably move it to its own changeset.
> 
> Thanks,
> Hans
> 
> > -----Original Message-----
> > From: [email protected] [mailto:[email protected]]
> > Sent: den 23 september 2014 02:12
> > To: Hans Nordebäck; [email protected]
> > Cc: [email protected]
> > Subject: [devel] [PATCH 1 of 1] clm: avoid stale node down
> processing and unexpected track callback [#1120]
> > 
> >  osaf/services/saf/clmsv/clms/clms_cb.h  |   6 ++++
> >  osaf/services/saf/clmsv/clms/clms_evt.c |  48
> ++++++++++++++++++++++++++++----
> >  2 files changed, 47 insertions(+), 7 deletions(-)
> > 
> > 
> > There is a possiblity that the checkpointing message for a NODE_DOWN
> reaches the STANDBY first, i.e.
> > before the MDS delivers the NODE_DOWN event the the standby.
> >  This can result in stale node_down record getting stored in the
> node_down list which is a designated list
> > for processing of node downs that occur during role change from
> standby to active.
> > The patch introduces a variable that checks whether the checkpoint
> event for node_down has
> > arrived first, followed by a check during role change to ignore such
> stale events.
> > 
> > diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h
> b/osaf/services/saf/clmsv/clms/clms_cb.h
> > --- a/osaf/services/saf/clmsv/clms/clms_cb.h
> > +++ b/osaf/services/saf/clmsv/clms/clms_cb.h
> > @@ -37,6 +37,11 @@ typedef enum {
> >     IMM_RECONFIGURED = 5
> >  } ADMIN_OP;
> > 
> > +typedef enum {
> > +   CHECKPOINT_PROCESSED = 1,
> > +   MDS_DOWN_PROCESSED
> > +} NODE_DOWN_STATUS;
> > +
> >  /* Cluster Properties */
> >  typedef struct cluster_db_t {
> >     SaNameT name;
> > @@ -124,6 +129,7 @@ typedef struct clma_down_list_tag {
> > 
> >  typedef struct node_down_list_tag {
> >     SaClmNodeIdT node_id;
> > +   NODE_DOWN_STATUS ndown_status;
> >     struct node_down_list_tag *next;
> >  } NODE_DOWN_LIST;
> > 
> > diff --git a/osaf/services/saf/clmsv/clms/clms_evt.c
> b/osaf/services/saf/clmsv/clms/clms_evt.c
> > --- a/osaf/services/saf/clmsv/clms/clms_evt.c
> > +++ b/osaf/services/saf/clmsv/clms/clms_evt.c
> > @@ -471,6 +471,13 @@ void clms_track_send_node_down(CLMS_CLUS
> >     node->stat_change = SA_TRUE;
> >     node->change = SA_CLM_NODE_LEFT;
> >     ++(clms_cb->cluster_view_num);
> > +
> > +   if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> > +           ckpt_node_rec(node);
> > +           ckpt_node_down_rec(node);
> > +           ckpt_cluster_rec();
> > +   }
> > +
> >     clms_send_track(clms_cb, node, SA_CLM_CHANGE_COMPLETED);
> >     /* Clear node->stat_change after sending the callback to its
> clients */
> >     node->stat_change = SA_FALSE;
> > @@ -480,12 +487,6 @@ void clms_track_send_node_down(CLMS_CLUS
> >     clms_node_update_rattr(node);
> >     clms_cluster_update_rattr(osaf_cluster);
> > 
> > -   if(clms_cb->ha_state == SA_AMF_HA_ACTIVE){
> > -           ckpt_node_rec(node);
> > -           ckpt_node_down_rec(node);
> > -           ckpt_cluster_rec();
> > -   }
> > -
> >     /*For the NODE DOWN, boottimestamp will not be updated */
> > 
> >     /* Delete the node reference from the nodeid database */
> > @@ -592,6 +593,7 @@ static uint32_t proc_mds_node_evt(CLMSV_
> >                             clms_cb->node_down_list_tail->next = 
> > node_down_rec;
> >             }
> >             clms_cb->node_down_list_tail = node_down_rec;
> > +           node_down_rec->ndown_status = MDS_DOWN_PROCESSED;
> >     }
> > 
> >   done:
> > @@ -1613,6 +1615,7 @@ void clms_remove_node_down_rec(SaClmNode
> >  {
> >     NODE_DOWN_LIST *node_down_rec = clms_cb->node_down_list_head;
> >     NODE_DOWN_LIST *prev_rec = NULL;
> > +   bool record_found = false;
> > 
> >     while (node_down_rec) {
> >             if (node_down_rec->node_id == node_id) {
> > @@ -1638,11 +1641,36 @@ void clms_remove_node_down_rec(SaClmNode
> >                     /* Free the NODE_DOWN_REC */
> >                     free(node_down_rec);
> >                     node_down_rec = NULL;
> > +                   record_found = true;
> >                     break;
> >             }
> >             prev_rec = node_down_rec;       /* Remember address of this 
> > entry */
> >             node_down_rec = node_down_rec->next;    /* Go to next entry */
> >     }
> > +
> > +   if (!record_found) {
> > +           /* MDS node_down has not yet reached the STANDBY,
> > +            * Just add this checkupdate record to the list. MDS_DOWN
> processing will delete it.
> > +            * If role change happens before MDS_DOWN is recieved,
> > +            * then role change processing just ignores the record and
> removes it
> > +            * from the list.
> > +            */
> > +           node_down_rec = NULL;
> > +           if ((node_down_rec = (NODE_DOWN_LIST *)
> malloc(sizeof(NODE_DOWN_LIST))) == NULL) {
> > +                   LOG_CR("Memory Allocation for NODE_DOWN_LIST failed");
> > +                   return;
> > +           }
> > +           memset(node_down_rec, 0, sizeof(NODE_DOWN_LIST));
> > +           node_down_rec->node_id = node_id;
> > +           if (clms_cb->node_down_list_head == NULL) {
> > +                   clms_cb->node_down_list_head = node_down_rec;
> > +           } else {
> > +                   if (clms_cb->node_down_list_tail)
> > +                           clms_cb->node_down_list_tail->next = 
> > node_down_rec;
> > +           }
> > +           clms_cb->node_down_list_tail = node_down_rec;
> > +           node_down_rec->ndown_status = CHECKPOINT_PROCESSED;
> > +   }
> >  }
> > 
> >  /**
> > @@ -1696,7 +1724,13 @@ void proc_downs_during_rolechange (void)
> >             /*Remove NODE_DOWN_REC from the NODE_DOWN_LIST */
> >             node = clms_node_get_by_id(node_down_rec->node_id);
> >             temp_node_down_rec = node_down_rec;
> > -           if (node != NULL)
> > +           /* If nodedown status is CHECKPOINT_PROCESSED, it means that
> > +            * a checkpoint update was received when this node was STANDBY,
> but
> > +            * the MDS node_down did not reach the STANDBY. An extremely 
> > rare
> chance,
> > +            * but good to have protection for it, by ignoring the record
> > +            * if the record is in CHECKPOINT_PROCESSED state.
> > +            */
> > +           if ((node != NULL) && (temp_node_down_rec->ndown_status !=
> CHECKPOINT_PROCESSED))
> >                     clms_track_send_node_down(node);
> >             node_down_rec = node_down_rec->next;
> >             /*Free the NODE_DOWN_REC */
> > 
> >
> ------------------------------------------------------------------------------
> > Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
> > Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS
> Reports
> > Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White
> paper
> > Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog
> Analyzer
> >
> http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
> > _______________________________________________
> > Opensaf-devel mailing list
> > [email protected]
> > https://lists.sourceforge.net/lists/listinfo/opensaf-devel

------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to