Hi Vu,

                /* Recover clients one at a time */
                rc = recover_one_client(p_client);
 +              if (p_client == NULL) goto done;
 +
 [Lennart] Why is this needed? This check is done in "while (p_client != NULL) 
{ "
and p_client cannot become NULL until "p_client = p_client->next; " is done as
the last line in the while loop.

 [Vu] It is my intention. I am afraid if the context is switched to
 client thread
 right after the recovery thread enters the while loop.
 If it is the case, p_client will be NULL after done recover_one_client().

[Lennart] I don't really understand your explanation. What is happening in the 
client thread when we are in recovery state 2 that can set p_client == NULL? 
The client thread should not be able to do anything with the data involved when 
recovering client in the recovery thread?

Regards 
Lennart

> -----Original Message-----
> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au]
> Sent: den 1 april 2016 14:14
> To: Lennart Lund
> Cc: mathi.naic...@oracle.com; opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1 of 1] log: miss mutex protection for common resource
> in log agent [#1705]
> 
> Hi Lennart,
> 
> Please see my comment inline.
> 
> Regards, Vu
> 
> Quoting Lennart Lund <lennart.l...@ericsson.com>:
> 
> > Hi Vu
> >
> > Ack with comments
> >
> > See my comments inline [Lennart]:
> >
> > Thanks
> > Lennart
> >
> >> -----Original Message-----
> >> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au]
> >> Sent: den 23 mars 2016 08:16
> >> To: mathi.naic...@oracle.com; Lennart Lund
> >> Cc: opensaf-devel@lists.sourceforge.net
> >> Subject: [PATCH 1 of 1] log: miss mutex protection for common resource
> in
> >> log agent [#1705]
> >>
> >>  osaf/libs/agents/saf/lga/lga_state.c |  41
> >> +++++++++++++++++++++++++++++++----
> >>  osaf/libs/agents/saf/lga/lga_state.h |   1 +
> >>  osaf/libs/agents/saf/lga/lga_util.c  |  30 +++++++++++--------------
> >>  3 files changed, 50 insertions(+), 22 deletions(-)
> >>
> >>
> >> There was an race condition between client thread and recovery thread
> >> accessing to common resource `client_list`.
> >>
> >> Add mutex protection.
> >>
> >> diff --git a/osaf/libs/agents/saf/lga/lga_state.c
> >> b/osaf/libs/agents/saf/lga/lga_state.c
> >> --- a/osaf/libs/agents/saf/lga/lga_state.c
> >> +++ b/osaf/libs/agents/saf/lga/lga_state.c
> >> @@ -355,8 +355,11 @@ static void *recovery2_thread(void *dumm
> >>                    /* We have been signaled to exit
> >> */
> >>                    goto done;
> >>            }
> >> +
> >>            /* Recover clients one at a time */
> >>            rc = recover_one_client(p_client);
> >> +          if (p_client == NULL) goto done;
> >> +
> > [Lennart] Why is this needed? This check is done in "while (p_client
> > != NULL) { " and p_client cannot become NULL until "p_client =
> > p_client->next; " is done as the last line in the while loop.
> >
> [Vu] It is my intention. I am afraid if the context is switched to
> client thread
> right after the recovery thread enters the while loop.
> If it is the case, p_client will be NULL after done recover_one_client().
> 
> >>            TRACE("\t Client %d is recovered", p_client-
> >> >lgs_client_id);
> >>            if (rc == -1) {
> >>                    TRACE("%s recover_one_client
> >> Fail Deleting cllient (id %d)",
> >> @@ -593,6 +596,15 @@ int recover_one_client(lga_client_hdl_re
> >>
> >>    TRACE_ENTER();
> >>
> >> +  if (p_client == NULL) {
> >> +          /**
> >> +           * Probably the client_list has been deleted by
> >> client thread
> >> +           * in context calling saLogFinalize(). So, give up
> >> recovering.
> >> +           */
> >> +          rc = -1;
> >> +          goto done;
> >> +  }
> >> +
> >>    /* This function may be called both in the recovery thread and
> >> in the
> >>     * client thread. A possible scenario is that the recovery 2
> >> thread
> >>     * times out and calls this function in recovery mode 2 while
> >> there
> >> @@ -658,13 +670,32 @@ uint32_t delete_one_client(
> >>    lga_client_hdl_rec_t *rm_node
> >>    )
> >>  {
> >> -  TRACE_ENTER2();
> >> -  uint32_t ncs_rc;
> >> +  return lga_hdl_rec_del(list_head, rm_node);
> >> +}
> >>
> >> +/**
> >> + * Free the memory allocated for one client handle with mutext
> protection.
> >> + *
> >> + * @param p_client_hdl
> >> + * @return void
> >> + */
> >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl)
> >> +{
> >> +  lga_client_hdl_rec_t *client_hdl;
> >> +
> >> +  /**
> >> +   * To prevent race condition b/w client thread and recovery
> >> thread
> >> +   * accessing to common resource `client_list`. [#1705]
> >> +   */
> >>    osaf_mutex_lock_ordie(&lga_recov_mutex);
> >> -  ncs_rc = lga_hdl_rec_del(list_head, rm_node);
> >> +
> >> +  client_hdl = *p_client_hdl;
> >> +  if (client_hdl == NULL) goto done;
> >> +
> >> +  free(client_hdl);
> >> +  client_hdl = NULL;
> >> +
> >> +done:
> >>    osaf_mutex_unlock_ordie(&lga_recov_mutex);
> >>
> >> -  TRACE_LEAVE();
> >> -  return ncs_rc;
> >>  }
> >> diff --git a/osaf/libs/agents/saf/lga/lga_state.h
> >> b/osaf/libs/agents/saf/lga/lga_state.h
> >> --- a/osaf/libs/agents/saf/lga/lga_state.h
> >> +++ b/osaf/libs/agents/saf/lga/lga_state.h
> >> @@ -32,6 +32,7 @@ uint32_t delete_one_client(
> >>    lga_client_hdl_rec_t **list_head,
> >>    lga_client_hdl_rec_t *rm_node
> >>    );
> >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl);
> >>
> >>  #ifdef    __cplusplus
> >>  }
> >> diff --git a/osaf/libs/agents/saf/lga/lga_util.c
> >> b/osaf/libs/agents/saf/lga/lga_util.c
> >> --- a/osaf/libs/agents/saf/lga/lga_util.c
> >> +++ b/osaf/libs/agents/saf/lga/lga_util.c
> >> @@ -19,6 +19,7 @@
> >>  #include <syslog.h>
> >>  #include "lga.h"
> >>  #include "osaf_poll.h"
> >> +#include "lga_state.h"
> >>
> >>  /* Variables used during startup/shutdown only */
> >>  static pthread_mutex_t lga_lock = PTHREAD_MUTEX_INITIALIZER;
> >> @@ -449,11 +450,9 @@ void lga_hdl_list_del(lga_client_hdl_rec
> >>    /** clean up the channel records for this lga-client
> >>           **/
> >>            lga_log_stream_hdl_rec_list_del(&client_hdl-
> >> >stream_list);
> >> -  /** remove the association with hdl-mngr
> >> -         **/
> >> -          free(client_hdl);
> >> -          client_hdl = 0;
> >> +          free_client_hdl(&client_hdl);
> >>    }
> >> +
> >>    TRACE_LEAVE();
> >>  }
> >>
> >> @@ -536,38 +535,35 @@ uint32_t lga_hdl_rec_del(lga_client_hdl_
> >>    if (list_iter == rm_node) {
> >>            *list_head = rm_node->next;
> >>
> >> -  /** detach & release the IPC
> >> -         **/
> >> +          /* detach & release the IPC */
> >>            m_NCS_IPC_DETACH(&rm_node->mbx,
> >> lga_clear_mbx, NULL);
> >>            m_NCS_IPC_RELEASE(&rm_node->mbx, NULL);
> >>
> >>            ncshm_give_hdl(rm_node->local_hdl);
> >>            ncshm_destroy_hdl(NCS_SERVICE_ID_LGA,
> >> rm_node->local_hdl);
> >> -  /** Free the channel records off this hdl
> >> -         **/
> >> +
> >> +          /* Free the channel records off this hdl */
> >>            lga_log_stream_hdl_rec_list_del(&rm_node-
> >> >stream_list);
> >> +          free_client_hdl(&rm_node);
> >>
> >> -  /** free the hdl rec
> >> -         **/
> >> -          free(rm_node);
> >>            rc = NCSCC_RC_SUCCESS;
> >>            goto out;
> >> -  } else {                /* find the rec */
> >> +  } else {
> >> +          /* find the rec */
> >>            while (NULL != list_iter) {
> >>                    if (list_iter->next == rm_node) {
> >>                            list_iter->next =
> >> rm_node->next;
> >>
> >> -          /** detach & release the IPC */
> >> +                          /* detach & release
> >> the IPC */
> >>
> >>    m_NCS_IPC_DETACH(&rm_node->mbx, lga_clear_mbx,
> NULL);
> >>
> >>    m_NCS_IPC_RELEASE(&rm_node->mbx, NULL);
> >>
> >>
> >>    ncshm_give_hdl(rm_node->local_hdl);
> >>
> >>    ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, rm_node-
> >> >local_hdl);
> >> -          /** Free the channel records off this lga_hdl  */
> >> +
> >> +                          /* Free the channel
> >> records off this lga_hdl  */
> >>
> >>    lga_log_stream_hdl_rec_list_del(&rm_node->stream_list);
> >> -
> >> -          /** free the hdl rec */
> >> -                          free(rm_node);
> >> +
> >>    free_client_hdl(&rm_node);
> >>
> >>                            rc =
> >> NCSCC_RC_SUCCESS;
> >>                            goto out;
> 

------------------------------------------------------------------------------
Transform Data into Opportunity.
Accelerate data analysis in your applications with
Intel Data Analytics Acceleration Library.
Click to learn more.
http://pubads.g.doubleclick.net/gampad/clk?id=278785471&iu=/4140
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to