Hi Vu, /* Recover clients one at a time */ rc = recover_one_client(p_client); + if (p_client == NULL) goto done; + [Lennart] Why is this needed? This check is done in "while (p_client != NULL) { " and p_client cannot become NULL until "p_client = p_client->next; " is done as the last line in the while loop.
[Vu] It is my intention. I am afraid if the context is switched to client thread right after the recovery thread enters the while loop. If it is the case, p_client will be NULL after done recover_one_client(). [Lennart] I don't really understand your explanation. What is happening in the client thread when we are in recovery state 2 that can set p_client == NULL? The client thread should not be able to do anything with the data involved when recovering client in the recovery thread? Regards Lennart > -----Original Message----- > From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] > Sent: den 1 april 2016 14:14 > To: Lennart Lund > Cc: mathi.naic...@oracle.com; opensaf-devel@lists.sourceforge.net > Subject: Re: [PATCH 1 of 1] log: miss mutex protection for common resource > in log agent [#1705] > > Hi Lennart, > > Please see my comment inline. > > Regards, Vu > > Quoting Lennart Lund <lennart.l...@ericsson.com>: > > > Hi Vu > > > > Ack with comments > > > > See my comments inline [Lennart]: > > > > Thanks > > Lennart > > > >> -----Original Message----- > >> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] > >> Sent: den 23 mars 2016 08:16 > >> To: mathi.naic...@oracle.com; Lennart Lund > >> Cc: opensaf-devel@lists.sourceforge.net > >> Subject: [PATCH 1 of 1] log: miss mutex protection for common resource > in > >> log agent [#1705] > >> > >> osaf/libs/agents/saf/lga/lga_state.c | 41 > >> +++++++++++++++++++++++++++++++---- > >> osaf/libs/agents/saf/lga/lga_state.h | 1 + > >> osaf/libs/agents/saf/lga/lga_util.c | 30 +++++++++++-------------- > >> 3 files changed, 50 insertions(+), 22 deletions(-) > >> > >> > >> There was an race condition between client thread and recovery thread > >> accessing to common resource `client_list`. > >> > >> Add mutex protection. > >> > >> diff --git a/osaf/libs/agents/saf/lga/lga_state.c > >> b/osaf/libs/agents/saf/lga/lga_state.c > >> --- a/osaf/libs/agents/saf/lga/lga_state.c > >> +++ b/osaf/libs/agents/saf/lga/lga_state.c > >> @@ -355,8 +355,11 @@ static void *recovery2_thread(void *dumm > >> /* We have been signaled to exit > >> */ > >> goto done; > >> } > >> + > >> /* Recover clients one at a time */ > >> rc = recover_one_client(p_client); > >> + if (p_client == NULL) goto done; > >> + > > [Lennart] Why is this needed? This check is done in "while (p_client > > != NULL) { " and p_client cannot become NULL until "p_client = > > p_client->next; " is done as the last line in the while loop. > > > [Vu] It is my intention. I am afraid if the context is switched to > client thread > right after the recovery thread enters the while loop. > If it is the case, p_client will be NULL after done recover_one_client(). > > >> TRACE("\t Client %d is recovered", p_client- > >> >lgs_client_id); > >> if (rc == -1) { > >> TRACE("%s recover_one_client > >> Fail Deleting cllient (id %d)", > >> @@ -593,6 +596,15 @@ int recover_one_client(lga_client_hdl_re > >> > >> TRACE_ENTER(); > >> > >> + if (p_client == NULL) { > >> + /** > >> + * Probably the client_list has been deleted by > >> client thread > >> + * in context calling saLogFinalize(). So, give up > >> recovering. > >> + */ > >> + rc = -1; > >> + goto done; > >> + } > >> + > >> /* This function may be called both in the recovery thread and > >> in the > >> * client thread. A possible scenario is that the recovery 2 > >> thread > >> * times out and calls this function in recovery mode 2 while > >> there > >> @@ -658,13 +670,32 @@ uint32_t delete_one_client( > >> lga_client_hdl_rec_t *rm_node > >> ) > >> { > >> - TRACE_ENTER2(); > >> - uint32_t ncs_rc; > >> + return lga_hdl_rec_del(list_head, rm_node); > >> +} > >> > >> +/** > >> + * Free the memory allocated for one client handle with mutext > protection. > >> + * > >> + * @param p_client_hdl > >> + * @return void > >> + */ > >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl) > >> +{ > >> + lga_client_hdl_rec_t *client_hdl; > >> + > >> + /** > >> + * To prevent race condition b/w client thread and recovery > >> thread > >> + * accessing to common resource `client_list`. [#1705] > >> + */ > >> osaf_mutex_lock_ordie(&lga_recov_mutex); > >> - ncs_rc = lga_hdl_rec_del(list_head, rm_node); > >> + > >> + client_hdl = *p_client_hdl; > >> + if (client_hdl == NULL) goto done; > >> + > >> + free(client_hdl); > >> + client_hdl = NULL; > >> + > >> +done: > >> osaf_mutex_unlock_ordie(&lga_recov_mutex); > >> > >> - TRACE_LEAVE(); > >> - return ncs_rc; > >> } > >> diff --git a/osaf/libs/agents/saf/lga/lga_state.h > >> b/osaf/libs/agents/saf/lga/lga_state.h > >> --- a/osaf/libs/agents/saf/lga/lga_state.h > >> +++ b/osaf/libs/agents/saf/lga/lga_state.h > >> @@ -32,6 +32,7 @@ uint32_t delete_one_client( > >> lga_client_hdl_rec_t **list_head, > >> lga_client_hdl_rec_t *rm_node > >> ); > >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl); > >> > >> #ifdef __cplusplus > >> } > >> diff --git a/osaf/libs/agents/saf/lga/lga_util.c > >> b/osaf/libs/agents/saf/lga/lga_util.c > >> --- a/osaf/libs/agents/saf/lga/lga_util.c > >> +++ b/osaf/libs/agents/saf/lga/lga_util.c > >> @@ -19,6 +19,7 @@ > >> #include <syslog.h> > >> #include "lga.h" > >> #include "osaf_poll.h" > >> +#include "lga_state.h" > >> > >> /* Variables used during startup/shutdown only */ > >> static pthread_mutex_t lga_lock = PTHREAD_MUTEX_INITIALIZER; > >> @@ -449,11 +450,9 @@ void lga_hdl_list_del(lga_client_hdl_rec > >> /** clean up the channel records for this lga-client > >> **/ > >> lga_log_stream_hdl_rec_list_del(&client_hdl- > >> >stream_list); > >> - /** remove the association with hdl-mngr > >> - **/ > >> - free(client_hdl); > >> - client_hdl = 0; > >> + free_client_hdl(&client_hdl); > >> } > >> + > >> TRACE_LEAVE(); > >> } > >> > >> @@ -536,38 +535,35 @@ uint32_t lga_hdl_rec_del(lga_client_hdl_ > >> if (list_iter == rm_node) { > >> *list_head = rm_node->next; > >> > >> - /** detach & release the IPC > >> - **/ > >> + /* detach & release the IPC */ > >> m_NCS_IPC_DETACH(&rm_node->mbx, > >> lga_clear_mbx, NULL); > >> m_NCS_IPC_RELEASE(&rm_node->mbx, NULL); > >> > >> ncshm_give_hdl(rm_node->local_hdl); > >> ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, > >> rm_node->local_hdl); > >> - /** Free the channel records off this hdl > >> - **/ > >> + > >> + /* Free the channel records off this hdl */ > >> lga_log_stream_hdl_rec_list_del(&rm_node- > >> >stream_list); > >> + free_client_hdl(&rm_node); > >> > >> - /** free the hdl rec > >> - **/ > >> - free(rm_node); > >> rc = NCSCC_RC_SUCCESS; > >> goto out; > >> - } else { /* find the rec */ > >> + } else { > >> + /* find the rec */ > >> while (NULL != list_iter) { > >> if (list_iter->next == rm_node) { > >> list_iter->next = > >> rm_node->next; > >> > >> - /** detach & release the IPC */ > >> + /* detach & release > >> the IPC */ > >> > >> m_NCS_IPC_DETACH(&rm_node->mbx, lga_clear_mbx, > NULL); > >> > >> m_NCS_IPC_RELEASE(&rm_node->mbx, NULL); > >> > >> > >> ncshm_give_hdl(rm_node->local_hdl); > >> > >> ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, rm_node- > >> >local_hdl); > >> - /** Free the channel records off this lga_hdl */ > >> + > >> + /* Free the channel > >> records off this lga_hdl */ > >> > >> lga_log_stream_hdl_rec_list_del(&rm_node->stream_list); > >> - > >> - /** free the hdl rec */ > >> - free(rm_node); > >> + > >> free_client_hdl(&rm_node); > >> > >> rc = > >> NCSCC_RC_SUCCESS; > >> goto out; > ------------------------------------------------------------------------------ Transform Data into Opportunity. Accelerate data analysis in your applications with Intel Data Analytics Acceleration Library. Click to learn more. http://pubads.g.doubleclick.net/gampad/clk?id=278785471&iu=/4140 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel