Hi Mathi, is it OK to push this patch? I'll add the documentation that AndersW mentioned. /Thanks HansN
On 05/06/2015 12:17 PM, Anders Widell wrote: > Ack with minor comment: The documentation for the function > spawn_wait() should be updated to say that NCSCC_RC_REQ_TIMOUT is a > possible return code. > > / Anders Widell > > On 04/30/2015 03:13 PM, Hans Nordeback wrote: >> osaf/services/infrastructure/nid/nodeinit.c | 56 >> +++++++++++++++------------- >> 1 files changed, 29 insertions(+), 27 deletions(-) >> >> >> Generate core dump only at timeout >> >> diff --git a/osaf/services/infrastructure/nid/nodeinit.c >> b/osaf/services/infrastructure/nid/nodeinit.c >> --- a/osaf/services/infrastructure/nid/nodeinit.c >> +++ b/osaf/services/infrastructure/nid/nodeinit.c >> @@ -106,8 +106,8 @@ static NID_APP_TYPE get_apptype(char *); >> static uint32_t get_spawn_info(char *, NID_SPAWN_INFO *, char *); >> static uint32_t parse_nodeinit_conf(char *strbuf); >> static uint32_t check_process(NID_SPAWN_INFO *service); >> -static void cleanup(NID_SPAWN_INFO *service); >> -static uint32_t recovery_action(NID_SPAWN_INFO *, char *); >> +static void cleanup(NID_SPAWN_INFO *service, int reason); >> +static uint32_t recovery_action(NID_SPAWN_INFO *, char *, int); >> static uint32_t spawn_services(char *); >> static void nid_sleep(uint32_t); >> @@ -989,7 +989,7 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv >> while ((n = osaf_poll_one_fd(select_fd, service->time_out * >> 10)) <= 0) { >> if (n == 0) { >> LOG_ER("Timed-out for response from %s", >> service->serv_name); >> - return NCSCC_RC_FAILURE; >> + return NCSCC_RC_REQ_TIMOUT; >> } >> break; >> } >> @@ -1151,7 +1151,7 @@ static pid_t get_pid_from_file(const cha >> * Return Values : >> NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
* >> * * >> ***************************************************************************/ >> -void cleanup(NID_SPAWN_INFO *service) >> +void cleanup(NID_SPAWN_INFO *service, int reason) >> { >> char strbuff[256]; >> @@ -1168,27 +1168,29 @@ void cleanup(NID_SPAWN_INFO *service) >> const uint32_t MAX_NO_RETRIES = 5; >> // get pid of current service_name instead of the parent pid >> - pid = get_pid_from_file(service->serv_name); >> - if (pid > 0) { >> - if (check_process(service)) { >> - // send abort signal to process to generate a core dump >> - LOG_ER("Sending SIGABRT to %s, pid=%d, (origin parent >> pid=%d)", service->serv_name, pid, service->pid); >> - if (kill(pid, SIGABRT) >= 0) { >> - // wait a short period for process to exit >> - do { >> - w_pid = waitpid(service->pid, &status, WNOHANG); >> - if (w_pid < 0) { >> - if (errno == EINTR) >> - continue; >> - else >> - break; >> - } else if (w_pid > 0) { >> - if (WIFEXITED(status) || WIFSIGNALED(status)) { >> - break; >> + if (reason == NCSCC_RC_REQ_TIMOUT) { >> + pid = get_pid_from_file(service->serv_name); >> + if (pid > 0) { >> + if (check_process(service)) { >> + // send abort signal to process to generate a core dump >> + LOG_ER("Sending SIGABRT to %s, pid=%d, (origin >> parent pid=%d)", service->serv_name, pid, service->pid); >> + if (kill(pid, SIGABRT) >= 0) { >> + // wait a short period for process to exit >> + do { >> + w_pid = waitpid(service->pid, &status, >> WNOHANG); >> + if (w_pid < 0) { >> + if (errno == EINTR) >> + continue; >> + else >> + break; >> + } else if (w_pid > 0) { >> + if (WIFEXITED(status) || >> WIFSIGNALED(status)) { >> + break; >> + } >> } >> - } >> - sleep(1); >> - } while (++no_of_retries < MAX_NO_RETRIES); >> + sleep(1); >> + } while (++no_of_retries < MAX_NO_RETRIES); >> + } >> } >> } >> } >> @@ -1229,7 +1231,7 @@ void cleanup(NID_SPAWN_INFO *service) >> * Return Values : >> NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
* >> * * >> ***************************************************************************/ >> -uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff) >> +uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff, int >> reason) >> { >> uint32_t count = 0; >> NID_RECOVERY_OPT opt = NID_RESPAWN; >> @@ -1244,7 +1246,7 @@ uint32_t recovery_action(NID_SPAWN_INFO >> /* Just clean the stuff we created during prev >> retry */ >> if (service->pid != 0) >> - cleanup(service); >> + cleanup(service, reason); >> /* Done with cleanup so goahead with recovery */ >> if ((service->recovery_matrix[opt].action) (service, >> strbuff) != NCSCC_RC_SUCCESS) { >> @@ -1312,7 +1314,7 @@ uint32_t spawn_services(char *strbuf) >> if (rc != NCSCC_RC_SUCCESS) { >> LOG_ER("%s", sbuff); >> LOG_ER("Going for recovery"); >> - if (recovery_action(service, sbuff) != NCSCC_RC_SUCCESS) { >> + if (recovery_action(service, sbuff, rc) != >> NCSCC_RC_SUCCESS) { >> exit(EXIT_FAILURE); >> } >> } > ------------------------------------------------------------------------------ One dashboard for servers and applications across Physical-Virtual-Cloud Widest out-of-the-box monitoring support with 50+ applications Performance metrics, stats and reports that give you Actionable Insights Deep dive visibility with transaction tracing using APM Insight. http://ad.doubleclick.net/ddm/clk/290420510;117567292;y _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel