Ack, with a minor comment: the documentation for the function spawn_wait() should be updated to say that NCSCC_RC_REQ_TIMOUT is a possible return code, since the patch now returns it when waiting for the service's response times out.
/ Anders Widell On 04/30/2015 03:13 PM, Hans Nordeback wrote: > osaf/services/infrastructure/nid/nodeinit.c | 56 > +++++++++++++++------------- > 1 files changed, 29 insertions(+), 27 deletions(-) > > > Generate core dump only at timeout > > diff --git a/osaf/services/infrastructure/nid/nodeinit.c > b/osaf/services/infrastructure/nid/nodeinit.c > --- a/osaf/services/infrastructure/nid/nodeinit.c > +++ b/osaf/services/infrastructure/nid/nodeinit.c > @@ -106,8 +106,8 @@ static NID_APP_TYPE get_apptype(char *); > static uint32_t get_spawn_info(char *, NID_SPAWN_INFO *, char *); > static uint32_t parse_nodeinit_conf(char *strbuf); > static uint32_t check_process(NID_SPAWN_INFO *service); > -static void cleanup(NID_SPAWN_INFO *service); > -static uint32_t recovery_action(NID_SPAWN_INFO *, char *); > +static void cleanup(NID_SPAWN_INFO *service, int reason); > +static uint32_t recovery_action(NID_SPAWN_INFO *, char *, int); > static uint32_t spawn_services(char *); > static void nid_sleep(uint32_t); > > @@ -989,7 +989,7 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv > while ((n = osaf_poll_one_fd(select_fd, service->time_out * 10)) <= 0) { > if (n == 0) { > LOG_ER("Timed-out for response from %s", > service->serv_name); > - return NCSCC_RC_FAILURE; > + return NCSCC_RC_REQ_TIMOUT; > } > break; > } > @@ -1151,7 +1151,7 @@ static pid_t get_pid_from_file(const cha > * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
> * > * > * > > ***************************************************************************/ > -void cleanup(NID_SPAWN_INFO *service) > +void cleanup(NID_SPAWN_INFO *service, int reason) > { > char strbuff[256]; > > @@ -1168,27 +1168,29 @@ void cleanup(NID_SPAWN_INFO *service) > const uint32_t MAX_NO_RETRIES = 5; > > // get pid of current service_name instead of the parent pid > - pid = get_pid_from_file(service->serv_name); > - if (pid > 0) { > - if (check_process(service)) { > - // send abort signal to process to generate a core dump > - LOG_ER("Sending SIGABRT to %s, pid=%d, (origin parent > pid=%d)", service->serv_name, pid, service->pid); > - if (kill(pid, SIGABRT) >= 0) { > - // wait a short period for process to exit > - do { > - w_pid = waitpid(service->pid, &status, > WNOHANG); > - if (w_pid < 0) { > - if (errno == EINTR) > - continue; > - else > - break; > - } else if (w_pid > 0) { > - if (WIFEXITED(status) || > WIFSIGNALED(status)) { > - break; > + if (reason == NCSCC_RC_REQ_TIMOUT) { > + pid = get_pid_from_file(service->serv_name); > + if (pid > 0) { > + if (check_process(service)) { > + // send abort signal to process to generate a > core dump > + LOG_ER("Sending SIGABRT to %s, pid=%d, (origin > parent pid=%d)", service->serv_name, pid, service->pid); > + if (kill(pid, SIGABRT) >= 0) { > + // wait a short period for process to > exit > + do { > + w_pid = waitpid(service->pid, > &status, WNOHANG); > + if (w_pid < 0) { > + if (errno == EINTR) > + continue; > + else > + break; > + } else if (w_pid > 0) { > + if (WIFEXITED(status) > || WIFSIGNALED(status)) { > + break; > + } > } > - } > - sleep(1); > - } while (++no_of_retries < MAX_NO_RETRIES); > + sleep(1); > + } while (++no_of_retries < > MAX_NO_RETRIES); > + } > } > } > } > @@ -1229,7 +1231,7 @@ void cleanup(NID_SPAWN_INFO *service) > * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
> * > * > * > > ***************************************************************************/ > -uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff) > +uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff, int reason) > { > uint32_t count = 0; > NID_RECOVERY_OPT opt = NID_RESPAWN; > @@ -1244,7 +1246,7 @@ uint32_t recovery_action(NID_SPAWN_INFO > > /* Just clean the stuff we created during prev retry */ > if (service->pid != 0) > - cleanup(service); > + cleanup(service, reason); > > /* Done with cleanup so goahead with recovery */ > if ((service->recovery_matrix[opt].action) (service, > strbuff) != NCSCC_RC_SUCCESS) { > @@ -1312,7 +1314,7 @@ uint32_t spawn_services(char *strbuf) > if (rc != NCSCC_RC_SUCCESS) { > LOG_ER("%s", sbuff); > LOG_ER("Going for recovery"); > - if (recovery_action(service, sbuff) != > NCSCC_RC_SUCCESS) { > + if (recovery_action(service, sbuff, rc) != > NCSCC_RC_SUCCESS) { > exit(EXIT_FAILURE); > } > } ------------------------------------------------------------------------------ One dashboard for servers and applications across Physical-Virtual-Cloud Widest out-of-the-box monitoring support with 50+ applications Performance metrics, stats and reports that give you Actionable Insights Deep dive visibility with transaction tracing using APM Insight. http://ad.doubleclick.net/ddm/clk/290420510;117567292;y _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel