Ack with a minor comment: the documentation for the function spawn_wait()
should be updated to say that NCSCC_RC_REQ_TIMOUT (the code now returned
on timeout) is a possible return code.
/ Anders Widell
On 04/30/2015 03:13 PM, Hans Nordeback wrote:
> osaf/services/infrastructure/nid/nodeinit.c | 56
> +++++++++++++++-------------
> 1 files changed, 29 insertions(+), 27 deletions(-)
>
>
> Generate core dump only at timeout
>
> diff --git a/osaf/services/infrastructure/nid/nodeinit.c
> b/osaf/services/infrastructure/nid/nodeinit.c
> --- a/osaf/services/infrastructure/nid/nodeinit.c
> +++ b/osaf/services/infrastructure/nid/nodeinit.c
> @@ -106,8 +106,8 @@ static NID_APP_TYPE get_apptype(char *);
> static uint32_t get_spawn_info(char *, NID_SPAWN_INFO *, char *);
> static uint32_t parse_nodeinit_conf(char *strbuf);
> static uint32_t check_process(NID_SPAWN_INFO *service);
> -static void cleanup(NID_SPAWN_INFO *service);
> -static uint32_t recovery_action(NID_SPAWN_INFO *, char *);
> +static void cleanup(NID_SPAWN_INFO *service, int reason);
> +static uint32_t recovery_action(NID_SPAWN_INFO *, char *, int);
> static uint32_t spawn_services(char *);
> static void nid_sleep(uint32_t);
>
> @@ -989,7 +989,7 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv
> while ((n = osaf_poll_one_fd(select_fd, service->time_out * 10)) <= 0) {
> if (n == 0) {
> LOG_ER("Timed-out for response from %s",
> service->serv_name);
> - return NCSCC_RC_FAILURE;
> + return NCSCC_RC_REQ_TIMOUT;
> }
> break;
> }
> @@ -1151,7 +1151,7 @@ static pid_t get_pid_from_file(const cha
> * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
> *
> *
> *
>
> ***************************************************************************/
> -void cleanup(NID_SPAWN_INFO *service)
> +void cleanup(NID_SPAWN_INFO *service, int reason)
> {
> char strbuff[256];
>
> @@ -1168,27 +1168,29 @@ void cleanup(NID_SPAWN_INFO *service)
> const uint32_t MAX_NO_RETRIES = 5;
>
> // get pid of current service_name instead of the parent pid
> - pid = get_pid_from_file(service->serv_name);
> - if (pid > 0) {
> - if (check_process(service)) {
> - // send abort signal to process to generate a core dump
> - LOG_ER("Sending SIGABRT to %s, pid=%d, (origin parent
> pid=%d)", service->serv_name, pid, service->pid);
> - if (kill(pid, SIGABRT) >= 0) {
> - // wait a short period for process to exit
> - do {
> - w_pid = waitpid(service->pid, &status,
> WNOHANG);
> - if (w_pid < 0) {
> - if (errno == EINTR)
> - continue;
> - else
> - break;
> - } else if (w_pid > 0) {
> - if (WIFEXITED(status) ||
> WIFSIGNALED(status)) {
> - break;
> + if (reason == NCSCC_RC_REQ_TIMOUT) {
> + pid = get_pid_from_file(service->serv_name);
> + if (pid > 0) {
> + if (check_process(service)) {
> + // send abort signal to process to generate a
> core dump
> + LOG_ER("Sending SIGABRT to %s, pid=%d, (origin
> parent pid=%d)", service->serv_name, pid, service->pid);
> + if (kill(pid, SIGABRT) >= 0) {
> + // wait a short period for process to
> exit
> + do {
> + w_pid = waitpid(service->pid,
> &status, WNOHANG);
> + if (w_pid < 0) {
> + if (errno == EINTR)
> + continue;
> + else
> + break;
> + } else if (w_pid > 0) {
> + if (WIFEXITED(status)
> || WIFSIGNALED(status)) {
> + break;
> + }
> }
> - }
> - sleep(1);
> - } while (++no_of_retries < MAX_NO_RETRIES);
> + sleep(1);
> + } while (++no_of_retries <
> MAX_NO_RETRIES);
> + }
> }
> }
> }
> @@ -1229,7 +1231,7 @@ void cleanup(NID_SPAWN_INFO *service)
> * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
> *
> *
> *
>
> ***************************************************************************/
> -uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff)
> +uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff, int reason)
> {
> uint32_t count = 0;
> NID_RECOVERY_OPT opt = NID_RESPAWN;
> @@ -1244,7 +1246,7 @@ uint32_t recovery_action(NID_SPAWN_INFO
>
> /* Just clean the stuff we created during prev retry */
> if (service->pid != 0)
> - cleanup(service);
> + cleanup(service, reason);
>
> /* Done with cleanup so goahead with recovery */
> if ((service->recovery_matrix[opt].action) (service,
> strbuff) != NCSCC_RC_SUCCESS) {
> @@ -1312,7 +1314,7 @@ uint32_t spawn_services(char *strbuf)
> if (rc != NCSCC_RC_SUCCESS) {
> LOG_ER("%s", sbuff);
> LOG_ER("Going for recovery");
> - if (recovery_action(service, sbuff) !=
> NCSCC_RC_SUCCESS) {
> + if (recovery_action(service, sbuff, rc) !=
> NCSCC_RC_SUCCESS) {
> exit(EXIT_FAILURE);
> }
> }
------------------------------------------------------------------------------
One dashboard for servers and applications across Physical-Virtual-Cloud
Widest out-of-the-box monitoring support with 50+ applications
Performance metrics, stats and reports that give you Actionable Insights
Deep dive visibility with transaction tracing using APM Insight.
http://ad.doubleclick.net/ddm/clk/290420510;117567292;y
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel