Ack with minor comment: The documentation for the function spawn_wait() 
should be updated to say that NCSCC_RC_REQ_TIMOUT is a possible return 
code.

/ Anders Widell

On 04/30/2015 03:13 PM, Hans Nordeback wrote:
>   osaf/services/infrastructure/nid/nodeinit.c |  56 +++++++++++++++-------------
>   1 files changed, 29 insertions(+), 27 deletions(-)
>
>
> Generate core dump only at timeout
>
> diff --git a/osaf/services/infrastructure/nid/nodeinit.c b/osaf/services/infrastructure/nid/nodeinit.c
> --- a/osaf/services/infrastructure/nid/nodeinit.c
> +++ b/osaf/services/infrastructure/nid/nodeinit.c
> @@ -106,8 +106,8 @@ static NID_APP_TYPE get_apptype(char *);
>   static uint32_t get_spawn_info(char *, NID_SPAWN_INFO *, char *);
>   static uint32_t parse_nodeinit_conf(char *strbuf);
>   static uint32_t check_process(NID_SPAWN_INFO *service);
> -static void cleanup(NID_SPAWN_INFO *service);
> -static uint32_t recovery_action(NID_SPAWN_INFO *, char *);
> +static void cleanup(NID_SPAWN_INFO *service, int reason);
> +static uint32_t recovery_action(NID_SPAWN_INFO *, char *, int);
>   static uint32_t spawn_services(char *);
>   static void nid_sleep(uint32_t);
>   
> @@ -989,7 +989,7 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv
>       while ((n = osaf_poll_one_fd(select_fd, service->time_out * 10)) <= 0) {
>               if (n == 0) {
>                       LOG_ER("Timed-out for response from %s", 
> service->serv_name);
> -                     return NCSCC_RC_FAILURE;
> +                     return NCSCC_RC_REQ_TIMOUT;
>               }
>               break;
>       }
> @@ -1151,7 +1151,7 @@ static pid_t get_pid_from_file(const cha
>    * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.                       
> *
>    *                                                                          
> *
>    
> ***************************************************************************/
> -void cleanup(NID_SPAWN_INFO *service)
> +void cleanup(NID_SPAWN_INFO *service, int reason)
>   {
>       char strbuff[256];
>   
> @@ -1168,27 +1168,29 @@ void cleanup(NID_SPAWN_INFO *service)
>       const uint32_t MAX_NO_RETRIES = 5;
>   
>       // get pid of current service_name instead of the parent pid
> -     pid = get_pid_from_file(service->serv_name);
> -     if (pid > 0) {
> -             if (check_process(service)) {
> -                     // send abort signal to process to generate a core dump
> -                     LOG_ER("Sending SIGABRT to %s, pid=%d, (origin parent 
> pid=%d)", service->serv_name, pid, service->pid);
> -                     if (kill(pid, SIGABRT) >= 0) {
> -                             // wait a short period for process to exit
> -                             do {
> -                                     w_pid = waitpid(service->pid, &status, 
> WNOHANG);
> -                                     if (w_pid < 0) {
> -                                             if (errno == EINTR)
> -                                                     continue;
> -                                             else
> -                                                     break;
> -                                     } else if (w_pid > 0) {
> -                                             if (WIFEXITED(status) || 
> WIFSIGNALED(status)) {
> -                                                     break;
> +     if (reason == NCSCC_RC_REQ_TIMOUT) {
> +             pid = get_pid_from_file(service->serv_name);
> +             if (pid > 0) {
> +                     if (check_process(service)) {
> +                             // send abort signal to process to generate a 
> core dump
> +                             LOG_ER("Sending SIGABRT to %s, pid=%d, (origin 
> parent pid=%d)", service->serv_name, pid, service->pid);
> +                             if (kill(pid, SIGABRT) >= 0) {
> +                                     // wait a short period for process to 
> exit
> +                                     do {
> +                                             w_pid = waitpid(service->pid, 
> &status, WNOHANG);
> +                                             if (w_pid < 0) {
> +                                                     if (errno == EINTR)
> +                                                             continue;
> +                                                     else
> +                                                             break;
> +                                             } else if (w_pid > 0) {
> +                                                     if (WIFEXITED(status) 
> || WIFSIGNALED(status)) {
> +                                                             break;
> +                                                     }
>                                               }
> -                                     }
> -                                     sleep(1);
> -                             } while (++no_of_retries < MAX_NO_RETRIES);
> +                                             sleep(1);
> +                                     } while (++no_of_retries < 
> MAX_NO_RETRIES);
> +                             }
>                       }
>               }
>       }
> @@ -1229,7 +1231,7 @@ void cleanup(NID_SPAWN_INFO *service)
>    * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.                       
> *
>    *                                                                          
> *
>    
> ***************************************************************************/
> -uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff)
> +uint32_t recovery_action(NID_SPAWN_INFO *service, char *strbuff, int reason)
>   {
>       uint32_t count = 0;
>       NID_RECOVERY_OPT opt = NID_RESPAWN;
> @@ -1244,7 +1246,7 @@ uint32_t recovery_action(NID_SPAWN_INFO
>   
>                       /* Just clean the stuff we created during prev retry */
>                       if (service->pid != 0)
> -                             cleanup(service);
> +                             cleanup(service, reason);
>   
>                       /* Done with cleanup so goahead with recovery */
>                       if ((service->recovery_matrix[opt].action) (service, 
> strbuff) != NCSCC_RC_SUCCESS) {
> @@ -1312,7 +1314,7 @@ uint32_t spawn_services(char *strbuf)
>               if (rc != NCSCC_RC_SUCCESS) {
>                       LOG_ER("%s", sbuff);
>                       LOG_ER("Going for recovery");
> -                     if (recovery_action(service, sbuff) != 
> NCSCC_RC_SUCCESS) {
> +                     if (recovery_action(service, sbuff, rc) != 
> NCSCC_RC_SUCCESS) {
>                               exit(EXIT_FAILURE);
>                       }
>               }


------------------------------------------------------------------------------
One dashboard for servers and applications across Physical-Virtual-Cloud 
Widest out-of-the-box monitoring support with 50+ applications
Performance metrics, stats and reports that give you Actionable Insights
Deep dive visibility with transaction tracing using APM Insight.
http://ad.doubleclick.net/ddm/clk/290420510;117567292;y
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to