Hi Mathi, Isn't the problem that the programs nodeinit spawns are in turn daemonized, i.e. now owned by the init process, so that the pid from nodeinit's initial fork, service->pid, may now refer to a zombie process? If so, the pid files under /var/run should be used in all three cases: script, daemon and process. /Thanks HansN
On 04/14/2015 03:02 PM, Mathivanan Naickan Palanivelu wrote: > Hi Hans, > > I have clarified to your comment below. It's an ACK anyways. > >> Nodeinit sends SIGKILL to parent pid returned from fork. Sending >> SIGABRT >> the child pid should be used instead. > Yes that's true, and a good catch. Must have been caught in the first version! > > You would have already guessed the reason also. But in case if you have not, > then this behaviour is seen because we marked in the nodeinit.conf, for NID > to spawn scripts > by specifying "S" > i.e. For eg:- > /usr/local/lib/opensaf/clc-cli/osaf-rded:RDE:S:/usr/local/lib/opensaf/clc-cli/osaf-rded:12000:-6:2:1:start:stop > > Therefore the pid is that of the script and not of our executable, also while > spawning scripts, NID also cancels all the signals for that process. > > If we had specified it as "D" for daemon or "E" for regular process then the > service->pid would have > had the pid of that process itself. > > Thanks, > Mathi. > > ----- [email protected] wrote: > >> osaf/services/infrastructure/nid/nodeinit.c | 88 >> +++++++++++++++++++++++++++++ >> 1 files changed, 88 insertions(+), 0 deletions(-) >> >> >> Nodeinit sends SIGKILL to parent pid returned from fork. Sending >> SIGABRT >> the child pid should be used instead. 
>> >> diff --git a/osaf/services/infrastructure/nid/nodeinit.c >> b/osaf/services/infrastructure/nid/nodeinit.c >> --- a/osaf/services/infrastructure/nid/nodeinit.c >> +++ b/osaf/services/infrastructure/nid/nodeinit.c >> @@ -56,6 +56,10 @@ >> #include <sys/time.h> >> #include <sys/resource.h> >> >> +#include <signal.h> >> +#include <sys/wait.h> >> +#include <stdint.h> >> + >> #include <configmake.h> >> #include <rda_papi.h> >> #include <logtrace.h> >> @@ -1084,6 +1088,58 @@ uint32_t check_process(NID_SPAWN_INFO *s >> TRACE_LEAVE(); >> } >> >> + >> +/**************************************************************************** >> + * Name : get_pid_from_file >> * >> + * >> * >> + * Description : Retrieves the given service name pid. >> * >> + * >> * >> + * Arguments : service name. >> * >> + * >> * >> + * Return Values : > 0 - process id of given service >> * >> + * -1 - error, see syslog >> * >> + * >> * >> + >> ***************************************************************************/ >> +static pid_t get_pid_from_file(const char* service_name) >> +{ >> + char pid_file[NAME_MAX]; >> + >> + char prog_name[40]; >> + char *service, *tmp; >> + FILE *f; >> + pid_t pid; >> + >> + service = (char*) malloc(strlen(service_name) +1); >> + strcpy(service, service_name); >> + tmp = service; >> + for ( ; *tmp; ++tmp) *tmp = tolower(*tmp); >> + >> + strcpy(prog_name, "osaf"); >> + strcat(prog_name, service); >> + free(service); >> + >> + LOG_IN("XXXX %s", prog_name); >> + >> + snprintf(pid_file, sizeof(pid_file), PKGPIDDIR "/%s.pid", >> prog_name); >> + >> + if ((f = fopen(pid_file, "r")) == 0) { >> + LOG_WA("Failed to open %s", pid_file); >> + return -1; >> + } >> + >> + if (fscanf(f, "%d", &pid) == 0) { >> + LOG_WA("Could not read PID from file %s", pid_file); >> + return -1; >> + } >> + >> + if (fclose(f) != 0) { >> + LOG_WA("Could not close file"); >> + return -1; >> + } >> + >> + return pid; >> +} >> + >> >> 
/**************************************************************************** >> * Name : cleanup >> * >> * >> * >> @@ -1108,6 +1164,38 @@ void cleanup(NID_SPAWN_INFO *service) >> nid_close_ipc(); >> select_fd = -1; >> >> + pid_t w_pid; >> + pid_t pid; >> + int status; >> + uint32_t no_of_retries = 0; >> + const uint32_t MAX_NO_RETRIES = 5; >> + >> + // get pid of current service_name instead of the parent pid >> + pid = get_pid_from_file(service->serv_name); >> + if (pid > 0) { >> + if (check_process(service)) { >> + // send abort signal to process to generate a core dump >> + LOG_ER("Sending SIGABRT to %s, pid=%d, (parent pid=%d)", >> service->serv_name, pid, service->pid); >> + if (kill(pid, SIGABRT) >= 0) { >> + // wait a short period for process to exit >> + do { >> + w_pid = waitpid(service->pid, &status, >> WNOHANG); >> + if (w_pid < 0) { >> + if (errno == EINTR) >> + continue; >> + else >> + break; >> + } else if (w_pid > 0) { >> + if (WIFEXITED(status) || >> WIFSIGNALED(status)) { >> + break; >> + } >> + } >> + sleep(1); >> + } while (++no_of_retries < MAX_NO_RETRIES); >> + } >> + } >> + } >> + // if sending abort signal did not succeed, fallback to sigkill >> if (check_process(service)) { >> LOG_ER("Sending SIGKILL to %s, pid=%d", service->serv_name, >> service->pid); >> kill(service->pid, SIGKILL); ------------------------------------------------------------------------------ BPM Camp - Free Virtual Workshop May 6th at 10am PDT/1PM EDT Develop your own process in accordance with the BPMN 2 standard Learn Process modeling best practices with Bonita BPM through live exercises http://www.bonitasoft.com/be-part-of-it/events/bpm-camp-virtual- event?utm_ source=Sourceforge_BPM_Camp_5_6_15&utm_medium=email&utm_campaign=VA_SF _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
