Hi Hans, I have responded to your comment below. It's an ACK anyway.
> Nodeinit sends SIGKILL to parent pid returned from fork. Sending > SIGABRT > the child pid should be used instead. Yes, that's true, and a good catch. It should have been caught in the first version! You have probably already guessed the reason. But in case you have not: this behaviour is seen because we marked in nodeinit.conf that NID should spawn these services as scripts, by specifying "S", e.g.: /usr/local/lib/opensaf/clc-cli/osaf-rded:RDE:S:/usr/local/lib/opensaf/clc-cli/osaf-rded:12000:-6:2:1:start:stop Therefore the pid is that of the script and not of our executable. Also, while spawning scripts, NID cancels all the signals for that process. If we had specified "D" for a daemon or "E" for a regular process, then service->pid would have held the pid of that process itself. Thanks, Mathi. ----- [email protected] wrote: > osaf/services/infrastructure/nid/nodeinit.c | 88 > +++++++++++++++++++++++++++++ > 1 files changed, 88 insertions(+), 0 deletions(-) > > > Nodeinit sends SIGKILL to parent pid returned from fork. Sending > SIGABRT > the child pid should be used instead. > > diff --git a/osaf/services/infrastructure/nid/nodeinit.c > b/osaf/services/infrastructure/nid/nodeinit.c > --- a/osaf/services/infrastructure/nid/nodeinit.c > +++ b/osaf/services/infrastructure/nid/nodeinit.c > @@ -56,6 +56,10 @@ > #include <sys/time.h> > #include <sys/resource.h> > > +#include <signal.h> > +#include <sys/wait.h> > +#include <stdint.h> > + > #include <configmake.h> > #include <rda_papi.h> > #include <logtrace.h> > @@ -1084,6 +1088,58 @@ uint32_t check_process(NID_SPAWN_INFO *s > TRACE_LEAVE(); > } > > + > +/**************************************************************************** > + * Name : get_pid_from_file > * > + * > * > + * Description : Retrieves the given service name pid. > * > + * > * > + * Arguments : service name. 
> * > + * > * > + * Return Values : > 0 - process id of given service > * > + * -1 - error, see syslog > * > + * > * > + > ***************************************************************************/ > +static pid_t get_pid_from_file(const char* service_name) > +{ > + char pid_file[NAME_MAX]; > + > + char prog_name[40]; > + char *service, *tmp; > + FILE *f; > + pid_t pid; > + > + service = (char*) malloc(strlen(service_name) +1); > + strcpy(service, service_name); > + tmp = service; > + for ( ; *tmp; ++tmp) *tmp = tolower(*tmp); > + > + strcpy(prog_name, "osaf"); > + strcat(prog_name, service); > + free(service); > + > + LOG_IN("XXXX %s", prog_name); > + > + snprintf(pid_file, sizeof(pid_file), PKGPIDDIR "/%s.pid", > prog_name); > + > + if ((f = fopen(pid_file, "r")) == 0) { > + LOG_WA("Failed to open %s", pid_file); > + return -1; > + } > + > + if (fscanf(f, "%d", &pid) == 0) { > + LOG_WA("Could not read PID from file %s", pid_file); > + return -1; > + } > + > + if (fclose(f) != 0) { > + LOG_WA("Could not close file"); > + return -1; > + } > + > + return pid; > +} > + > > /**************************************************************************** > * Name : cleanup > * > * > * > @@ -1108,6 +1164,38 @@ void cleanup(NID_SPAWN_INFO *service) > nid_close_ipc(); > select_fd = -1; > > + pid_t w_pid; > + pid_t pid; > + int status; > + uint32_t no_of_retries = 0; > + const uint32_t MAX_NO_RETRIES = 5; > + > + // get pid of current service_name instead of the parent pid > + pid = get_pid_from_file(service->serv_name); > + if (pid > 0) { > + if (check_process(service)) { > + // send abort signal to process to generate a core dump > + LOG_ER("Sending SIGABRT to %s, pid=%d, (parent pid=%d)", > service->serv_name, pid, service->pid); > + if (kill(pid, SIGABRT) >= 0) { > + // wait a short period for process to exit > + do { > + w_pid = waitpid(service->pid, &status, > WNOHANG); > + if (w_pid < 0) { > + if (errno == EINTR) > + continue; > + else > + break; > + } else if (w_pid 
> 0) { > + if (WIFEXITED(status) || > WIFSIGNALED(status)) { > + break; > + } > + } > + sleep(1); > + } while (++no_of_retries < MAX_NO_RETRIES); > + } > + } > + } > + // if sending abort signal did not succeed, fallback to sigkill > if (check_process(service)) { > LOG_ER("Sending SIGKILL to %s, pid=%d", service->serv_name, > service->pid); > kill(service->pid, SIGKILL); ------------------------------------------------------------------------------ BPM Camp - Free Virtual Workshop May 6th at 10am PDT/1PM EDT Develop your own process in accordance with the BPMN 2 standard Learn Process modeling best practices with Bonita BPM through live exercises http://www.bonitasoft.com/be-part-of-it/events/bpm-camp-virtual- event?utm_ source=Sourceforge_BPM_Camp_5_6_15&utm_medium=email&utm_campaign=VA_SF _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
