Hi Hans, The code changes are okay. However, this change in behavior (when NID-started services fail) might not be acceptable to users of older branches. Just a thought.
Otherwise, Ack from me. Mathi. > -----Original Message----- > From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com] > Sent: Tuesday, December 20, 2016 4:21 PM > To: Mathivanan Naickan Palanivelu; Ramesh Babu Betham > Cc: opensaf-devel@lists.sourceforge.net; Praveen Malviya; Anders Widell > Subject: RE: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for started > services V2 [#2204] > > Hi Mathi & Ramesh, > > A gentle reminder, Is it ok to push this patch? > > /Thanks HansN > > -----Original Message----- > From: Hans Nordebäck > Sent: den 19 december 2016 13:40 > To: Mathivanan Naickan Palanivelu <mathi.naic...@oracle.com>; Praveen > Malviya <praveen.malv...@oracle.com>; Ramesh Babu Betham > <ramesh.bet...@oracle.com>; Anders Widell > <anders.wid...@ericsson.com> > Cc: opensaf-devel@lists.sourceforge.net > Subject: Re: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for started > services V2 [#2204] > > Hi Mathi, > > please see comment inlined with [HansN]. > > /Thanks HansN > > > On 12/19/2016 01:05 PM, Mathivanan Naickan Palanivelu wrote: > > Hi Hans, > > > > Quick comments > > > > (a) Remove the comma below: > > + {"CLMD", "osafclmd.fifo", -1}, > [HansN] I have removed the comma. > > > > (b) NULL is not a part of c++11 right. > [HansN] yes, I changed NULL to nullptr. > > > > (c) I'm not sure there is an obvious reason for converting this file > > to c++ > [HansN] one reason to change to C++ was easier use of e.g. FileNotify and > use STL. > > > > Thanks, > > Mathi. 
> > > >> -----Original Message----- > >> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com] > >> Sent: Tuesday, December 13, 2016 7:27 PM > >> To: Mathivanan Naickan Palanivelu; Praveen Malviya; Ramesh Babu > >> Betham; anders.wid...@ericsson.com > >> Cc: opensaf-devel@lists.sourceforge.net > >> Subject: [PATCH 1 of 1] nid: Use the FIFO monitoring for started > >> services V2 [#2204] > >> > >> osaf/services/infrastructure/nid/Makefile.am | 2 +- > >> osaf/services/infrastructure/nid/nodeinit.c | 285 > >> ++++++++++++++++++++++++++- > >> 2 files changed, 278 insertions(+), 9 deletions(-) > >> > >> > >> diff --git a/osaf/services/infrastructure/nid/Makefile.am > >> b/osaf/services/infrastructure/nid/Makefile.am > >> --- a/osaf/services/infrastructure/nid/Makefile.am > >> +++ b/osaf/services/infrastructure/nid/Makefile.am > >> @@ -31,7 +31,7 @@ opensafd_CPPFLAGS = \ > >> $(AM_CPPFLAGS) > >> > >> opensafd_SOURCES = \ > >> - nodeinit.c > >> + nodeinit.cc > >> > >> opensafd_LDADD = \ > >> $(top_builddir)/osaf/libs/core/libopensaf_core.la > >> diff --git a/osaf/services/infrastructure/nid/nodeinit.c > >> b/osaf/services/infrastructure/nid/nodeinit.cc > >> rename from osaf/services/infrastructure/nid/nodeinit.c > >> rename to osaf/services/infrastructure/nid/nodeinit.cc > >> --- a/osaf/services/infrastructure/nid/nodeinit.c > >> +++ b/osaf/services/infrastructure/nid/nodeinit.cc > >> @@ -63,10 +63,15 @@ > >> #include <configmake.h> > >> #include <rda_papi.h> > >> #include <logtrace.h> > >> + > >> +#include <string> > >> +#include <vector> > >> + > >> #include "osaf_poll.h" > >> #include "osaf_time.h" > >> > >> #include "nodeinit.h" > >> +#include "osaf/libs/core/cplusplus/base/file_notify.h" > >> > >> #define SETSIG(sa, sig, fun, flags) \ > >> do { \ > >> @@ -111,11 +116,46 @@ static uint32_t recovery_action(NID_SPAW static > >> uint32_t spawn_services(char *); static void nid_sleep(uint32_t); > >> > >> +/* Functions used for service monitoring */ static uint32_t > 
>> +create_svc_monitor_thread(void); static void* > >> +svc_monitor_thread(void *fd); static int handle_data_request(struct > >> +pollfd *fds, const std::string &nid_name); static void > >> +handle_svc_exit(int fd); static std::string get_svc_name(int fd); > >> +static int start_monitor_svc(const char *svc); > >> + > >> +/* Data declarations for service monitoring */ static int svc_mon_fd > >> += -1; static int next_svc_fds_slot = 0; > >> + > >> +struct SvcMap { > >> + std::string nid_name; > >> + std::string fifo_file; > >> + int fifo_fd; > >> +}; > >> + > >> +static std::vector<SvcMap> svc_map = { > >> + {"AMFD", "osafamfd.fifo", -1}, > >> + {"TRANSPORT", "osaftransportd.fifo", -1}, > >> + {"CLMNA", "osafclmna.fifo", -1}, > >> + {"RDED", "osafrded.fifo", -1}, > >> + {"HLFM", "osaffmd.fifo", -1}, > >> + {"IMMD", "osafimmd.fifo", -1}, > >> + {"IMMND", "osafimmnd.fifo", -1}, > >> + {"LOGD", "osaflogd.fifo", -1}, > >> + {"NTFD", "osafntfd.fifo", -1}, > >> + {"PLMD", "osafplmd.fifo", -1}, > >> + {"CLMD", "osafclmd.fifo", -1}, > >> +}; > >> +static const std::string fifo_dir = PKGLOCALSTATEDIR; const int > >> +kMaxNumOfFds = 40; const int kTenSecondsInMilliseconds = 10000; > >> + > >> /* List of recovery strategies */ > >> NID_FUNC recovery_funcs[] = { spawn_wait }; NID_FORK_FUNC > >> fork_funcs[] = { fork_process, fork_script, fork_daemon }; > >> > >> -char *nid_recerr[NID_MAXREC][4] = { > >> +const char *nid_recerr[NID_MAXREC][4] = { > >> {"Trying To RESPAWN", "Could Not RESPAWN", "Succeeded To > RESPAWN", > >> "FAILED TO RESPAWN"}, > >> {"Trying To RESET", "Faild to RESET", "suceeded To RESET", "FAILED > >> AFTER RESTART"} }; @@ -167,10 +207,10 @@ char *gettoken(char **str, > >> uint32_t tok) > >> return (NULL); > >> } > >> > >> - while ((*p != tok) && (*p != '\n') && *p) > >> + while ((*p != static_cast<int>(tok)) && (*p != '\n') && *p) > >> p++; > >> > >> - if ((*p == tok) || (*p == '\n')) { > >> + if ((*p == static_cast<int>(tok)) || (*p == '\n')) { > >> *p++ = 
0; > >> *str = p; > >> } > >> @@ -522,7 +562,7 @@ uint32_t parse_nodeinit_conf(char *strbu > >> NID_SPAWN_INFO *childinfo; > >> char buff[256], sbuf[200], *ch, *ch1, tmp[30], nidconf[256]; > >> uint32_t lineno = 0, retry = 0; > >> - struct nid_resetinfo info = { {""}, -1 }; > >> + struct nid_resetinfo info = { {""}, static_cast<uint32_t>(-1) }; > >> FILE *file, *ntfile; > >> > >> TRACE_ENTER(); > >> @@ -565,7 +605,7 @@ uint32_t parse_nodeinit_conf(char *strbu > >> } > >> > >> /* Allocate mem for new child info */ > >> - while ((childinfo = malloc(sizeof(NID_SPAWN_INFO))) == > >> NULL) { > >> + while ((childinfo = > >> > +reinterpret_cast<NID_SPAWN_INFO*>(malloc(sizeof(NID_SPAWN_INFO)) > >> )) == > >> +NULL) { > >> if (retry++ == 5) { > >> sprintf(strbuf, "FAILURE: Out of memory\n"); > >> return NCSCC_RC_FAILURE; > >> @@ -994,6 +1034,8 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv > >> break; > >> } > >> > >> + waitpid(pid, NULL, WNOHANG); > >> + > >> /* Read the message from FIFO and fill in structure. */ > >> while ((n = read(select_fd, buff1, sizeof(buff1))) <= 0) { > >> if (errno == EINTR) { > >> @@ -1263,7 +1305,7 @@ uint32_t recovery_action(NID_SPAWN_INFO > >> if (service->recovery_matrix[opt].retry_count == 0) { > >> if (count != 0) > >> LOG_ER("%s", nid_recerr[opt][3]); > >> - opt++; > >> + opt = > >> static_cast<NID_RECOVERY_OPT>(static_cast<int>(opt) +1); > >> continue; > >> } > >> } > >> @@ -1285,8 +1327,7 @@ uint32_t recovery_action(NID_SPAWN_INFO > >> * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
> * > >> * > >> * > >> > >> > ********************************************************** > >> *****************/ > >> -uint32_t spawn_services(char *strbuf) -{ > >> +uint32_t spawn_services(char *strbuf) { > >> NID_SPAWN_INFO *service; > >> NID_CHILD_LIST sp_list = spawn_list; > >> char sbuff[100]; > >> @@ -1322,6 +1363,10 @@ uint32_t spawn_services(char *strbuf) > >> if (strlen(sbuff) > 0) > >> LOG_NO("%s", sbuff); > >> > >> + if (start_monitor_svc(service->serv_name) != > >> NCSCC_RC_SUCCESS) { > >> + exit(EXIT_FAILURE); > >> + } > >> + > >> sp_list.head = sp_list.head->next; > >> } > >> > >> @@ -1330,6 +1375,225 @@ uint32_t spawn_services(char *strbuf) > >> return NCSCC_RC_SUCCESS; > >> } > >> > >> +int start_monitor_svc(const char *svc) { > >> + int rc = NCSCC_RC_SUCCESS; > >> + char svc_name[NID_MAXSNAME]; > >> + > >> + TRACE_ENTER2("service: %s", svc); > >> + > >> + strncpy(svc_name, svc, sizeof(svc_name)); > >> + > >> + while (true) { > >> + ssize_t write_rc = write(svc_mon_fd, svc_name, strlen(svc_name)); > >> + if (write_rc == -1) { > >> + if (errno == EINTR) { > >> + continue; > >> + } else { > >> + LOG_ER("Failed to start sevice %s, error: %s", > >> + svc_name, strerror(errno)); > >> + rc = NCSCC_RC_FAILURE; > >> + break; > >> + } > >> + } > >> + break; > >> + } > >> + TRACE_LEAVE(); > >> + return rc; > >> +} > >> + > >> +int handle_data_request(struct pollfd *fds, const std::string > >> +&nid_name) { > >> + base::FileNotify file_notify; > >> + base::FileNotify::FileNotifyErrors notify_rc; > >> + int rc = NCSCC_RC_SUCCESS; > >> + int fifo_fd = -1; > >> + > >> + TRACE_ENTER2("service: %s", nid_name.c_str()); > >> + > >> + for (auto &svc : svc_map) { > >> + if (nid_name == svc.nid_name) { > >> + std::string fifo_file = fifo_dir + "/" + svc.fifo_file; > >> + notify_rc = file_notify.WaitForFileCreation(fifo_file, > >> + > >> kTenSecondsInMilliseconds); > >> + if (notify_rc != base::FileNotify::FileNotifyErrors::kOK) { > >> + LOG_ER("fifo file %s does not 
exist, notify rc: %d", > >> + fifo_file.c_str(), notify_rc); > >> + rc = NCSCC_RC_FAILURE; > >> + break; > >> + } > >> + int retry_cnt = 0; > >> + do { > >> + if (retry_cnt > 0) { > >> + osaf_nanosleep(&kHundredMilliseconds); > >> + } > >> + fifo_fd = open(fifo_file.c_str(), O_WRONLY|O_NONBLOCK); > >> + } while ((fifo_fd == -1) && > >> + (retry_cnt++ < 5 && (errno == EINTR || errno == > >> + ENXIO))); > >> + > >> + if (fifo_fd == -1) { > >> + LOG_ER("Failed to open %s, error: %s", fifo_file.c_str(), > >> + strerror(errno)); > >> + rc = NCSCC_RC_FAILURE; > >> + break; > >> + } else { > >> + svc.fifo_fd = fifo_fd; > >> + fds[next_svc_fds_slot].fd = fifo_fd; > >> + fds[next_svc_fds_slot].events = POLLIN; > >> + next_svc_fds_slot++; > >> + LOG_NO("Monitoring of %s started", nid_name.c_str()); > >> + break; > >> + } > >> + } > >> + } > >> + TRACE_LEAVE(); > >> + return rc; > >> +} > >> + > >> +std::string get_svc_name(int fd) { > >> + std::string svc_name; > >> + > >> + for (auto const& svc : svc_map) { > >> + if (fd == svc.fifo_fd) { > >> + svc_name = svc.nid_name; > >> + break; > >> + } > >> + } > >> + return svc_name; > >> +} > >> + > >> +void handle_svc_exit(int fd) { > >> + const std::string &svc_name = get_svc_name(fd); > >> + > >> + if (svc_name.size() != 0) { > >> + LOG_ER("Service %s has unexpectedly crashed. Unable to continue, > >> exiting", > >> + svc_name.c_str()); > >> + exit(EXIT_FAILURE); > >> + } else { > >> + LOG_NO("fd %d was not found in service map", fd); > >> + } > >> +} > >> + > >> > +/********************************************************* > >> ******************* > >> + * Name : svc_monitor_thread > >> * > >> + * > >> * > >> + * Description : creates the service monitor thread > >> * > >> + * > >> * > >> + * Arguments : - > >> * > >> + * > >> * > >> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
> * > >> + * > >> * > >> + > >> > +********************************************************* > >> ************** > >> +****/ > >> +void* svc_monitor_thread(void *fd) { > >> + char nid_name[NID_MAXSNAME]; > >> + int svc_mon_thr_fd = *(reinterpret_cast<int*>(fd)); > >> + enum { > >> + FD_SVC_MON_THR = 0, > >> + }; > >> + > >> + struct pollfd *fds; > >> + > >> + fds = new pollfd[sizeof(pollfd) * kMaxNumOfFds]; osafassert(fds > >> + != NULL); ssize_t read_rc = -1; > >> + > >> + fds[FD_SVC_MON_THR].fd = svc_mon_thr_fd; > >> fds[FD_SVC_MON_THR].events > >> + = POLLIN; next_svc_fds_slot++; > >> + > >> + while (true) { > >> + unsigned rc = osaf_poll(fds, next_svc_fds_slot, -1); > >> + if (rc > 0) { > >> + // check if any monitored service has exit > >> + for (int i = next_svc_fds_slot-1; i > 0; --i) { > >> + if ((fds[i].revents & POLLIN) || > >> + (fds[i].revents & POLLHUP) || > >> + (fds[i].revents & POLLERR)) { > >> + handle_svc_exit(fds[i].fd); > >> + } > >> + } > >> + > >> + if (fds[FD_SVC_MON_THR].revents & POLLIN) { > >> + while (true) { > >> + read_rc = read(svc_mon_thr_fd, nid_name, NID_MAXSNAME); > >> + if (read_rc == -1) { > >> + if (errno == EINTR) { > >> + continue; > >> + } else { > >> + LOG_ER("Failed to read on socketpair descriptor: %s", > >> + strerror(errno)); > >> + exit(EXIT_FAILURE); > >> + } > >> + } > >> + osafassert(read_rc < NID_MAXSNAME); > >> + nid_name[read_rc] = '\0'; > >> + break; > >> + } > >> + if (handle_data_request(fds, nid_name) != NCSCC_RC_SUCCESS) { > >> + LOG_ER("Failed to start monitoring for service %s, exiting", > >> + nid_name); > >> + exit(EXIT_FAILURE); > >> + } > >> + } > >> + } else { > >> + LOG_ER("osaf_poll timed out and no descriptors are ready, exiting"); > >> + exit(EXIT_FAILURE); > >> + } > >> + } > >> + delete [] fds; > >> +} > >> + > >> > +/********************************************************* > >> ******************* > >> + * Name : create_svc_monitor_thread > >> * > >> + * > >> * > >> + * Description : creates 
the service monitor thread > >> * > >> + * > >> * > >> + * Arguments : - > >> * > >> + * > >> * > >> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. > * > >> + * > >> * > >> + > >> > +********************************************************* > >> ************** > >> +****/ uint32_t create_svc_monitor_thread(void) { > >> + int s_pair[2]; > >> + int svc_mon_thr_fd = -1; > >> + pthread_t thread; > >> + pthread_attr_t attr; > >> + > >> + TRACE_ENTER(); > >> + > >> + if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, s_pair) == - > 1) { > >> + LOG_ER("socketpair FAILED: %s", strerror(errno)); > >> + return NCSCC_RC_FAILURE; > >> + } > >> + > >> + svc_mon_fd = s_pair[0]; > >> + svc_mon_thr_fd = s_pair[1]; > >> + > >> + TRACE("sd1: %d sd2: %d", svc_mon_fd, svc_mon_thr_fd); > >> + > >> + if (pthread_attr_init(&attr) != 0) { > >> + LOG_ER("pthread_attr_init FAILED: %s", strerror(errno)); > >> + return NCSCC_RC_FAILURE; > >> + } > >> + > >> + if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) > != > >> 0) { > >> + LOG_ER("pthread_setdetachstate FAILED: %s", strerror(errno)); > >> + return NCSCC_RC_FAILURE; > >> + } > >> + > >> + if (pthread_create(&thread, &attr, svc_monitor_thread, > >> + reinterpret_cast<void*>(&svc_mon_thr_fd)) != 0) { > >> + LOG_ER("pthread_create FAILED: %s", strerror(errno)); > >> + return NCSCC_RC_FAILURE; > >> + } > >> + > >> + if (pthread_attr_destroy(&attr) != 0) { > >> + LOG_ER("pthread_attr_destroy FAILED: %s", strerror(errno)); > >> + return NCSCC_RC_FAILURE; > >> + } > >> + > >> + TRACE_LEAVE(); > >> + return NCSCC_RC_SUCCESS; > >> +} > >> + > >> > >> > /********************************************************** > >> ****************** > >> * Name : main > >> * > >> * > >> * > >> @@ -1365,6 +1629,11 @@ int main(int argc, char *argv[]) > >> exit(EXIT_FAILURE); > >> } > >> > >> + if (create_svc_monitor_thread() != NCSCC_RC_SUCCESS) { > >> + LOG_ER("Failed to create service monitor thread, exiting"); > >> + 
exit(EXIT_FAILURE); > >> + } > >> + > >> if (parse_nodeinit_conf(sbuf) != NCSCC_RC_SUCCESS) { > >> LOG_ER("Failed to parse file %s. Exiting", sbuf); > >> exit(EXIT_FAILURE); > > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most engaging > tech sites, SlashDot.org! http://sdm.link/slashdot > _______________________________________________ > Opensaf-devel mailing list > Opensaf-devel@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/opensaf-devel ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today.http://sdm.link/intel _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel