Hi Hans, Since there are no plans to push to older branches, the ticket's "Milestone" and "Type" fields should be updated.
Thanks, Praveen On 21-Dec-16 3:25 PM, Hans Nordeback wrote: > Hi Mathi, > > thanks, I'll only push on the default branch. I think though it is > important to have the system started in a well defined state > > and not allow continue running the system with crashed services as of > today. > > /Regards HansN > > > On 12/21/2016 10:42 AM, Mathivanan Naickan Palanivelu wrote: >> Hi Hans, >> >> Code changes are okay. >> This change in the behavior (when NID started services fail) might not >> be acceptable to users of older branches. Just a thought. >> >> Otherwise, Ack from me. >> >> Mathi. >> >>> -----Original Message----- >>> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com] >>> Sent: Tuesday, December 20, 2016 4:21 PM >>> To: Mathivanan Naickan Palanivelu; Ramesh Babu Betham >>> Cc: opensaf-devel@lists.sourceforge.net; Praveen Malviya; Anders Widell >>> Subject: RE: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for >>> started >>> services V2 [#2204] >>> >>> Hi Mathi & Ramesh, >>> >>> A gentle reminder, Is it ok to push this patch? >>> >>> /Thanks HansN >>> >>> -----Original Message----- >>> From: Hans Nordebäck >>> Sent: den 19 december 2016 13:40 >>> To: Mathivanan Naickan Palanivelu <mathi.naic...@oracle.com>; Praveen >>> Malviya <praveen.malv...@oracle.com>; Ramesh Babu Betham >>> <ramesh.bet...@oracle.com>; Anders Widell >>> <anders.wid...@ericsson.com> >>> Cc: opensaf-devel@lists.sourceforge.net >>> Subject: Re: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for >>> started >>> services V2 [#2204] >>> >>> Hi Mathi, >>> >>> please see comment inlined with [HansN]. >>> >>> /Thanks HansN >>> >>> >>> On 12/19/2016 01:05 PM, Mathivanan Naickan Palanivelu wrote: >>>> Hi Hans, >>>> >>>> Quick comments >>>> >>>> (a) Remove the comma below: >>>> + {"CLMD", "osafclmd.fifo", -1}, >>> [HansN] I have removed the comma. >>>> (b) NULL is not a part of c++11 right. >>> [HansN] yes, I changed NULL to nullptr. 
>>>> (c) I'm not sure there is an obvious reason for converting this file >>>> to c++ >>> [HansN] one reason to change to C++ was easier use of e.g. FileNotify >>> and >>> use STL. >>>> Thanks, >>>> Mathi. >>>> >>>>> -----Original Message----- >>>>> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com] >>>>> Sent: Tuesday, December 13, 2016 7:27 PM >>>>> To: Mathivanan Naickan Palanivelu; Praveen Malviya; Ramesh Babu >>>>> Betham; anders.wid...@ericsson.com >>>>> Cc: opensaf-devel@lists.sourceforge.net >>>>> Subject: [PATCH 1 of 1] nid: Use the FIFO monitoring for started >>>>> services V2 [#2204] >>>>> >>>>> osaf/services/infrastructure/nid/Makefile.am | 2 +- >>>>> osaf/services/infrastructure/nid/nodeinit.c | 285 >>>>> ++++++++++++++++++++++++++- >>>>> 2 files changed, 278 insertions(+), 9 deletions(-) >>>>> >>>>> >>>>> diff --git a/osaf/services/infrastructure/nid/Makefile.am >>>>> b/osaf/services/infrastructure/nid/Makefile.am >>>>> --- a/osaf/services/infrastructure/nid/Makefile.am >>>>> +++ b/osaf/services/infrastructure/nid/Makefile.am >>>>> @@ -31,7 +31,7 @@ opensafd_CPPFLAGS = \ >>>>> $(AM_CPPFLAGS) >>>>> >>>>> opensafd_SOURCES = \ >>>>> - nodeinit.c >>>>> + nodeinit.cc >>>>> >>>>> opensafd_LDADD = \ >>>>> $(top_builddir)/osaf/libs/core/libopensaf_core.la >>>>> diff --git a/osaf/services/infrastructure/nid/nodeinit.c >>>>> b/osaf/services/infrastructure/nid/nodeinit.cc >>>>> rename from osaf/services/infrastructure/nid/nodeinit.c >>>>> rename to osaf/services/infrastructure/nid/nodeinit.cc >>>>> --- a/osaf/services/infrastructure/nid/nodeinit.c >>>>> +++ b/osaf/services/infrastructure/nid/nodeinit.cc >>>>> @@ -63,10 +63,15 @@ >>>>> #include <configmake.h> >>>>> #include <rda_papi.h> >>>>> #include <logtrace.h> >>>>> + >>>>> +#include <string> >>>>> +#include <vector> >>>>> + >>>>> #include "osaf_poll.h" >>>>> #include "osaf_time.h" >>>>> >>>>> #include "nodeinit.h" >>>>> +#include "osaf/libs/core/cplusplus/base/file_notify.h" >>>>> >>>>> #define 
SETSIG(sa, sig, fun, flags) \ >>>>> do { \ >>>>> @@ -111,11 +116,46 @@ static uint32_t recovery_action(NID_SPAW static >>>>> uint32_t spawn_services(char *); static void nid_sleep(uint32_t); >>>>> >>>>> +/* Functions used for service monitoring */ static uint32_t >>>>> +create_svc_monitor_thread(void); static void* >>>>> +svc_monitor_thread(void *fd); static int handle_data_request(struct >>>>> +pollfd *fds, const std::string &nid_name); static void >>>>> +handle_svc_exit(int fd); static std::string get_svc_name(int fd); >>>>> +static int start_monitor_svc(const char *svc); >>>>> + >>>>> +/* Data declarations for service monitoring */ static int svc_mon_fd >>>>> += -1; static int next_svc_fds_slot = 0; >>>>> + >>>>> +struct SvcMap { >>>>> + std::string nid_name; >>>>> + std::string fifo_file; >>>>> + int fifo_fd; >>>>> +}; >>>>> + >>>>> +static std::vector<SvcMap> svc_map = { >>>>> + {"AMFD", "osafamfd.fifo", -1}, >>>>> + {"TRANSPORT", "osaftransportd.fifo", -1}, >>>>> + {"CLMNA", "osafclmna.fifo", -1}, >>>>> + {"RDED", "osafrded.fifo", -1}, >>>>> + {"HLFM", "osaffmd.fifo", -1}, >>>>> + {"IMMD", "osafimmd.fifo", -1}, >>>>> + {"IMMND", "osafimmnd.fifo", -1}, >>>>> + {"LOGD", "osaflogd.fifo", -1}, >>>>> + {"NTFD", "osafntfd.fifo", -1}, >>>>> + {"PLMD", "osafplmd.fifo", -1}, >>>>> + {"CLMD", "osafclmd.fifo", -1}, >>>>> +}; >>>>> +static const std::string fifo_dir = PKGLOCALSTATEDIR; const int >>>>> +kMaxNumOfFds = 40; const int kTenSecondsInMilliseconds = 10000; >>>>> + >>>>> /* List of recovery strategies */ >>>>> NID_FUNC recovery_funcs[] = { spawn_wait }; NID_FORK_FUNC >>>>> fork_funcs[] = { fork_process, fork_script, fork_daemon }; >>>>> >>>>> -char *nid_recerr[NID_MAXREC][4] = { >>>>> +const char *nid_recerr[NID_MAXREC][4] = { >>>>> {"Trying To RESPAWN", "Could Not RESPAWN", "Succeeded To >>> RESPAWN", >>>>> "FAILED TO RESPAWN"}, >>>>> {"Trying To RESET", "Faild to RESET", "suceeded To RESET", >>>>> "FAILED >>>>> AFTER RESTART"} }; @@ -167,10 +207,10 @@ char 
*gettoken(char **str, >>>>> uint32_t tok) >>>>> return (NULL); >>>>> } >>>>> >>>>> - while ((*p != tok) && (*p != '\n') && *p) >>>>> + while ((*p != static_cast<int>(tok)) && (*p != '\n') && *p) >>>>> p++; >>>>> >>>>> - if ((*p == tok) || (*p == '\n')) { >>>>> + if ((*p == static_cast<int>(tok)) || (*p == '\n')) { >>>>> *p++ = 0; >>>>> *str = p; >>>>> } >>>>> @@ -522,7 +562,7 @@ uint32_t parse_nodeinit_conf(char *strbu >>>>> NID_SPAWN_INFO *childinfo; >>>>> char buff[256], sbuf[200], *ch, *ch1, tmp[30], nidconf[256]; >>>>> uint32_t lineno = 0, retry = 0; >>>>> - struct nid_resetinfo info = { {""}, -1 }; >>>>> + struct nid_resetinfo info = { {""}, static_cast<uint32_t>(-1) }; >>>>> FILE *file, *ntfile; >>>>> >>>>> TRACE_ENTER(); >>>>> @@ -565,7 +605,7 @@ uint32_t parse_nodeinit_conf(char *strbu >>>>> } >>>>> >>>>> /* Allocate mem for new child info */ >>>>> - while ((childinfo = malloc(sizeof(NID_SPAWN_INFO))) == >>>>> NULL) { >>>>> + while ((childinfo = >>>>> >>> +reinterpret_cast<NID_SPAWN_INFO*>(malloc(sizeof(NID_SPAWN_INFO)) >>>>> )) == >>>>> +NULL) { >>>>> if (retry++ == 5) { >>>>> sprintf(strbuf, "FAILURE: Out of memory\n"); >>>>> return NCSCC_RC_FAILURE; >>>>> @@ -994,6 +1034,8 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv >>>>> break; >>>>> } >>>>> >>>>> + waitpid(pid, NULL, WNOHANG); >>>>> + >>>>> /* Read the message from FIFO and fill in structure. */ >>>>> while ((n = read(select_fd, buff1, sizeof(buff1))) <= 0) { >>>>> if (errno == EINTR) { >>>>> @@ -1263,7 +1305,7 @@ uint32_t recovery_action(NID_SPAWN_INFO >>>>> if (service->recovery_matrix[opt].retry_count == 0) { >>>>> if (count != 0) >>>>> LOG_ER("%s", nid_recerr[opt][3]); >>>>> - opt++; >>>>> + opt = >>>>> static_cast<NID_RECOVERY_OPT>(static_cast<int>(opt) +1); >>>>> continue; >>>>> } >>>>> } >>>>> @@ -1285,8 +1327,7 @@ uint32_t recovery_action(NID_SPAWN_INFO >>>>> * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
>>> * >>>>> >>>>> * >>>>> * >>>>> >>>>> >>> ********************************************************** >>>>> *****************/ >>>>> -uint32_t spawn_services(char *strbuf) -{ >>>>> +uint32_t spawn_services(char *strbuf) { >>>>> NID_SPAWN_INFO *service; >>>>> NID_CHILD_LIST sp_list = spawn_list; >>>>> char sbuff[100]; >>>>> @@ -1322,6 +1363,10 @@ uint32_t spawn_services(char *strbuf) >>>>> if (strlen(sbuff) > 0) >>>>> LOG_NO("%s", sbuff); >>>>> >>>>> + if (start_monitor_svc(service->serv_name) != >>>>> NCSCC_RC_SUCCESS) { >>>>> + exit(EXIT_FAILURE); >>>>> + } >>>>> + >>>>> sp_list.head = sp_list.head->next; >>>>> } >>>>> >>>>> @@ -1330,6 +1375,225 @@ uint32_t spawn_services(char *strbuf) >>>>> return NCSCC_RC_SUCCESS; >>>>> } >>>>> >>>>> +int start_monitor_svc(const char *svc) { >>>>> + int rc = NCSCC_RC_SUCCESS; >>>>> + char svc_name[NID_MAXSNAME]; >>>>> + >>>>> + TRACE_ENTER2("service: %s", svc); >>>>> + >>>>> + strncpy(svc_name, svc, sizeof(svc_name)); >>>>> + >>>>> + while (true) { >>>>> + ssize_t write_rc = write(svc_mon_fd, svc_name, strlen(svc_name)); >>>>> + if (write_rc == -1) { >>>>> + if (errno == EINTR) { >>>>> + continue; >>>>> + } else { >>>>> + LOG_ER("Failed to start sevice %s, error: %s", >>>>> + svc_name, strerror(errno)); >>>>> + rc = NCSCC_RC_FAILURE; >>>>> + break; >>>>> + } >>>>> + } >>>>> + break; >>>>> + } >>>>> + TRACE_LEAVE(); >>>>> + return rc; >>>>> +} >>>>> + >>>>> +int handle_data_request(struct pollfd *fds, const std::string >>>>> +&nid_name) { >>>>> + base::FileNotify file_notify; >>>>> + base::FileNotify::FileNotifyErrors notify_rc; >>>>> + int rc = NCSCC_RC_SUCCESS; >>>>> + int fifo_fd = -1; >>>>> + >>>>> + TRACE_ENTER2("service: %s", nid_name.c_str()); >>>>> + >>>>> + for (auto &svc : svc_map) { >>>>> + if (nid_name == svc.nid_name) { >>>>> + std::string fifo_file = fifo_dir + "/" + svc.fifo_file; >>>>> + notify_rc = file_notify.WaitForFileCreation(fifo_file, >>>>> + >>>>> kTenSecondsInMilliseconds); >>>>> + if (notify_rc != 
base::FileNotify::FileNotifyErrors::kOK) { >>>>> + LOG_ER("fifo file %s does not exist, notify rc: %d", >>>>> + fifo_file.c_str(), notify_rc); >>>>> + rc = NCSCC_RC_FAILURE; >>>>> + break; >>>>> + } >>>>> + int retry_cnt = 0; >>>>> + do { >>>>> + if (retry_cnt > 0) { >>>>> + osaf_nanosleep(&kHundredMilliseconds); >>>>> + } >>>>> + fifo_fd = open(fifo_file.c_str(), O_WRONLY|O_NONBLOCK); >>>>> + } while ((fifo_fd == -1) && >>>>> + (retry_cnt++ < 5 && (errno == EINTR || errno == >>>>> + ENXIO))); >>>>> + >>>>> + if (fifo_fd == -1) { >>>>> + LOG_ER("Failed to open %s, error: %s", fifo_file.c_str(), >>>>> + strerror(errno)); >>>>> + rc = NCSCC_RC_FAILURE; >>>>> + break; >>>>> + } else { >>>>> + svc.fifo_fd = fifo_fd; >>>>> + fds[next_svc_fds_slot].fd = fifo_fd; >>>>> + fds[next_svc_fds_slot].events = POLLIN; >>>>> + next_svc_fds_slot++; >>>>> + LOG_NO("Monitoring of %s started", nid_name.c_str()); >>>>> + break; >>>>> + } >>>>> + } >>>>> + } >>>>> + TRACE_LEAVE(); >>>>> + return rc; >>>>> +} >>>>> + >>>>> +std::string get_svc_name(int fd) { >>>>> + std::string svc_name; >>>>> + >>>>> + for (auto const& svc : svc_map) { >>>>> + if (fd == svc.fifo_fd) { >>>>> + svc_name = svc.nid_name; >>>>> + break; >>>>> + } >>>>> + } >>>>> + return svc_name; >>>>> +} >>>>> + >>>>> +void handle_svc_exit(int fd) { >>>>> + const std::string &svc_name = get_svc_name(fd); >>>>> + >>>>> + if (svc_name.size() != 0) { >>>>> + LOG_ER("Service %s has unexpectedly crashed. 
Unable to continue, >>>>> exiting", >>>>> + svc_name.c_str()); >>>>> + exit(EXIT_FAILURE); >>>>> + } else { >>>>> + LOG_NO("fd %d was not found in service map", fd); >>>>> + } >>>>> +} >>>>> + >>>>> >>> +/********************************************************* >>>>> ******************* >>>>> + * Name : >>>>> svc_monitor_thread * >>>>> + >>>>> * >>>>> * >>>>> + * Description : creates the service monitor >>>>> thread * >>>>> + >>>>> * >>>>> * >>>>> + * Arguments : >>>>> - * >>>>> + >>>>> * >>>>> * >>>>> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. >>> * >>>>> + >>>>> * >>>>> * >>>>> + >>>>> >>> +********************************************************* >>>>> ************** >>>>> +****/ >>>>> +void* svc_monitor_thread(void *fd) { >>>>> + char nid_name[NID_MAXSNAME]; >>>>> + int svc_mon_thr_fd = *(reinterpret_cast<int*>(fd)); >>>>> + enum { >>>>> + FD_SVC_MON_THR = 0, >>>>> + }; >>>>> + >>>>> + struct pollfd *fds; >>>>> + >>>>> + fds = new pollfd[sizeof(pollfd) * kMaxNumOfFds]; osafassert(fds >>>>> + != NULL); ssize_t read_rc = -1; >>>>> + >>>>> + fds[FD_SVC_MON_THR].fd = svc_mon_thr_fd; >>>>> fds[FD_SVC_MON_THR].events >>>>> + = POLLIN; next_svc_fds_slot++; >>>>> + >>>>> + while (true) { >>>>> + unsigned rc = osaf_poll(fds, next_svc_fds_slot, -1); >>>>> + if (rc > 0) { >>>>> + // check if any monitored service has exit >>>>> + for (int i = next_svc_fds_slot-1; i > 0; --i) { >>>>> + if ((fds[i].revents & POLLIN) || >>>>> + (fds[i].revents & POLLHUP) || >>>>> + (fds[i].revents & POLLERR)) { >>>>> + handle_svc_exit(fds[i].fd); >>>>> + } >>>>> + } >>>>> + >>>>> + if (fds[FD_SVC_MON_THR].revents & POLLIN) { >>>>> + while (true) { >>>>> + read_rc = read(svc_mon_thr_fd, nid_name, NID_MAXSNAME); >>>>> + if (read_rc == -1) { >>>>> + if (errno == EINTR) { >>>>> + continue; >>>>> + } else { >>>>> + LOG_ER("Failed to read on socketpair descriptor: %s", >>>>> + strerror(errno)); >>>>> + exit(EXIT_FAILURE); >>>>> + } >>>>> + } >>>>> + osafassert(read_rc < 
NID_MAXSNAME); >>>>> + nid_name[read_rc] = '\0'; >>>>> + break; >>>>> + } >>>>> + if (handle_data_request(fds, nid_name) != NCSCC_RC_SUCCESS) { >>>>> + LOG_ER("Failed to start monitoring for service %s, >>>>> exiting", >>>>> + nid_name); >>>>> + exit(EXIT_FAILURE); >>>>> + } >>>>> + } >>>>> + } else { >>>>> + LOG_ER("osaf_poll timed out and no descriptors are ready, >>>>> exiting"); >>>>> + exit(EXIT_FAILURE); >>>>> + } >>>>> + } >>>>> + delete [] fds; >>>>> +} >>>>> + >>>>> >>> +/********************************************************* >>>>> ******************* >>>>> + * Name : >>>>> create_svc_monitor_thread * >>>>> + >>>>> * >>>>> * >>>>> + * Description : creates the service monitor >>>>> thread * >>>>> + >>>>> * >>>>> * >>>>> + * Arguments : >>>>> - * >>>>> + >>>>> * >>>>> * >>>>> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. >>> * >>>>> + >>>>> * >>>>> * >>>>> + >>>>> >>> +********************************************************* >>>>> ************** >>>>> +****/ uint32_t create_svc_monitor_thread(void) { >>>>> + int s_pair[2]; >>>>> + int svc_mon_thr_fd = -1; >>>>> + pthread_t thread; >>>>> + pthread_attr_t attr; >>>>> + >>>>> + TRACE_ENTER(); >>>>> + >>>>> + if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, s_pair) == - >>> 1) { >>>>> + LOG_ER("socketpair FAILED: %s", strerror(errno)); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + svc_mon_fd = s_pair[0]; >>>>> + svc_mon_thr_fd = s_pair[1]; >>>>> + >>>>> + TRACE("sd1: %d sd2: %d", svc_mon_fd, svc_mon_thr_fd); >>>>> + >>>>> + if (pthread_attr_init(&attr) != 0) { >>>>> + LOG_ER("pthread_attr_init FAILED: %s", strerror(errno)); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) >>> != >>>>> 0) { >>>>> + LOG_ER("pthread_setdetachstate FAILED: %s", strerror(errno)); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + if (pthread_create(&thread, &attr, svc_monitor_thread, >>>>> + 
reinterpret_cast<void*>(&svc_mon_thr_fd)) != 0) { >>>>> + LOG_ER("pthread_create FAILED: %s", strerror(errno)); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + if (pthread_attr_destroy(&attr) != 0) { >>>>> + LOG_ER("pthread_attr_destroy FAILED: %s", strerror(errno)); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + TRACE_LEAVE(); >>>>> + return NCSCC_RC_SUCCESS; >>>>> +} >>>>> + >>>>> >>>>> >>> /********************************************************** >>>>> ****************** >>>>> * Name : >>>>> main * >>>>> >>>>> * >>>>> * >>>>> @@ -1365,6 +1629,11 @@ int main(int argc, char *argv[]) >>>>> exit(EXIT_FAILURE); >>>>> } >>>>> >>>>> + if (create_svc_monitor_thread() != NCSCC_RC_SUCCESS) { >>>>> + LOG_ER("Failed to create service monitor thread, exiting"); >>>>> + exit(EXIT_FAILURE); >>>>> + } >>>>> + >>>>> if (parse_nodeinit_conf(sbuf) != NCSCC_RC_SUCCESS) { >>>>> LOG_ER("Failed to parse file %s. Exiting", sbuf); >>>>> exit(EXIT_FAILURE); >>> >>> ------------------------------------------------------------------------------ >>> >>> Check out the vibrant tech community on one of the world's most engaging >>> tech sites, SlashDot.org! http://sdm.link/slashdot >>> _______________________________________________ >>> Opensaf-devel mailing list >>> Opensaf-devel@lists.sourceforge.net >>> https://lists.sourceforge.net/lists/listinfo/opensaf-devel > ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel