Hi Hans,

The code changes are okay.
However, this change in behavior (when NID-started services fail) might not be
acceptable to users of older branches. Just a thought.

Otherwise, Ack from me.

Mathi.

> -----Original Message-----
> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com]
> Sent: Tuesday, December 20, 2016 4:21 PM
> To: Mathivanan Naickan Palanivelu; Ramesh Babu Betham
> Cc: opensaf-devel@lists.sourceforge.net; Praveen Malviya; Anders Widell
> Subject: RE: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for started
> services V2 [#2204]
> 
> Hi Mathi & Ramesh,
> 
> A gentle reminder: is it OK to push this patch?
> 
> /Thanks HansN
> 
> -----Original Message-----
> From: Hans Nordebäck
> Sent: den 19 december 2016 13:40
> To: Mathivanan Naickan Palanivelu <mathi.naic...@oracle.com>; Praveen
> Malviya <praveen.malv...@oracle.com>; Ramesh Babu Betham
> <ramesh.bet...@oracle.com>; Anders Widell
> <anders.wid...@ericsson.com>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: Re: [devel] [PATCH 1 of 1] nid: Use the FIFO monitoring for started
> services V2 [#2204]
> 
> Hi Mathi,
> 
> Please see my comments inline, marked with [HansN].
> 
> /Thanks HansN
> 
> 
> On 12/19/2016 01:05 PM, Mathivanan Naickan Palanivelu wrote:
> > Hi Hans,
> >
> > Quick comments
> >
> > (a) Remove the comma below:
> > +  {"CLMD", "osafclmd.fifo", -1},
> [HansN] I have removed the comma.
> >
> > (b) NULL is not a part of C++11, right?
> [HansN] yes, I changed NULL to nullptr.
> >
> > (c) I'm not sure there is an obvious reason for converting this file
> > to C++.
> [HansN] One reason for changing to C++ was to make it easier to use e.g.
> FileNotify and the STL.
> >
> > Thanks,
> > Mathi.
> >
> >> -----Original Message-----
> >> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com]
> >> Sent: Tuesday, December 13, 2016 7:27 PM
> >> To: Mathivanan Naickan Palanivelu; Praveen Malviya; Ramesh Babu
> >> Betham; anders.wid...@ericsson.com
> >> Cc: opensaf-devel@lists.sourceforge.net
> >> Subject: [PATCH 1 of 1] nid: Use the FIFO monitoring for started
> >> services V2 [#2204]
> >>
> >>   osaf/services/infrastructure/nid/Makefile.am |    2 +-
> >>   osaf/services/infrastructure/nid/nodeinit.c  |  285
> >> ++++++++++++++++++++++++++-
> >>   2 files changed, 278 insertions(+), 9 deletions(-)
> >>
> >>
> >> diff --git a/osaf/services/infrastructure/nid/Makefile.am
> >> b/osaf/services/infrastructure/nid/Makefile.am
> >> --- a/osaf/services/infrastructure/nid/Makefile.am
> >> +++ b/osaf/services/infrastructure/nid/Makefile.am
> >> @@ -31,7 +31,7 @@ opensafd_CPPFLAGS = \
> >>    $(AM_CPPFLAGS)
> >>
> >>   opensafd_SOURCES = \
> >> -  nodeinit.c
> >> +  nodeinit.cc
> >>
> >>   opensafd_LDADD = \
> >>    $(top_builddir)/osaf/libs/core/libopensaf_core.la
> >> diff --git a/osaf/services/infrastructure/nid/nodeinit.c
> >> b/osaf/services/infrastructure/nid/nodeinit.cc
> >> rename from osaf/services/infrastructure/nid/nodeinit.c
> >> rename to osaf/services/infrastructure/nid/nodeinit.cc
> >> --- a/osaf/services/infrastructure/nid/nodeinit.c
> >> +++ b/osaf/services/infrastructure/nid/nodeinit.cc
> >> @@ -63,10 +63,15 @@
> >>   #include <configmake.h>
> >>   #include <rda_papi.h>
> >>   #include <logtrace.h>
> >> +
> >> +#include <string>
> >> +#include <vector>
> >> +
> >>   #include "osaf_poll.h"
> >>   #include "osaf_time.h"
> >>
> >>   #include "nodeinit.h"
> >> +#include "osaf/libs/core/cplusplus/base/file_notify.h"
> >>
> >>   #define SETSIG(sa, sig, fun, flags) \
> >>    do { \
> >> @@ -111,11 +116,46 @@ static uint32_t recovery_action(NID_SPAW static
> >> uint32_t spawn_services(char *);  static void nid_sleep(uint32_t);
> >>
> >> +/* Functions used for service monitoring */ static uint32_t
> >> +create_svc_monitor_thread(void); static void*
> >> +svc_monitor_thread(void *fd); static int handle_data_request(struct
> >> +pollfd *fds, const std::string &nid_name); static void
> >> +handle_svc_exit(int fd); static std::string get_svc_name(int fd);
> >> +static int start_monitor_svc(const char *svc);
> >> +
> >> +/* Data declarations for service monitoring */ static int svc_mon_fd
> >> += -1; static int next_svc_fds_slot = 0;
> >> +
> >> +struct SvcMap {
> >> +  std::string nid_name;
> >> +  std::string fifo_file;
> >> +  int fifo_fd;
> >> +};
> >> +
> >> +static std::vector<SvcMap> svc_map = {
> >> +  {"AMFD", "osafamfd.fifo", -1},
> >> +  {"TRANSPORT", "osaftransportd.fifo", -1},
> >> +  {"CLMNA", "osafclmna.fifo", -1},
> >> +  {"RDED", "osafrded.fifo", -1},
> >> +  {"HLFM", "osaffmd.fifo", -1},
> >> +  {"IMMD", "osafimmd.fifo", -1},
> >> +  {"IMMND", "osafimmnd.fifo", -1},
> >> +  {"LOGD", "osaflogd.fifo", -1},
> >> +  {"NTFD", "osafntfd.fifo", -1},
> >> +  {"PLMD", "osafplmd.fifo", -1},
> >> +  {"CLMD", "osafclmd.fifo", -1},
> >> +};
> >> +static const std::string fifo_dir = PKGLOCALSTATEDIR; const int
> >> +kMaxNumOfFds = 40; const int kTenSecondsInMilliseconds = 10000;
> >> +
> >>   /* List of recovery strategies */
> >>   NID_FUNC recovery_funcs[] = { spawn_wait  };  NID_FORK_FUNC
> >> fork_funcs[] = { fork_process, fork_script, fork_daemon };
> >>
> >> -char *nid_recerr[NID_MAXREC][4] = {
> >> +const char *nid_recerr[NID_MAXREC][4] = {
> >>    {"Trying To RESPAWN", "Could Not RESPAWN", "Succeeded To
> RESPAWN",
> >> "FAILED TO RESPAWN"},
> >>    {"Trying To RESET", "Faild to RESET", "suceeded To RESET", "FAILED
> >> AFTER RESTART"}  }; @@ -167,10 +207,10 @@ char *gettoken(char **str,
> >> uint32_t tok)
> >>            return (NULL);
> >>    }
> >>
> >> -  while ((*p != tok) && (*p != '\n') && *p)
> >> +  while ((*p != static_cast<int>(tok)) && (*p != '\n') && *p)
> >>            p++;
> >>
> >> -  if ((*p == tok) || (*p == '\n')) {
> >> +  if ((*p == static_cast<int>(tok)) || (*p == '\n')) {
> >>            *p++ = 0;
> >>            *str = p;
> >>    }
> >> @@ -522,7 +562,7 @@ uint32_t parse_nodeinit_conf(char *strbu
> >>    NID_SPAWN_INFO *childinfo;
> >>    char buff[256], sbuf[200], *ch, *ch1, tmp[30], nidconf[256];
> >>    uint32_t lineno = 0, retry = 0;
> >> -  struct nid_resetinfo info = { {""}, -1 };
> >> +  struct nid_resetinfo info = { {""}, static_cast<uint32_t>(-1) };
> >>    FILE *file, *ntfile;
> >>
> >>    TRACE_ENTER();
> >> @@ -565,7 +605,7 @@ uint32_t parse_nodeinit_conf(char *strbu
> >>            }
> >>
> >>            /* Allocate mem for new child info */
> >> -          while ((childinfo = malloc(sizeof(NID_SPAWN_INFO))) ==
> >> NULL) {
> >> +          while ((childinfo =
> >>
> +reinterpret_cast<NID_SPAWN_INFO*>(malloc(sizeof(NID_SPAWN_INFO))
> >> )) ==
> >> +NULL) {
> >>                    if (retry++ == 5) {
> >>                            sprintf(strbuf, "FAILURE: Out of memory\n");
> >>                            return NCSCC_RC_FAILURE;
> >> @@ -994,6 +1034,8 @@ uint32_t spawn_wait(NID_SPAWN_INFO *serv
> >>            break;
> >>    }
> >>
> >> +  waitpid(pid, NULL, WNOHANG);
> >> +
> >>    /* Read the message from FIFO and fill in structure. */
> >>    while ((n = read(select_fd, buff1, sizeof(buff1))) <= 0) {
> >>            if (errno == EINTR) {
> >> @@ -1263,7 +1305,7 @@ uint32_t recovery_action(NID_SPAWN_INFO
> >>            if (service->recovery_matrix[opt].retry_count == 0) {
> >>                    if (count != 0)
> >>                            LOG_ER("%s", nid_recerr[opt][3]);
> >> -                  opt++;
> >> +                  opt =
> >> static_cast<NID_RECOVERY_OPT>(static_cast<int>(opt) +1);
> >>                    continue;
> >>            }
> >>    }
> >> @@ -1285,8 +1327,7 @@ uint32_t recovery_action(NID_SPAWN_INFO
> >>    * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
> *
> >>    *                                                                       
> >>    *
> >>
> >>
> **********************************************************
> >> *****************/
> >> -uint32_t spawn_services(char *strbuf) -{
> >> +uint32_t spawn_services(char *strbuf) {
> >>    NID_SPAWN_INFO *service;
> >>    NID_CHILD_LIST sp_list = spawn_list;
> >>    char sbuff[100];
> >> @@ -1322,6 +1363,10 @@ uint32_t spawn_services(char *strbuf)
> >>            if (strlen(sbuff) > 0)
> >>                    LOG_NO("%s", sbuff);
> >>
> >> +          if (start_monitor_svc(service->serv_name) !=
> >> NCSCC_RC_SUCCESS) {
> >> +                  exit(EXIT_FAILURE);
> >> +          }
> >> +
> >>            sp_list.head = sp_list.head->next;
> >>    }
> >>
> >> @@ -1330,6 +1375,225 @@ uint32_t spawn_services(char *strbuf)
> >>    return NCSCC_RC_SUCCESS;
> >>   }
> >>
> >> +int start_monitor_svc(const char *svc) {
> >> +  int rc = NCSCC_RC_SUCCESS;
> >> +  char svc_name[NID_MAXSNAME];
> >> +
> >> +  TRACE_ENTER2("service: %s", svc);
> >> +
> >> +  strncpy(svc_name, svc, sizeof(svc_name));
> >> +
> >> +  while (true) {
> >> +    ssize_t write_rc = write(svc_mon_fd, svc_name, strlen(svc_name));
> >> +    if (write_rc == -1) {
> >> +      if (errno == EINTR) {
> >> +        continue;
> >> +      } else {
> >> +        LOG_ER("Failed to start sevice %s, error: %s",
> >> +               svc_name, strerror(errno));
> >> +        rc = NCSCC_RC_FAILURE;
> >> +        break;
> >> +      }
> >> +    }
> >> +    break;
> >> +  }
> >> +  TRACE_LEAVE();
> >> +  return rc;
> >> +}
> >> +
> >> +int handle_data_request(struct pollfd *fds, const std::string
> >> +&nid_name) {
> >> +  base::FileNotify file_notify;
> >> +  base::FileNotify::FileNotifyErrors notify_rc;
> >> +  int rc = NCSCC_RC_SUCCESS;
> >> +  int fifo_fd = -1;
> >> +
> >> +  TRACE_ENTER2("service: %s", nid_name.c_str());
> >> +
> >> +  for (auto &svc : svc_map) {
> >> +    if (nid_name == svc.nid_name) {
> >> +      std::string fifo_file = fifo_dir + "/" + svc.fifo_file;
> >> +      notify_rc = file_notify.WaitForFileCreation(fifo_file,
> >> +                                                  
> >> kTenSecondsInMilliseconds);
> >> +      if (notify_rc != base::FileNotify::FileNotifyErrors::kOK) {
> >> +        LOG_ER("fifo file %s does not exist, notify rc: %d",
> >> +               fifo_file.c_str(), notify_rc);
> >> +        rc = NCSCC_RC_FAILURE;
> >> +        break;
> >> +      }
> >> +      int retry_cnt = 0;
> >> +      do {
> >> +        if (retry_cnt > 0) {
> >> +          osaf_nanosleep(&kHundredMilliseconds);
> >> +        }
> >> +        fifo_fd = open(fifo_file.c_str(), O_WRONLY|O_NONBLOCK);
> >> +      } while ((fifo_fd == -1) &&
> >> +               (retry_cnt++ < 5 && (errno == EINTR || errno ==
> >> + ENXIO)));
> >> +
> >> +      if (fifo_fd == -1) {
> >> +        LOG_ER("Failed to open %s, error: %s", fifo_file.c_str(),
> >> +               strerror(errno));
> >> +        rc = NCSCC_RC_FAILURE;
> >> +        break;
> >> +      } else {
> >> +        svc.fifo_fd = fifo_fd;
> >> +        fds[next_svc_fds_slot].fd = fifo_fd;
> >> +        fds[next_svc_fds_slot].events = POLLIN;
> >> +        next_svc_fds_slot++;
> >> +        LOG_NO("Monitoring of %s started", nid_name.c_str());
> >> +        break;
> >> +      }
> >> +    }
> >> +  }
> >> +  TRACE_LEAVE();
> >> +  return rc;
> >> +}
> >> +
> >> +std::string get_svc_name(int fd) {
> >> +  std::string svc_name;
> >> +
> >> +  for (auto const& svc : svc_map) {
> >> +    if (fd == svc.fifo_fd) {
> >> +      svc_name = svc.nid_name;
> >> +      break;
> >> +    }
> >> +  }
> >> +  return svc_name;
> >> +}
> >> +
> >> +void handle_svc_exit(int fd) {
> >> +  const std::string &svc_name = get_svc_name(fd);
> >> +
> >> +  if (svc_name.size() != 0) {
> >> +    LOG_ER("Service %s has unexpectedly crashed. Unable to continue,
> >> exiting",
> >> +           svc_name.c_str());
> >> +    exit(EXIT_FAILURE);
> >> +  } else {
> >> +    LOG_NO("fd %d was not found in service map", fd);
> >> +  }
> >> +}
> >> +
> >>
> +/*********************************************************
> >> *******************
> >> + * Name          : svc_monitor_thread                                     
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Description   : creates the service monitor thread                     
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Arguments     : -                                                      
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
> *
> >> + *                                                                        
> >>   *
> >> +
> >>
> +*********************************************************
> >> **************
> >> +****/
> >> +void* svc_monitor_thread(void *fd) {
> >> +  char nid_name[NID_MAXSNAME];
> >> +  int svc_mon_thr_fd = *(reinterpret_cast<int*>(fd));
> >> +  enum {
> >> +    FD_SVC_MON_THR = 0,
> >> +  };
> >> +
> >> +  struct pollfd *fds;
> >> +
> >> +  fds = new pollfd[sizeof(pollfd) * kMaxNumOfFds];  osafassert(fds
> >> + != NULL);  ssize_t read_rc = -1;
> >> +
> >> +  fds[FD_SVC_MON_THR].fd = svc_mon_thr_fd;
> >> fds[FD_SVC_MON_THR].events
> >> + = POLLIN;  next_svc_fds_slot++;
> >> +
> >> +  while (true) {
> >> +    unsigned rc = osaf_poll(fds, next_svc_fds_slot, -1);
> >> +    if (rc > 0) {
> >> +      // check if any monitored service has exit
> >> +      for (int i = next_svc_fds_slot-1; i > 0; --i) {
> >> +        if ((fds[i].revents & POLLIN) ||
> >> +            (fds[i].revents & POLLHUP) ||
> >> +            (fds[i].revents & POLLERR)) {
> >> +          handle_svc_exit(fds[i].fd);
> >> +        }
> >> +      }
> >> +
> >> +      if (fds[FD_SVC_MON_THR].revents & POLLIN) {
> >> +        while (true) {
> >> +          read_rc = read(svc_mon_thr_fd, nid_name, NID_MAXSNAME);
> >> +          if (read_rc == -1) {
> >> +            if (errno == EINTR) {
> >> +              continue;
> >> +            } else {
> >> +              LOG_ER("Failed to read on socketpair descriptor: %s",
> >> +                     strerror(errno));
> >> +              exit(EXIT_FAILURE);
> >> +            }
> >> +          }
> >> +          osafassert(read_rc < NID_MAXSNAME);
> >> +          nid_name[read_rc] = '\0';
> >> +          break;
> >> +        }
> >> +        if (handle_data_request(fds, nid_name) != NCSCC_RC_SUCCESS) {
> >> +          LOG_ER("Failed to start monitoring for service %s, exiting",
> >> +                 nid_name);
> >> +          exit(EXIT_FAILURE);
> >> +        }
> >> +      }
> >> +    } else {
> >> +      LOG_ER("osaf_poll timed out and no descriptors are ready, exiting");
> >> +      exit(EXIT_FAILURE);
> >> +    }
> >> +  }
> >> +  delete [] fds;
> >> +}
> >> +
> >>
> +/*********************************************************
> >> *******************
> >> + * Name          : create_svc_monitor_thread                              
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Description   : creates the service monitor thread                     
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Arguments     : -                                                      
> >>   *
> >> + *                                                                        
> >>   *
> >> + * Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
> *
> >> + *                                                                        
> >>   *
> >> +
> >>
> +*********************************************************
> >> **************
> >> +****/ uint32_t create_svc_monitor_thread(void) {
> >> +  int s_pair[2];
> >> +  int svc_mon_thr_fd = -1;
> >> +  pthread_t thread;
> >> +  pthread_attr_t attr;
> >> +
> >> +  TRACE_ENTER();
> >> +
> >> +  if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, s_pair) == -
> 1) {
> >> +    LOG_ER("socketpair FAILED: %s", strerror(errno));
> >> +    return NCSCC_RC_FAILURE;
> >> +  }
> >> +
> >> +  svc_mon_fd = s_pair[0];
> >> +  svc_mon_thr_fd = s_pair[1];
> >> +
> >> +  TRACE("sd1: %d sd2: %d", svc_mon_fd, svc_mon_thr_fd);
> >> +
> >> +  if (pthread_attr_init(&attr) != 0) {
> >> +    LOG_ER("pthread_attr_init FAILED: %s", strerror(errno));
> >> +    return NCSCC_RC_FAILURE;
> >> +  }
> >> +
> >> +  if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)
> !=
> >> 0) {
> >> +    LOG_ER("pthread_setdetachstate FAILED: %s", strerror(errno));
> >> +    return NCSCC_RC_FAILURE;
> >> +  }
> >> +
> >> +  if (pthread_create(&thread, &attr, svc_monitor_thread,
> >> +  reinterpret_cast<void*>(&svc_mon_thr_fd)) != 0) {
> >> +    LOG_ER("pthread_create FAILED: %s", strerror(errno));
> >> +    return NCSCC_RC_FAILURE;
> >> +  }
> >> +
> >> +  if (pthread_attr_destroy(&attr) != 0) {
> >> +    LOG_ER("pthread_attr_destroy FAILED: %s", strerror(errno));
> >> +    return NCSCC_RC_FAILURE;
> >> +  }
> >> +
> >> +  TRACE_LEAVE();
> >> +  return NCSCC_RC_SUCCESS;
> >> +}
> >> +
> >>
> >>
> /**********************************************************
> >> ******************
> >>    * Name          : main                                                  
> >>    *
> >>    *                                                                       
> >>    *
> >> @@ -1365,6 +1629,11 @@ int main(int argc, char *argv[])
> >>            exit(EXIT_FAILURE);
> >>    }
> >>
> >> +  if (create_svc_monitor_thread() != NCSCC_RC_SUCCESS) {
> >> +          LOG_ER("Failed to create service monitor thread, exiting");
> >> +          exit(EXIT_FAILURE);
> >> +  }
> >> +
> >>    if (parse_nodeinit_conf(sbuf) != NCSCC_RC_SUCCESS) {
> >>            LOG_ER("Failed to parse file %s. Exiting", sbuf);
> >>            exit(EXIT_FAILURE);
> 
> 
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most engaging
> tech sites, SlashDot.org! http://sdm.link/slashdot
> _______________________________________________
> Opensaf-devel mailing list
> Opensaf-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel

------------------------------------------------------------------------------
Developer Access Program for Intel Xeon Phi Processors
Access to Intel Xeon Phi processor-based developer platforms.
With one year of Intel Parallel Studio XE.
Training and support from Colfax.
Order your platform today.http://sdm.link/intel
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to