00-README.conf                                   |   6 +++++
 osaf/libs/core/leap/os_defs.c                    |  27 ++++++++++++++++++++++++
 osaf/services/infrastructure/nid/config/nid.conf |   3 ++
 3 files changed, 36 insertions(+), 0 deletions(-)


When ncs_os_process_execute_timed is called and the child process takes too
long before exec, it may be hanging in e.g. a close or a syslog call. An alarm
is now set in the ncs_os_process_execute_timed child process. If it times out,
a core dump will be produced to make troubleshooting possible.

diff --git a/00-README.conf b/00-README.conf
--- a/00-README.conf
+++ b/00-README.conf
@@ -57,6 +57,12 @@ group/user.
   escalated to an immediate reboot via the SysRq interface, or zero to disable
   this feature.
 
+- Supervision of child process startup time before exec in seconds.
+  The child process is supervised and is allowed to run for at most
+  OPENSAF_CHILD_EXEC_TIME_TOLERANCE seconds. If the child process hangs before
+  exec, e.g. in a close or a syslog call, a SIGALRM is raised and the child process is aborted.
+  Default value for OPENSAF_CHILD_EXEC_TIME_TOLERANCE is 2 seconds.
+
 *******************************************************************************
 nodeinit.conf.<node_type>
 
diff --git a/osaf/libs/core/leap/os_defs.c b/osaf/libs/core/leap/os_defs.c
--- a/osaf/libs/core/leap/os_defs.c
+++ b/osaf/libs/core/leap/os_defs.c
@@ -65,6 +65,15 @@ bool gl_ncs_atomic_mtx_initialise = fals
  * description of SOCK_CLOEXEC. */
 static pthread_mutex_t s_cloexec_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/*
+ * SIGALRM is used to detect if the child process takes too long before exec.
+ *
+ * @param sig signal number (unused)
+ */
+static void sigalrm_handler(int sig)
+{
+       abort();
+}
 /***************************************************************************
  *
  * uns64
@@ -999,6 +1008,22 @@ uint32_t ncs_os_process_execute_timed(NC
        osaf_mutex_lock_ordie(&s_cloexec_mutex);
 
        if ((pid = fork()) == 0) {
+                unsigned int alarm_time_sec;
+                char* alarm_time;
+            
+                if (signal(SIGALRM, sigalrm_handler) == SIG_ERR) {
+                        LOG_ER("signal ALRM failed: %s", strerror(errno));
+                }
+                if ((alarm_time = getenv("OPENSAF_CHILD_EXEC_TIME_TOLERANCE")) 
!= NULL) {
+                        alarm_time_sec = strtol(alarm_time, NULL, 0);
+                }
+                else {
+                        // default alarm timeout 2 seconds
+                        alarm_time_sec = 2;
+                }
+            
+                alarm(alarm_time_sec);
+            
                /*
                 ** Make sure forked processes have default scheduling class
                 ** independent of the callers scheduling class.
@@ -1054,6 +1079,8 @@ uint32_t ncs_os_process_execute_timed(NC
                }
 #endif
 
+                alarm(0);
+                
                /* child part */
                if (execvp(req->i_script, req->i_argv) == -1) {
                        syslog(LOG_ERR, "%s: execvp '%s' failed - %s", 
__FUNCTION__, req->i_script, strerror(errno));
diff --git a/osaf/services/infrastructure/nid/config/nid.conf b/osaf/services/infrastructure/nid/config/nid.conf
--- a/osaf/services/infrastructure/nid/config/nid.conf
+++ b/osaf/services/infrastructure/nid/config/nid.conf
@@ -33,3 +33,6 @@ export OPENSAF_REBOOT_TIMEOUT=60
 export OPENSAF_GROUP=opensaf
 export OPENSAF_USER=opensaf
 
+# Specify the allowed child process startup time before exec in seconds. 
+# Default is 2 seconds.
+# export OPENSAF_CHILD_EXEC_TIME_TOLERANCE=2

------------------------------------------------------------------------------
Get 100% visibility into Java/.NET code with AppDynamics Lite!
It's a free troubleshooting tool designed for production.
Get down to code-level detail for bottlenecks, with <2% overhead. 
Download for free and get started troubleshooting in minutes. 
http://pubads.g.doubleclick.net/gampad/clk?id=48897031&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to