00-README.conf | 6 +++++
osaf/libs/core/leap/os_defs.c | 27 ++++++++++++++++++++++++
osaf/services/infrastructure/nid/config/nid.conf | 3 ++
3 files changed, 36 insertions(+), 0 deletions(-)
When calling ncs_os_process_execute_timed, the child process may take too long
before exec; it may hang in e.g. a close or a syslog call. An alarm is therefore
set in the ncs_os_process_execute_timed child process. If the alarm times out,
a core dump is produced to make troubleshooting possible.
diff --git a/00-README.conf b/00-README.conf
--- a/00-README.conf
+++ b/00-README.conf
@@ -57,6 +57,12 @@ group/user.
escalated to an immediate reboot via the SysRq interface, or zero to disable
this feature.
+- Supervision of child process startup time before exec, in seconds.
+ The child process is supervised and may run for at most
+ OPENSAF_CHILD_EXEC_TIME_TOLERANCE seconds before exec. If it hangs before exec,
+ e.g. in a close or a syslog call, SIGALRM is raised and the child is aborted.
+ The default value of OPENSAF_CHILD_EXEC_TIME_TOLERANCE is 2 seconds.
+
*******************************************************************************
nodeinit.conf.<node_type>
diff --git a/osaf/libs/core/leap/os_defs.c b/osaf/libs/core/leap/os_defs.c
--- a/osaf/libs/core/leap/os_defs.c
+++ b/osaf/libs/core/leap/os_defs.c
@@ -65,6 +65,15 @@ bool gl_ncs_atomic_mtx_initialise = fals
* description of SOCK_CLOEXEC. */
static pthread_mutex_t s_cloexec_mutex = PTHREAD_MUTEX_INITIALIZER;
+/*
+ * SIGALRM handler: abort() a child process that takes too long before exec.
+ *
+ * @param sig signal number (not used)
+ */
+static void sigalrm_handler(int sig)
+{
+ abort();
+}
/***************************************************************************
*
* uns64
@@ -999,6 +1008,22 @@ uint32_t ncs_os_process_execute_timed(NC
osaf_mutex_lock_ordie(&s_cloexec_mutex);
if ((pid = fork()) == 0) {
+ unsigned int alarm_time_sec;
+ char* alarm_time;
+
+ if (signal(SIGALRM, sigalrm_handler) == SIG_ERR) {
+ LOG_ER("signal ALRM failed: %s", strerror(errno));
+ }
+ if ((alarm_time = getenv("OPENSAF_CHILD_EXEC_TIME_TOLERANCE"))
!= NULL) {
+ alarm_time_sec = strtol(alarm_time, NULL, 0);
+ }
+ else {
+ // default alarm timeout 2 seconds
+ alarm_time_sec = 2;
+ }
+
+ alarm(alarm_time_sec);
+
/*
** Make sure forked processes have default scheduling class
** independent of the callers scheduling class.
@@ -1054,6 +1079,8 @@ uint32_t ncs_os_process_execute_timed(NC
}
#endif
+ alarm(0);
+
/* child part */
if (execvp(req->i_script, req->i_argv) == -1) {
syslog(LOG_ERR, "%s: execvp '%s' failed - %s",
__FUNCTION__, req->i_script, strerror(errno));
diff --git a/osaf/services/infrastructure/nid/config/nid.conf b/osaf/services/infrastructure/nid/config/nid.conf
--- a/osaf/services/infrastructure/nid/config/nid.conf
+++ b/osaf/services/infrastructure/nid/config/nid.conf
@@ -33,3 +33,6 @@ export OPENSAF_REBOOT_TIMEOUT=60
export OPENSAF_GROUP=opensaf
export OPENSAF_USER=opensaf
+# Specify the allowed child process startup time before exec in seconds.
+# Default is 2 seconds.
+# export OPENSAF_CHILD_EXEC_TIME_TOLERANCE=2
------------------------------------------------------------------------------
Get 100% visibility into Java/.NET code with AppDynamics Lite!
It's a free troubleshooting tool designed for production.
Get down to code-level detail for bottlenecks, with <2% overhead.
Download for free and get started troubleshooting in minutes.
http://pubads.g.doubleclick.net/gampad/clk?id=48897031&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel