00-README.conf | 6 +++++ osaf/libs/core/leap/os_defs.c | 27 ++++++++++++++++++++++++ osaf/services/infrastructure/nid/config/nid.conf | 3 ++ 3 files changed, 36 insertions(+), 0 deletions(-)
When calling ncs_os_process_execute_timed, the child process may take too long before exec; it may hang in e.g. a close or a syslog call. An alarm is therefore set in the ncs_os_process_execute_timed child process. If the alarm times out, a core dump is produced to make troubleshooting possible. diff --git a/00-README.conf b/00-README.conf --- a/00-README.conf +++ b/00-README.conf @@ -57,6 +57,12 @@ group/user. escalated to an immediate reboot via the SysRq interface, or zero to disable this feature. +- Supervision of child process startup time before exec in seconds. + The child process is now supervised and is allowed to run up to maximum + OPENSAF_CHILD_EXEC_TIME_TOLERANCE in seconds. If the child process hangs before + exec in e.g. a close or a syslog call, a SIGALRM will be raised and the child process will be aborted. + Default value for OPENSAF_CHILD_EXEC_TIME_TOLERANCE is 2 seconds. + ******************************************************************************* nodeinit.conf.<node_type> diff --git a/osaf/libs/core/leap/os_defs.c b/osaf/libs/core/leap/os_defs.c --- a/osaf/libs/core/leap/os_defs.c +++ b/osaf/libs/core/leap/os_defs.c @@ -65,6 +65,15 @@ bool gl_ncs_atomic_mtx_initialise = fals * description of SOCK_CLOEXEC. */ static pthread_mutex_t s_cloexec_mutex = PTHREAD_MUTEX_INITIALIZER; +/* + * ALRM signal is used to detect if child process takes too long time before exec. 
+ * + * @param sig + */ +static void sigalrm_handler(int sig) +{ + abort(); +} /*************************************************************************** * * uns64 @@ -999,6 +1008,22 @@ uint32_t ncs_os_process_execute_timed(NC osaf_mutex_lock_ordie(&s_cloexec_mutex); if ((pid = fork()) == 0) { + unsigned int alarm_time_sec; + char* alarm_time; + + if (signal(SIGALRM, sigalrm_handler) == SIG_ERR) { + LOG_ER("signal ALRM failed: %s", strerror(errno)); + } + if ((alarm_time = getenv("OPENSAF_CHILD_EXEC_TIME_TOLERANCE")) != NULL) { + alarm_time_sec = strtol(alarm_time, NULL, 0); + } + else { + // default alarm timeout 2 seconds + alarm_time_sec = 2; + } + + alarm(alarm_time_sec); + /* ** Make sure forked processes have default scheduling class ** independent of the callers scheduling class. @@ -1054,6 +1079,8 @@ uint32_t ncs_os_process_execute_timed(NC } #endif + alarm(0); + /* child part */ if (execvp(req->i_script, req->i_argv) == -1) { syslog(LOG_ERR, "%s: execvp '%s' failed - %s", __FUNCTION__, req->i_script, strerror(errno)); diff --git a/osaf/services/infrastructure/nid/config/nid.conf b/osaf/services/infrastructure/nid/config/nid.conf --- a/osaf/services/infrastructure/nid/config/nid.conf +++ b/osaf/services/infrastructure/nid/config/nid.conf @@ -33,3 +33,6 @@ export OPENSAF_REBOOT_TIMEOUT=60 export OPENSAF_GROUP=opensaf export OPENSAF_USER=opensaf +# Specify the allowed child process startup time before exec in seconds. +# Default is 2 seconds. +# export OPENSAF_CHILD_EXEC_TIME_TOLERANCE=2 ------------------------------------------------------------------------------ Get 100% visibility into Java/.NET code with AppDynamics Lite! It's a free troubleshooting tool designed for production. Get down to code-level detail for bottlenecks, with <2% overhead. Download for free and get started troubleshooting in minutes. 
http://pubads.g.doubleclick.net/gampad/clk?id=48897031&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel