Disregarding the fork issue there seems to be cases of syslog() blocking due to the syslog server. For example syslog-ng can use DNS in some configurations and could block due to that.
Same thing I guess if it is overloaded and does not empty its socket buffer. I guess (without knowing) that a client should not be affected if the server socket (/dev/log) is not available / service stopped. /Hans > -----Original Message----- > From: Anders Widell > Sent: den 9 augusti 2013 09:24 > To: Hans Feldt > Cc: Hans Nordebäck; praveen malviya; Nagendra Kumar; Ramesh Babu Betham; > [email protected] > Subject: Re: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child process > takes too long time before exec (#514) > > Wow, this was news to me. Ok then we must go through all functions we > use after fork() and check that they are safe. > > So do we know if syslog() in general can block (if we disregard the case > when it is used after fork(), which apparently is a illegal). > > regards, > Anders Widell > > 2013-08-09 09:17, Hans Feldt skrev: > > Yeah it seems that syslog in a child between fork() and exec() is unsecure. > > Googling reveals this: > > > > http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=454183 > > > > In search for #514 a syslog entry was added between fork() and exec() that > > increased the likeliness of this to happen. The remaining > syslogs between fork() and exec() are for error cases. I guess those need to > be stderr prints. The problem is that stderr is closed in the > child... > > We need to come up with a way to have stderr available in some trouble > > shooting scenarios > > > > /HansF > > > >> -----Original Message----- > >> From: Hans Nordebäck > >> Sent: den 9 augusti 2013 09:04 > >> To: praveen malviya; Nagendra Kumar; Ramesh Babu Betham; Hans Feldt > >> Cc: [email protected]; Anders Widell > >> Subject: Re: SV: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >> process takes too long time before exec (#514) > >> > >> Hi, we got feedback from one site with the alarm patch applied (#532). > >> In this case it hangs when calling syslog: > >> > >> #0 0x00007f03b0b9db35 in raise () from /lib64/libc.so.6 > >> #0 0x00007f03b0b9db35 in raise () from /lib64/libc.so.6 > >> #1 0x00007f03b0b9f111 in abort () from /lib64/libc.so.6 > >> #2 0x00007f03b1e1dd99 in sigalrm_handler (sig=5928) at os_defs.c:79 > >> #3 <signal handler called> > >> #4 0x00007f03b0c51dac in __lll_lock_wait_private () from /lib64/libc.so.6 > >> #5 0x00007f03b0c04e0d in _L_lock_1598 () from /lib64/libc.so.6 > >> #6 0x00007f03b0c04bc6 in __tz_convert () from /lib64/libc.so.6 > >> #7 0x00007f03b0c40f6a in __vsyslog_chk () from /lib64/libc.so.6 > >> #8 0x00007f03b0c415b0 in syslog () from /lib64/libc.so.6 > >> #9 0x00007f03b1e1e992 in ncs_os_process_execute_timed > >> (req=0x7fff10644ef0) at os_defs.c:1086 > >> #10 0x0000000000416d03 in avnd_comp_clc_cmd_execute (cb=0x6578c0, > >> comp=0x67f680, > >> cmd_type=AVND_COMP_CLC_CMD_TYPE_INSTANTIATE) at avnd_clc.c:2702 > >> #11 0x000000000041a810 in avnd_comp_clc_uninst_inst_hdler (cb=0x6578c0, > >> comp=0x67f680) at avnd_clc.c:1384 > >> #12 0x00000000004187fb in avnd_comp_clc_fsm_run (cb=0x6578c0, > >> comp=0x67f680, ev=AVND_COMP_CLC_PRES_FSM_EV_INST) > at > >> avnd_clc.c:874 > >> #13 0x000000000043438f in avnd_su_pres_insting_compinst_hdler > >> (cb=0x6578c0, su=0x676380, comp=0x680750) at > avnd_susm.c:1759 > >> #14 0x0000000000435bc4 in avnd_su_pres_fsm_run (cb=0x6578c0, su=0x676380, > >> comp=0x680750, > >> ev=AVND_SU_PRES_FSM_EV_COMP_INSTANTIATED) at avnd_susm.c:1293 > >> #15 0x00000000004175cb in avnd_comp_clc_st_chng_prc (cb=0x6578c0, > >> comp=0x680750, > >> prv_st=SA_AMF_PRESENCE_INSTANTIATING, > >> final_st=SA_AMF_PRESENCE_INSTANTIATED) at avnd_clc.c:1313 > >> #16 0x000000000041888f in avnd_comp_clc_fsm_run (cb=0x6578c0, > >> comp=0x680750, > >> ev=AVND_COMP_CLC_PRES_FSM_EV_INST_SUCC) at avnd_clc.c:897 > >> #17 0x000000000041adb8 in avnd_evt_clc_resp_evh (cb=0x6578c0, > >> evt=0x67d0e0) at avnd_clc.c:446 > >> #18 0x00000000004300e0 in avnd_evt_process (evt=<optimized out>) at > >> avnd_proc.c:278 > >> #19 avnd_main_process () at avnd_proc.c:219 > >> #20 0x0000000000408815 in main (argc=1, argv=0x7fff10645988) at > >> amfnd_main.c:61 > >> > >> > >> /HansN > >> > >> On 08/06/13 09:34, praveen malviya wrote: > >>> Hi, > >>> I added a new syslog entry(to debug) before execv and was able > >>> to reproduce the problem. > >>> The reason for the time out seems to be that syslog was *hung*, but > >>> *after* writing the new log message into /var/log/messages. > >>> So, there is a high possibility that the child process gets hung in > >>> syslog or other file operations. > >>> See bt below: > >>> Thread 1 (Thread 0x7f030a16f700 (LWP 16331)): > >>> > >>> #0 0x00007f0308de4b1e in __lll_lock_wait_private () from > >>> /lib64/libc.so.6 > >>> #1 0x00007f0308dd4398 in _L_lock_634 () from /lib64/libc.so.6 > >>> #2 0x00007f0308dd3e62 in __vsyslog_chk () from /lib64/libc.so.6 > >>> #3 0x00007f0308dd4320 in syslog () from /lib64/libc.so.6 > >>> #4 0x00007f0309d38feb in ncs_os_process_execute_timed > >>> (req=0x7fffdcc52770, flag=false) at os_defs.c:1059 > >>> #5 0x0000000000416bd6 in avnd_comp_clc_cmd_execute (cb=0x6578a0, > >>> comp=0x673840, cmd_type=AVND_COMP_CLC_CMD_TYPE_INSTANTIATE) > >>> at avnd_clc.c:2687 > >>> #6 0x000000000041a630 in avnd_comp_clc_uninst_inst_hdler > >>> (cb=0x6578a0, comp=0x673840) at avnd_clc.c:1384 > >>> #7 0x000000000041865b in avnd_comp_clc_fsm_run (cb=0x6578a0, > >>> comp=0x673840, ev=AVND_COMP_CLC_PRES_FSM_EV_INST) > >>> at avnd_clc.c:874 > >>> #8 0x0000000000433f5f in avnd_su_pres_insting_compinst_hdler > >>> (cb=0x6578a0, su=0x66fc70, comp=0x674e10) at avnd_susm.c:1759 > >>> #9 0x0000000000435764 in avnd_su_pres_fsm_run (cb=0x6578a0, > >>> su=0x66fc70, comp=0x674e10, > >>> ev=AVND_SU_PRES_FSM_EV_COMP_INSTANTIATED) at avnd_susm.c:1293 > >>> #10 0x000000000041748b in avnd_comp_clc_st_chng_prc (cb=0x6578a0, > >>> comp=0x674e10, prv_st=SA_AMF_PRESENCE_INSTANTIATING, > >>> final_st=SA_AMF_PRESENCE_INSTANTIATED) at avnd_clc.c:1313 > >>> #11 0x00000000004186ef in avnd_comp_clc_fsm_run (cb=0x6578a0, > >>> comp=0x674e10, ev=AVND_COMP_CLC_PRES_FSM_EV_INST_SUCC) > >>> at avnd_clc.c:897 > >>> #12 0x000000000041f400 in avnd_comp_reg_prc (cb=0x6578a0, > >>> comp=0x674e10, pxy_comp=<optimized out>, reg=0x677528, > >>> dest=0x677518) at avnd_comp.c:709 > >>> #13 0x000000000041fbd7 in avnd_evt_ava_comp_reg_evh (cb=0x6578a0, > >>> evt=0x683530) at avnd_comp.c:212 > >>> #14 0x000000000042fcc8 in avnd_evt_process (evt=<optimized out>) at > >>> avnd_proc.c:278 > >>> #15 avnd_main_process () at avnd_proc.c:219 > >>> #16 0x0000000000408835 in main (argc=2, argv=0x7fffdcc53588) at > >>> amfnd_main.c:53 > >>> > >>> > >>> Thanks, > >>> Praveen. > >>> > >>> > >>> On 03-Aug-13 9:47 PM, Hans Nordebäck wrote: > >>>> Hi, > >>>> > >>>> I think the problem with "child before exec" "hangs" and the problem > >>>> where "child" at cleanup times out instantly, even though a 10 sec. > >>>> timeout is set, are related. > >>>> > >>>> Some reasons why the child seems to hang could be: > >>>> > >>>> 1. Not run at all > >>>> 2. "Hangs" in e.g. close. > >>>> 3. Crashes, but then whe should have core dumps. > >>>> > >>>> but the problem with "hanging" and instantly times out could be: > >>>> > >>>> 4. When parent process adds the pid in the patricia tree, > >>>> add_new_req_pid_in_list, ncs_exec_mod_hdlr > >>>> is able to kill this child pid before it has started. > >>>> A e.g. race condtion in this code could explain both these faults. > >>>> I looked briefly in this code and one thing is the delivery of the > >>>> SIGCHLD signal, any thread can get that signal, > >>>> e.g amfnd, mds, timer or ncs_exec_mod_hdlr. I think a dedicated > >>>> thread should handle SIGCHLD. > >>>> I guess we should look closer to this part. > >>>> > >>>> /BR HansN > >>>> ________________________________________ > >>>> Från: Hans Nordebäck > >>>> Skickat: den 30 juli 2013 13:05 > >>>> Till: Nagendra Kumar > >>>> Cc: Ramesh Babu Betham; [email protected]; Praveen > >>>> Malviya; Hans Feldt > >>>> Ämne: RE: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>> process takes too long time before exec (#514) > >>>> > >>>> Hi Nagu, I have tested the patch and simulated "hanging" in the child > >>>> part, the core dump produced points out where. > >>>> The system works as before, exept the core dump if it hangs in the > >>>> child part. We have this patch running at a site > >>>> waiting for it to trigger, so we don't know yet what is causing this > >>>> and if the patch will help. > >>>> > >>>> /BR HansN > >>>> > >>>> -----Original Message----- > >>>> From: Nagendra Kumar [mailto:[email protected]] > >>>> Sent: den 30 juli 2013 13:05 > >>>> To: Hans Nordebäck > >>>> Cc: Ramesh Babu Betham; [email protected]; Praveen > >>>> Malviya; Hans Feldt > >>>> Subject: RE: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>> process takes too long time before exec (#514) > >>>> > >>>> Hi Hans N, > >>>> > >>>> Close is not a sync call. I am still not convinced, these calls may > >>>> get blocked because of NFS system issue. > >>>> Have you tested by keeping abort in child process and see how amfnd > >>>> or other services behaves(just to simulate abort)? > >>>> > >>>> Thanks > >>>> -Nagu > >>>> > >>>> -----Original Message----- > >>>> From: Hans Nordebäck [mailto:[email protected]] > >>>> Sent: 30 July 2013 16:16 > >>>> To: Nagendra Kumar > >>>> Cc: Ramesh Babu Betham; [email protected]; Praveen > >>>> Malviya; Hans Feldt > >>>> Subject: Re: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>> process takes too long time before exec (#514) > >>>> > >>>> Hi Nagu, I agree with Ramesh, e.g. in the child part: > >>>> > >>>> /* By default we close all inherited file descriptors in the > >>>> child */ > >>>> if (getenv("OPENSAF_KEEP_FD_OPEN_AFTER_FORK") == NULL) { > >>>> /* Close all inherited file descriptors */ > >>>> int i = sysconf(_SC_OPEN_MAX); > >>>> if (i == -1) { > >>>> syslog(LOG_ERR, "%s: sysconf failed - %s", > >>>> __FUNCTION__, strerror(errno)); > >>>> exit(EXIT_FAILURE); > >>>> } > >>>> for (i--; i >= 0; --i) > >>>> (void) close(i); /* close all descriptors */ > >>>> > >>>> /* Redirect standard files to /dev/null */ > >>>> if (freopen("/dev/null", "r", stdin) == NULL) > >>>> syslog(LOG_ERR, "%s: freopen stdin failed - %s", > >>>> __FUNCTION__, strerror(errno)); > >>>> if (freopen("/dev/null", "w", stdout) == NULL) > >>>> syslog(LOG_ERR, "%s: freopen stdout failed - %s", > >>>> __FUNCTION__, strerror(errno)); > >>>> if (freopen("/dev/null", "w", stderr) == NULL) > >>>> syslog(LOG_ERR, "%s: freopen stderr failed - %s", > >>>> __FUNCTION__, strerror(errno)); > >>>> } > >>>> > >>>> close are issued. > >>>> > >>>> /BR HansN > >>>> > >>>> On 07/30/13 12:34, Nagendra Kumar wrote: > >>>>> But where are file operations coming during fork, can you please > >>>>> explain ? > >>>>> > >>>>> Thanks > >>>>> -Nagu > >>>>> -----Original Message----- > >>>>> From: Ramesh Betham > >>>>> Sent: 30 July 2013 15:53 > >>>>> To: Nagendra Kumar > >>>>> Cc: Hans Nordebäck; [email protected]; Praveen > >>>>> Malviya; Hans Feldt > >>>>> Subject: Re: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>>> process takes too long time before exec (#514) > >>>>> > >>>>> Nagu: Hans N might be pointing to the chances of hung of > >>>>> file-operation calls (esp. when some inconsistency happens with > >>>>> NFS). Just a guess, let Hans N confirm it. > >>>>> > >>>>> Thanks, > >>>>> Ramesh. > >>>>> > >>>>> On 7/30/2013 3:27 PM, Nagendra Kumar wrote: > >>>>>> Hi, > >>>>>> > >>>>>>>> regarding what can "hang" in the child part, e.g close of file > >>>>>>>> descriptors close of file descriptors. > >>>>>> When this can happen? After fork is successful, this shouldn't > >>>>>> happen. Can you please provide any example. > >>>>>> > >>>>>> Thanks > >>>>>> -Nagu > >>>>>> > >>>>>> -----Original Message----- > >>>>>> From: Hans Nordebäck [mailto:[email protected]] > >>>>>> Sent: 30 July 2013 12:53 > >>>>>> To: Nagendra Kumar > >>>>>> Cc: Hans Nordebäck; [email protected]; Praveen > >>>>>> Malviya; Ramesh Babu Betham; Hans Feldt > >>>>>> Subject: Re: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>>>> process takes too long time before exec (#514) > >>>>>> > >>>>>> Hi Nagu, regarding what can "hang" in the child part, e.g close of > >>>>>> file descriptors. /BR HansN On 07/30/13 09:01, Hans Nordebäck wrote: > >>>>>>> Hi Nagu, > >>>>>>> > >>>>>>> On 07/30/13 08:54, Nagendra Kumar wrote: > >>>>>>>> Hi Hans N, > >>>>>>>> > >>>>>>>>>> 1. OPENSAF_CHILD_EXEC_TIME_TOLERANCE is the name of a new > >>>>>>>>>> environment variable where value is used as input to alarm, if > >>>>>>>>>> not set it is default 2 seconds. > >>>>>>>> Do we have some place holder for this variable for configuration > >>>>>>>> and are we going to add it in README for information. > >>>>>>> Perhaps the name isn't the best, but it should be handled as the > >>>>>>> other env variable I guess, e.g. "AVND_PM_MONITORING_RATE", etc. > >>>>>>>>>> if the child "hangs" before exec this extra coredump should give > >>>>>>>>>> information where/what is wrong. > >>>>>>>> This means that fork hangs, am I right ? If yes, then dump is not > >>>>>>>> going to provide any information as it is a system call, it can > >>>>>>>> only show, ithangs in fork. > >>>>>>> I don't think fork hangs as the parent part continues and later, > >>>>>>> with the help of ncs_exec_mod_hdlr, the parent detects that the > >>>>>>> child or "exec" has timed out, > >>>>>>> 10 sec in this case. But in this case the exec has not been > >>>>>>> performed. > >>>>>>>>>> After exec, it will work as usual > >>>>>>>> This confirms that we are only targeting fork to debug. > >>>>>>> Yes, the extra core dump will help troubleshooting. > >>>>>>> /BR HansN > >>>>>>>> Thanks > >>>>>>>> -Nagu > >>>>>>>> > >>>>>>>> -----Original Message----- > >>>>>>>> From: Hans Nordebäck [mailto:[email protected]] > >>>>>>>> Sent: 30 July 2013 11:57 > >>>>>>>> To: Nagendra Kumar > >>>>>>>> Cc: [email protected]; Praveen Malviya; Ramesh > >>>>>>>> Babu Betham; Hans Feldt; Hans Nordebäck > >>>>>>>> Subject: RE: [PATCH 1 of 1] leap: ncs_os_process_execute_timed > >>>>>>>> child process takes too long time before exec (#514) > >>>>>>>> > >>>>>>>> Hi Nagu, > >>>>>>>> > >>>>>>>> 1. OPENSAF_CHILD_EXEC_TIME_TOLERANCE is the name of a new > >>>>>>>> environment variable where value is used as input to alarm, if not > >>>>>>>> set it is default 2 seconds. > >>>>>>>> 2. Yes you are right, in this particular case it is set to 10 sec, > >>>>>>>> that's why the env. variable above can be set. > >>>>>>>> 3. This alarm is just an additional precaution, at no extra cost, > >>>>>>>> to check the child part before the exec. After exec > >>>>>>>> it will work as usual but if the child "hangs" before exec > >>>>>>>> this extra coredump should give information where/what is wrong. > >>>>>>>> > >>>>>>>> /BR HansN > >>>>>>>> > >>>>>>>> -----Original Message----- > >>>>>>>> From: Nagendra Kumar [mailto:[email protected]] > >>>>>>>> Sent: den 30 juli 2013 07:11 > >>>>>>>> To: Hans Nordebäck; Praveen Malviya; Hans Feldt; Ramesh Babu Betham > >>>>>>>> Cc: [email protected] > >>>>>>>> Subject: RE: [PATCH 1 of 1] leap: ncs_os_process_execute_timed > >>>>>>>> child process takes too long time before exec (#514) > >>>>>>>> > >>>>>>>> Hi Hans N, > >>>>>>>> For my understanding, can you please provide the below > >>>>>>>> information: > >>>>>>>> > >>>>>>>> 1. I can't find OPENSAF_CHILD_EXEC_TIME_TOLERANCE in opensaf > >>>>>>>> source code. > >>>>>>>> 2. I hope the child process is hung for more than > >>>>>>>> saAmfCtDefClcCliTimeout resulting in CLC time out. Am I right? > >>>>>>>> 3. Even we add assert in child process and we get core dump, but > >>>>>>>> it may not give any information as it got delayed because of > >>>>>>>> system issue. Are we targeting, which system call the child > >>>>>>>> process is hung? > >>>>>>>> > >>>>>>>> Thanks > >>>>>>>> -Nagu > >>>>>>>> > >>>>>>>> -----Original Message----- > >>>>>>>> From: Hans Nordeback [mailto:[email protected]] > >>>>>>>> Sent: 22 July 2013 17:07 > >>>>>>>> To: Nagendra Kumar; Praveen Malviya; [email protected]; > >>>>>>>> Ramesh Babu Betham > >>>>>>>> Cc: [email protected] > >>>>>>>> Subject: [PATCH 1 of 1] leap: ncs_os_process_execute_timed child > >>>>>>>> process takes too long time before exec (#514) > >>>>>>>> > >>>>>>>> osaf/libs/core/leap/os_defs.c | 27 +++++++++++++++++++++++++++ > >>>>>>>> 1 files changed, 27 insertions(+), 0 deletions(-) > >>>>>>>> > >>>>>>>> > >>>>>>>> amfnd calls ncs_os_process_execute_timed and the child process > >>>>>>>> takes too long time before exec, (10 sec timeout). An alarm is set > >>>>>>>> in the ncs_os_process_execute_timed child process. If timed out a > >>>>>>>> core dump will be produced to be able to trouble shoot. > >>>>>>>> > >>>>>>>> diff --git a/osaf/libs/core/leap/os_defs.c > >>>>>>>> b/osaf/libs/core/leap/os_defs.c > >>>>>>>> --- a/osaf/libs/core/leap/os_defs.c > >>>>>>>> +++ b/osaf/libs/core/leap/os_defs.c > >>>>>>>> @@ -65,6 +65,15 @@ bool gl_ncs_atomic_mtx_initialise = fals > >>>>>>>> * description of SOCK_CLOEXEC. */ > >>>>>>>> static pthread_mutex_t s_cloexec_mutex = > >>>>>>>> PTHREAD_MUTEX_INITIALIZER; > >>>>>>>> +/* > >>>>>>>> + * ALRM signal is used to detect if child process takes too long > >>>>>>>> time before exec. > >>>>>>>> + * > >>>>>>>> + * @param sig > >>>>>>>> + */ > >>>>>>>> +static void sigalrm_handler(int sig) { > >>>>>>>> + abort(); > >>>>>>>> +} > >>>>>>>> /*************************************************************************** > >>>>>>>> > >>>>>>>> * > >>>>>>>> * uns64 > >>>>>>>> @@ -999,6 +1008,22 @@ uint32_t ncs_os_process_execute_timed(NC > >>>>>>>> osaf_mutex_lock_ordie(&s_cloexec_mutex); > >>>>>>>> if ((pid = fork()) == 0) { > >>>>>>>> + unsigned int alarm_time_sec; > >>>>>>>> + char* alarm_time; > >>>>>>>> + > >>>>>>>> + if (signal(SIGALRM, sigalrm_handler) == SIG_ERR) { > >>>>>>>> + LOG_ER("signal ALRM failed: %s", > >>>>>>>> strerror(errno)); > >>>>>>>> + } > >>>>>>>> + if ((alarm_time = > >>>>>>>> getenv("OPENSAF_CHILD_EXEC_TIME_TOLERANCE")) != NULL) { > >>>>>>>> + alarm_time_sec = strtol(alarm_time, > >>>>>>>> NULL, 0); > >>>>>>>> + } > >>>>>>>> + else { > >>>>>>>> + // default alarm timeout 2 seconds > >>>>>>>> + alarm_time_sec = 2; > >>>>>>>> + } > >>>>>>>> + > >>>>>>>> + alarm(alarm_time_sec); > >>>>>>>> + > >>>>>>>> /* > >>>>>>>> ** Make sure forked processes have default > >>>>>>>> scheduling class > >>>>>>>> ** independent of the callers scheduling class. > >>>>>>>> @@ -1054,6 +1079,8 @@ uint32_t ncs_os_process_execute_timed(NC > >>>>>>>> } > >>>>>>>> #endif > >>>>>>>> + alarm(0); > >>>>>>>> + > >>>>>>>> /* child part */ > >>>>>>>> if (execvp(req->i_script, req->i_argv) == -1) { > >>>>>>>> syslog(LOG_ERR, "%s: execvp '%s' failed - %s", > >>>>>>>> __FUNCTION__, req->i_script, strerror(errno)); > ------------------------------------------------------------------------------ Get 100% visibility into Java/.NET code with AppDynamics Lite! It's a free troubleshooting tool designed for production. Get down to code-level detail for bottlenecks, with <2% overhead. Download for free and get started troubleshooting in minutes. http://pubads.g.doubleclick.net/gampad/clk?id=48897031&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
