> On 23.03.2016 at 14:56, Yuri Burmachenko <yur...@mellanox.com> wrote:
>
> We have put strace on the sge_qmaster process and this is what we got at the end of the trace (the generated file was 95G) - below are the last 80 lines:
>
> [pid 1049] futex(0x1ee63d4, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 101147186, {1458258881, 623057000}, ffffffff <unfinished ...>
> [pid 1050] <... futex resumed> ) = -1 EAGAIN (Resource temporarily unavailable)
> [pid 1050] futex(0x1ee63d4, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 101147186, {1458258881, 623051000}, ffffffff <unfinished ...>
> [pid 1048] open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO (No such device or address)
> [pid 1048] writev(2, [{"*** glibc detected *** ", 23}, {"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {": ", 2}, {"double free or corruption (fastt"..., 35}, {": 0x", 4}, {"00007fc6a14481d0", 16}, {" ***\n", 5}], 7) = 118
> [pid 1048] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc6fb1a4000
> [pid 1048] open("/opt/sge/bin/lx-amd64/../../lib/lx-amd64/libgcc_s.so.1", O_RDONLY) = -1 ENOENT (No such file or directory)
> [pid 1048] open("/etc/ld.so.cache", O_RDONLY) = 9
> [pid 1048] fstat(9, {st_mode=S_IFREG|0644, st_size=134739, ...}) = 0
> [pid 1048] mmap(NULL, 134739, PROT_READ, MAP_PRIVATE, 9, 0) = 0x7fc6f0766000
> [pid 1048] close(9) = 0
> [pid 1048] open("/lib64/libgcc_s.so.1", O_RDONLY) = 9
> [pid 1048] read(9, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20)\0{3\0\0\0"..., 832) = 832
> [pid 1048] fstat(9, {st_mode=S_IFREG|0755, st_size=93320, ...}) = 0
> [pid 1048] mmap(0x337b000000, 2186584, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 9, 0) = 0x337b000000
> [pid 1048] mprotect(0x337b016000, 2093056, PROT_NONE) = 0
> [pid 1048] mmap(0x337b215000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 9, 0x15000) = 0x337b215000
> [pid 1048] close(9) = 0
> [pid 1048] munmap(0x7fc6f0766000, 134739) = 0
> [pid 1048] futex(0x3377b92620, FUTEX_WAKE_PRIVATE, 2147483647) = 0
> [pid 1048] futex(0x337b215af0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
> [pid 1048] write(2, "======= Backtrace: =========\n", 29) = 29
> [pid 1048] writev(2, [{"/lib64/libc.so.6", 16}, {"[0x", 3}, {"3377876166", 10}, {"]\n", 2}], 4) = 31
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_free", 8}, {"+0x", 3}, {"16", 2}, {")", 1}, {"[0x", 3}, {"5e3e76", 6}, {"]\n", 2}], 9) = 59
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"lSetString", 10}, {"+0x", 3}, {"d8", 2}, {")", 1}, {"[0x", 3}, {"590238", 6}, {"]\n", 2}], 9) = 61
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4cbfbf", 6}, {"]\n", 2}], 4) = 44
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4cd565", 6}, {"]\n", 2}], 4) = 44
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_select_parallel_environment", 31}, {"+0x", 3}, {"939", 3}, {")", 1}, {"[0x", 3}, {"4ceb59", 6}, {"]\n", 2}], 9) = 83
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"verify_suitable_queues", 22}, {"+0x", 3}, {"3cc", 3}, {")", 1}, {"[0x", 3}, {"45d03c", 6}, {"]\n", 2}], 9) = 74
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_job_verify_adjust", 21}, {"+0x", 3}, {"2506", 4}, {")", 1}, {"[0x", 3}, {"46c346", 6}, {"]\n", 2}], 9) = 74
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_gdi_add_job", 15}, {"+0x", 3}, {"1b7", 3}, {")", 1}, {"[0x", 3}, {"468e07", 6}, {"]\n", 2}], 9) = 67
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4428a5", 6}, {"]\n", 2}], 4) = 44
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_c_gdi", 9}, {"+0x", 3}, {"4ef", 3}, {")", 1}, {"[0x", 3}, {"44525f", 6}, {"]\n", 2}], 9) = 61
> [pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_worker_main", 15}, {"+0x", 3}, {"28f", 3}, {")", 1}, {"[0x", 3}, {"42feff", 6}, {"]\n", 2}], 9) = 67
> [pid 1048] writev(2, [{"/lib64/libpthread.so.0", 22}, {"[0x", 3}, {"33780079d1", 10}, {"]\n", 2}], 4) = 37
> [pid 1048] writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"clone", 5}, {"+0x", 3}, {"6d", 2}, {")", 1}, {"[0x", 3}, {"33778e8b6d", 10}, {"]\n", 2}], 9) = 43
> [pid 1048] write(2, "======= Memory map: ========\n", 29) = 29
> [pid 1048] open("/proc/self/maps", O_RDONLY) = 9
> [pid 1048] read(9, "00400000-0065a000 r-xp 00000000 "..., 1024) = 1024
> [pid 1048] write(2, "00400000-0065a000 r-xp 00000000 "..., 1024) = 1024
> [pid 1048] read(9, "so\n3377800000-337798b000 r-xp 00"..., 1024) = 1024
> [pid 1048] write(2, "so\n3377800000-337798b000 r-xp 00"..., 1024) = 1024
> [pid 1048] read(9, "17000 08:03 25821640 "..., 1024) = 1024
> [pid 1048] write(2, "17000 08:03 25821640 "..., 1024) = 1024
> [pid 1048] read(9, "0000-3379416000 r-xp 00000000 08"..., 1024) = 1024
> [pid 1048] write(2, "0000-3379416000 r-xp 00000000 08"..., 1024) = 1024
> [pid 1048] read(9, " /lib64/libcom_err.so.2.1\n337f"..., 1024) = 1024
> [pid 1048] write(2, " /lib64/libcom_err.so.2.1\n337f"..., 1024) = 1024
> [pid 1048] read(9, "64/libgssapi_krb5.so.2.2\n3380041"..., 1024) = 1024
> [pid 1048] write(2, "64/libgssapi_krb5.so.2.2\n3380041"..., 1024) = 1024
> [pid 1048] read(9, "tils.so.1.3\n3380a02000-3380a0300"..., 1024) = 1024
> [pid 1048] write(2, "tils.so.1.3\n3380a02000-3380a0300"..., 1024) = 1024
> [pid 1048] read(9, " \n7fc6ba459000-7fc6bc000000 ---p"..., 1024) = 1024
> [pid 1048] write(2, " \n7fc6ba459000-7fc6bc000000 ---p"..., 1024) = 1024
> [pid 1048] read(9, ":00 0 \n7fc6e21fe000-7fc6e2bfe000"..., 1024) = 1024
> [pid 1048] write(2, ":00 0 \n7fc6e21fe000-7fc6e2bfe000"..., 1024) = 1024
> [pid 1048] read(9, "ss_nis-2.12.so\n7fc6f1d93000-7fc6"..., 1024) = 1024
> [pid 1048] write(2, "ss_nis-2.12.so\n7fc6f1d93000-7fc6"..., 1024) = 1024
> [pid 1048] read(9, "25821212 /lib6"..., 1024) = 1024
> [pid 1048] write(2, "25821212 /lib6"..., 1024) = 1024
> [pid 1048] read(9, "1000-7fc6faf32000 rw-p 00137000 "..., 1024) = 791
> [pid 1048] write(2, "1000-7fc6faf32000 rw-p 00137000 "..., 791) = 791
> [pid 1048] read(9, "", 1024) = 0
> [pid 1048] close(9) = 0
> [pid 1048] rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
> [pid 1048] tgkill(957, 1048, SIGABRT) = 0
> [pid 1048] --- SIGABRT (Aborted) @ 0 (0) ---
> Process 1048 detached
> [pid 1051] +++ killed by SIGABRT +++
> [pid 1050] +++ killed by SIGABRT +++
> [pid 1049] +++ killed by SIGABRT +++
> [pid 1047] +++ killed by SIGABRT +++
> [pid 1046] +++ killed by SIGABRT +++
> [pid 1045] +++ killed by SIGABRT +++
> [pid 1044] +++ killed by SIGABRT +++
> [pid 961] +++ killed by SIGABRT +++
> [pid 959] +++ killed by SIGABRT +++
> [pid 958] +++ killed by SIGABRT +++
> [pid 960] +++ killed by SIGABRT +++
> +++ killed by SIGABRT +++
>
> The process got a SIGABRT signal - any clue what it could be?
> Reuti and Community - we would appreciate any help and assistance in resolving this issue.
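One way to make the quoted backtrace actionable: the raw addresses it prints (0x5e3e76 in sge_free, 0x590238 in lSetString, and so on) can be resolved to function names and source lines with addr2line, provided the sge_qmaster binary has not been stripped of symbols. A minimal sketch, using the binary path and addresses exactly as they appear in the trace above:

    # Resolve the backtrace addresses from the glibc abort message; this only
    # works if sge_qmaster still carries symbol (ideally debug) information.
    # The binary is mapped at 0x400000 (non-PIE, per the memory map), so the
    # raw addresses can be passed through unchanged.
    addr2line -f -C -e /opt/sge/bin/lx-amd64/sge_qmaster \
        0x5e3e76 0x590238 0x4cbfbf 0x4cd565 0x4ceb59 0x45d03c \
        0x46c346 0x468e07 0x4428a5 0x44525f 0x42feff

With symbols present, this would turn the anonymous frames (0x4cbfbf, 0x4cd565, 0x4428a5) into named functions and narrow down where the double free in the lSetString/sge_free path originates.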
Unfortunately I have no hint at hand how to solve it.

-- Reuti

> Thank You.
>
> -----Original Message-----
> From: Yuri Burmachenko
> Sent: Tuesday, March 15, 2016 9:11 AM
> To: 'Reuti' <re...@staff.uni-marburg.de>
> Cc: users@gridengine.org; Dmitry Leibovich <dmit...@mellanox.com>
> Subject: RE: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>
> Hello Reuti,
>
> The spool directory is shared via an NFS share between the qmaster and shadow servers.
>
> Thanks.
>
> -----Original Message-----
> From: Reuti [mailto:re...@staff.uni-marburg.de]
> Sent: Monday, March 14, 2016 11:43 AM
> To: Yuri Burmachenko <yur...@mellanox.com>
> Cc: users@gridengine.org; Dmitry Leibovich <dmit...@mellanox.com>
> Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>
> Hi,
>
>> On 13.03.2016 at 08:53, Yuri Burmachenko <yur...@mellanox.com> wrote:
>>
>> Hello Reuti,
>>
>> We will try that, but we have also found another issue.
>>
>> We see that our SoGE qmaster also fails and fails over from master to shadow and vice versa at the same time this switch in job ID occurs:
>
> The spool directory is shared between the qmaster and shadow daemons?
>
> -- Reuti
>
>> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster hard descriptor limit is set to 8192
>> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster soft descriptor limit is set to 8192
>> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster will use max. 8172 file descriptors for communication
>> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster will accept max. 950 dynamic event clients
>> 03/13/2016 08:04:21| main|mtlxsge001|I|starting up SGE 8.1.8 (lx-amd64)
>>
>> From qacct:
>> jobnumber   351331
>> start_time  Sun Mar 13 08:04:28 2016
>> end_time    Sun Mar 13 08:05:04 2016
>> jobnumber   351488
>> start_time  Sun Mar 13 08:04:34 2016
>> end_time    Sun Mar 13 08:05:05 2016
>> jobnumber   351511
>> start_time  Sun Mar 13 08:04:54 2016
>> end_time    Sun Mar 13 08:05:05 2016
>> jobnumber   351410
>> start_time  Sun Mar 13 08:04:29 2016
>> end_time    Sun Mar 13 08:05:07 2016
>> jobnumber   351355
>> start_time  Sun Mar 13 08:04:28 2016
>> end_time    Sun Mar 13 08:05:07 2016
>> jobnumber   351502
>> start_time  Sun Mar 13 08:04:49 2016
>> end_time    Sun Mar 13 08:05:08 2016
>> jobnumber   9999253
>> start_time  Sun Mar 13 08:04:56 2016
>> end_time    Sun Mar 13 08:05:08 2016
>> start_time  Sun Mar 13 08:04:28 2016
>> end_time    Sun Mar 13 08:05:53 2016
>> jobnumber   9999337
>> start_time  Sun Mar 13 08:05:43 2016
>> end_time    Sun Mar 13 08:05:53 2016
>> jobnumber   9999254
>> start_time  Sun Mar 13 08:04:56 2016
>> end_time    Sun Mar 13 08:05:57 2016
>>
>> There is a correlation in time between the job ID switch and the SoGE failure with the subsequent failover to another node.
>>
>> Basically we now need to understand why the SoGE qmaster fails...
>>
>> We will appreciate any tips and advice on this.
>> Thank You.
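Since the failover and the job ID wrap appear correlated, it may also help to locate every wrap in the shared accounting file and compare those timestamps against the qmaster messages file. A minimal sketch, assuming the stock accounting file location and the field layout from accounting(5), where job_number is field 6 and end_time field 11 of each colon-separated record; the threshold is an arbitrary guard that filters out jobs that merely finished out of order:

    # Flag places where the job number falls sharply between consecutive
    # accounting records - a sign of the sequence counter being reset.
    awk -F: 'prev != "" && prev - $6 > 1000000 {
                 printf "possible reset: %s -> %s (end_time %s)\n", prev, $6, $11
             }
             { prev = $6 }' "$SGE_ROOT/default/common/accounting"

The end_time values printed this way can then be matched against the qmaster startup messages quoted above to confirm that each reset coincides with a restart or failover.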
>>
>> -----Original Message-----
>> From: Reuti [mailto:re...@staff.uni-marburg.de]
>> Sent: Tuesday, March 08, 2016 2:25 PM
>> To: Yuri Burmachenko <yur...@mellanox.com>
>> Cc: users@gridengine.org
>> Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>>
>>> On 08.03.2016 at 10:59, Yuri Burmachenko <yur...@mellanox.com> wrote:
>>>
>>> Hello Reuti,
>>>
>>> See below:
>>>
>>> Job ID    Job schedule time
>>> 97453     29-02-2016_03:18:55
>>> 97454     29-02-2016_03:18:57
>>> 9999563   29-02-2016_03:23:44
>>> 9999564   29-02-2016_03:23:44
>>> 9999565   29-02-2016_03:23:44
>>> ....
>>> 9999999   29-02-2016_03:27:34
>>> 1         29-02-2016_03:27:35
>>>
>>> Any idea what the root cause could be and/or where to look?
>>
>> Interesting. One could try `incron` to spot any access to the file "jobseqnum".
>>
>> -- Reuti
>>
>>> Thanks.
>>>
>>> -----Original Message-----
>>> From: Reuti [mailto:re...@staff.uni-marburg.de]
>>> Sent: Sunday, March 06, 2016 7:27 PM
>>> To: Yuri Burmachenko <yur...@mellanox.com>
>>> Cc: users@gridengine.org
>>> Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>>>
>>> Hi,
>>>
>>> On 06.03.2016 at 18:04, Yuri Burmachenko wrote:
>>>
>>>> Hello to the distinguished forum members,
>>>>
>>>> Recently we have found that something is wrong with the SGE job IDs - they are getting reset very fast: 6-7 times in a month.
>>>> We don't really execute that many jobs in such a short period of time.
>>>>
>>>> We use the job ID (via qacct) as a primary key for different home-made analytic tools, and this very quick job ID wrap impairs the reliability of those tools.
>>>>
>>>> This started after a full electricity shutdown, during which we halted all our systems, including the SGE master/shadow and its execution hosts.
>>>
>>> To elaborate on this: when it suddenly jumps to 9999999, what was the highest JOB_ID recorded in the accounting file before that skip?
>>>
>>> -- Reuti
>>>
>>>> Perhaps something sets $SGE_ROOT/default/spool/qmaster/jobseqnum to "9999999" and then something (related or not) restarts SGE with that job ID.
>>>>
>>>> Any tips and advice on where to look for the root cause will be greatly appreciated.
>>>> Thank You.
>>>>
>>>> Yuri Burmachenko | Sr. Engineer | IT | Mellanox Technologies Ltd.
>>>> Work: +972 74 7236386 | Cell +972 54 7542188 | Fax: +972 4 959 3245
>>>> Follow us on Twitter and Facebook
>>>>
>>>> _______________________________________________
>>>> users mailing list
>>>> users@gridengine.org
>>>> https://gridengine.org/mailman/listinfo/users
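Picking up the `incron` idea from the thread: a minimal incrontab entry that logs every write to the sequence-number file to syslog. This is only a sketch - it assumes incron is installed and that SGE_ROOT is /opt/sge; adjust the path for your site, since incron tables take literal paths and do not expand environment variables:

    # Add with `incrontab -e` (the user must be permitted in /etc/incron.allow).
    # $@ expands to the watched path and $% to the textual event mask, so each
    # modification of jobseqnum produces a timestamped syslog entry.
    /opt/sge/default/spool/qmaster/jobseqnum IN_MODIFY,IN_CLOSE_WRITE logger -t jobseqnum "event $% on $@"

Matching the resulting syslog timestamps against the qmaster messages file should show whether the jump to 9999999 happens while the qmaster is running or exactly at a restart/failover.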