We ran strace on the sge_qmaster process, and this is what we got at the end of the trace (the generated file was 95 GB) - below are the last 80 lines:
[pid 1049] futex(0x1ee63d4, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 101147186, {1458258881, 623057000}, ffffffff <unfinished ...>
[pid 1050] <... futex resumed> ) = -1 EAGAIN (Resource temporarily unavailable)
[pid 1050] futex(0x1ee63d4, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 101147186, {1458258881, 623051000}, ffffffff <unfinished ...>
[pid 1048] open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO (No such device or address)
[pid 1048] writev(2, [{"*** glibc detected *** ", 23}, {"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {": ", 2}, {"double free or corruption (fastt"..., 35}, {": 0x", 4}, {"00007fc6a14481d0", 16}, {" ***\n", 5}], 7) = 118
[pid 1048] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc6fb1a4000
[pid 1048] open("/opt/sge/bin/lx-amd64/../../lib/lx-amd64/libgcc_s.so.1", O_RDONLY) = -1 ENOENT (No such file or directory)
[pid 1048] open("/etc/ld.so.cache", O_RDONLY) = 9
[pid 1048] fstat(9, {st_mode=S_IFREG|0644, st_size=134739, ...}) = 0
[pid 1048] mmap(NULL, 134739, PROT_READ, MAP_PRIVATE, 9, 0) = 0x7fc6f0766000
[pid 1048] close(9) = 0
[pid 1048] open("/lib64/libgcc_s.so.1", O_RDONLY) = 9
[pid 1048] read(9, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20)\0{3\0\0\0"..., 832) = 832
[pid 1048] fstat(9, {st_mode=S_IFREG|0755, st_size=93320, ...}) = 0
[pid 1048] mmap(0x337b000000, 2186584, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 9, 0) = 0x337b000000
[pid 1048] mprotect(0x337b016000, 2093056, PROT_NONE) = 0
[pid 1048] mmap(0x337b215000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 9, 0x15000) = 0x337b215000
[pid 1048] close(9) = 0
[pid 1048] munmap(0x7fc6f0766000, 134739) = 0
[pid 1048] futex(0x3377b92620, FUTEX_WAKE_PRIVATE, 2147483647) = 0
[pid 1048] futex(0x337b215af0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
[pid 1048] write(2, "======= Backtrace: =========\n", 29) = 29
[pid 1048] writev(2, [{"/lib64/libc.so.6", 16}, {"[0x", 3}, {"3377876166", 10}, {"]\n", 2}], 4) = 31
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_free", 8}, {"+0x", 3}, {"16", 2}, {")", 1}, {"[0x", 3}, {"5e3e76", 6}, {"]\n", 2}], 9) = 59
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"lSetString", 10}, {"+0x", 3}, {"d8", 2}, {")", 1}, {"[0x", 3}, {"590238", 6}, {"]\n", 2}], 9) = 61
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4cbfbf", 6}, {"]\n", 2}], 4) = 44
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4cd565", 6}, {"]\n", 2}], 4) = 44
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_select_parallel_environment", 31}, {"+0x", 3}, {"939", 3}, {")", 1}, {"[0x", 3}, {"4ceb59", 6}, {"]\n", 2}], 9) = 83
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"verify_suitable_queues", 22}, {"+0x", 3}, {"3cc", 3}, {")", 1}, {"[0x", 3}, {"45d03c", 6}, {"]\n", 2}], 9) = 74
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_job_verify_adjust", 21}, {"+0x", 3}, {"2506", 4}, {")", 1}, {"[0x", 3}, {"46c346", 6}, {"]\n", 2}], 9) = 74
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_gdi_add_job", 15}, {"+0x", 3}, {"1b7", 3}, {")", 1}, {"[0x", 3}, {"468e07", 6}, {"]\n", 2}], 9) = 67
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"[0x", 3}, {"4428a5", 6}, {"]\n", 2}], 4) = 44
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_c_gdi", 9}, {"+0x", 3}, {"4ef", 3}, {")", 1}, {"[0x", 3}, {"44525f", 6}, {"]\n", 2}], 9) = 61
[pid 1048] writev(2, [{"/opt/sge/bin/lx-amd64/sge_qmaste"..., 33}, {"(", 1}, {"sge_worker_main", 15}, {"+0x", 3}, {"28f", 3}, {")", 1}, {"[0x", 3}, {"42feff", 6}, {"]\n", 2}], 9) = 67
[pid 1048] writev(2, [{"/lib64/libpthread.so.0", 22}, {"[0x", 3}, {"33780079d1", 10}, {"]\n", 2}], 4) = 37
[pid 1048] writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"clone", 5}, {"+0x", 3}, {"6d", 2}, {")", 1}, {"[0x", 3}, {"33778e8b6d", 10}, {"]\n", 2}], 9) = 43
[pid 1048] write(2, "======= Memory map: ========\n", 29) = 29
[pid 1048] open("/proc/self/maps", O_RDONLY) = 9
[pid 1048] read(9, "00400000-0065a000 r-xp 00000000 "..., 1024) = 1024
[pid 1048] write(2, "00400000-0065a000 r-xp 00000000 "..., 1024) = 1024
[pid 1048] read(9, "so\n3377800000-337798b000 r-xp 00"..., 1024) = 1024
[pid 1048] write(2, "so\n3377800000-337798b000 r-xp 00"..., 1024) = 1024
[pid 1048] read(9, "17000 08:03 25821640 "..., 1024) = 1024
[pid 1048] write(2, "17000 08:03 25821640 "..., 1024) = 1024
[pid 1048] read(9, "0000-3379416000 r-xp 00000000 08"..., 1024) = 1024
[pid 1048] write(2, "0000-3379416000 r-xp 00000000 08"..., 1024) = 1024
[pid 1048] read(9, " /lib64/libcom_err.so.2.1\n337f"..., 1024) = 1024
[pid 1048] write(2, " /lib64/libcom_err.so.2.1\n337f"..., 1024) = 1024
[pid 1048] read(9, "64/libgssapi_krb5.so.2.2\n3380041"..., 1024) = 1024
[pid 1048] write(2, "64/libgssapi_krb5.so.2.2\n3380041"..., 1024) = 1024
[pid 1048] read(9, "tils.so.1.3\n3380a02000-3380a0300"..., 1024) = 1024
[pid 1048] write(2, "tils.so.1.3\n3380a02000-3380a0300"..., 1024) = 1024
[pid 1048] read(9, " \n7fc6ba459000-7fc6bc000000 ---p"..., 1024) = 1024
[pid 1048] write(2, " \n7fc6ba459000-7fc6bc000000 ---p"..., 1024) = 1024
[pid 1048] read(9, ":00 0 \n7fc6e21fe000-7fc6e2bfe000"..., 1024) = 1024
[pid 1048] write(2, ":00 0 \n7fc6e21fe000-7fc6e2bfe000"..., 1024) = 1024
[pid 1048] read(9, "ss_nis-2.12.so\n7fc6f1d93000-7fc6"..., 1024) = 1024
[pid 1048] write(2, "ss_nis-2.12.so\n7fc6f1d93000-7fc6"..., 1024) = 1024
[pid 1048] read(9, "25821212 /lib6"..., 1024) = 1024
[pid 1048] write(2, "25821212 /lib6"..., 1024) = 1024
[pid 1048] read(9, "1000-7fc6faf32000 rw-p 00137000 "..., 1024) = 791
[pid 1048] write(2, "1000-7fc6faf32000 rw-p 00137000 "..., 791) = 791
[pid 1048] read(9, "", 1024) = 0
[pid 1048] close(9) = 0
[pid 1048] rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
[pid 1048] tgkill(957, 1048, SIGABRT) = 0
[pid 1048] --- SIGABRT (Aborted) @ 0 (0) ---
Process 1048 detached
[pid 1051] +++ killed by SIGABRT +++
[pid 1050] +++ killed by SIGABRT +++
[pid 1049] +++ killed by SIGABRT +++
[pid 1047] +++ killed by SIGABRT +++
[pid 1046] +++ killed by SIGABRT +++
[pid 1045] +++ killed by SIGABRT +++
[pid 1044] +++ killed by SIGABRT +++
[pid 961] +++ killed by SIGABRT +++
[pid 959] +++ killed by SIGABRT +++
[pid 958] +++ killed by SIGABRT +++
[pid 960] +++ killed by SIGABRT +++
+++ killed by SIGABRT +++

The process died with SIGABRT - any clue what it could be? Reuti and the community - we would appreciate any help with resolving this issue.
Thank you.

-----Original Message-----
From: Yuri Burmachenko
Sent: Tuesday, March 15, 2016 9:11 AM
To: 'Reuti' <re...@staff.uni-marburg.de>
Cc: users@gridengine.org; Dmitry Leibovich <dmit...@mellanox.com>
Subject: RE: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month

Hello Reuti,

The spool directory is shared via an NFS share between the qmaster and shadow servers.

Thanks.

-----Original Message-----
From: Reuti [mailto:re...@staff.uni-marburg.de]
Sent: Monday, March 14, 2016 11:43 AM
To: Yuri Burmachenko <yur...@mellanox.com>
Cc: users@gridengine.org; Dmitry Leibovich <dmit...@mellanox.com>
Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month

Hi,

> On 13.03.2016 at 08:53, Yuri Burmachenko <yur...@mellanox.com> wrote:
>
> Hello Reuti,
>
> We will try that, but we have also found another issue.
>
> We also see that our SoGE fails and fails over from master to shadow and
> vice-versa at the same time this Job ID switch occurs:

The spool directory is shared between the qmaster and shadow daemons?
-- Reuti

> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster hard descriptor limit is set to 8192
> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster soft descriptor limit is set to 8192
> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster will use max. 8172 file descriptors for communication
> 03/13/2016 08:04:21| main|mtlxsge001|I|qmaster will accept max. 950 dynamic event clients
> 03/13/2016 08:04:21| main|mtlxsge001|I|starting up SGE 8.1.8 (lx-amd64)
>
> From qacct:
> jobnumber    351331
> start_time   Sun Mar 13 08:04:28 2016
> end_time     Sun Mar 13 08:05:04 2016
> jobnumber    351488
> start_time   Sun Mar 13 08:04:34 2016
> end_time     Sun Mar 13 08:05:05 2016
> jobnumber    351511
> start_time   Sun Mar 13 08:04:54 2016
> end_time     Sun Mar 13 08:05:05 2016
> jobnumber    351410
> start_time   Sun Mar 13 08:04:29 2016
> end_time     Sun Mar 13 08:05:07 2016
> jobnumber    351355
> start_time   Sun Mar 13 08:04:28 2016
> end_time     Sun Mar 13 08:05:07 2016
> jobnumber    351502
> start_time   Sun Mar 13 08:04:49 2016
> end_time     Sun Mar 13 08:05:08 2016
> jobnumber    9999253
> start_time   Sun Mar 13 08:04:56 2016
> end_time     Sun Mar 13 08:05:08 2016
> start_time   Sun Mar 13 08:04:28 2016
> end_time     Sun Mar 13 08:05:53 2016
> jobnumber    9999337
> start_time   Sun Mar 13 08:05:43 2016
> end_time     Sun Mar 13 08:05:53 2016
> jobnumber    9999254
> start_time   Sun Mar 13 08:04:56 2016
> end_time     Sun Mar 13 08:05:57 2016
>
> There is a correlation in time between the Job ID switch and the SoGE failure with its subsequent failover to another node.
>
> Basically, we now need to understand why SoGE fails...
>
> We would appreciate any tips and advice on this.
> Thank You.
>
>
> -----Original Message-----
> From: Reuti [mailto:re...@staff.uni-marburg.de]
> Sent: Tuesday, March 08, 2016 2:25 PM
> To: Yuri Burmachenko <yur...@mellanox.com>
> Cc: users@gridengine.org
> Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>
>
>> On 08.03.2016 at 10:59, Yuri Burmachenko <yur...@mellanox.com> wrote:
>>
>> Hello Reuti,
>>
>> See below:
>>
>> Job ID    Job schedule time
>> 97453     29-02-2016_03:18:55
>> 97454     29-02-2016_03:18:57
>> 9999563   29-02-2016_03:23:44
>> 9999564   29-02-2016_03:23:44
>> 9999565   29-02-2016_03:23:44
>> ....
>> 9999999   29-02-2016_03:27:34
>> 1         29-02-2016_03:27:35
>>
>> Any idea what could be the root cause and/or where to look?
>
> Interesting. One could try `incron` to spot any access to the file
> "jobseqnum".
>
> -- Reuti
>
>
>>
>> Thanks.
>>
>> -----Original Message-----
>> From: Reuti [mailto:re...@staff.uni-marburg.de]
>> Sent: Sunday, March 06, 2016 7:27 PM
>> To: Yuri Burmachenko <yur...@mellanox.com>
>> Cc: users@gridengine.org
>> Subject: Re: [gridengine users] SoGE 8.1.8 - Job IDs getting reset very fast 9999999 ==> 1 - 6-7 times in a month
>>
>> Hi,
>>
>> On 06.03.2016 at 18:04, Yuri Burmachenko wrote:
>>
>>> Hello to the distinguished forum members,
>>>
>>> Recently we found that something is wrong with our SGE Job IDs - they are getting reset very fast: 6-7 times in a month.
>>> We don't really execute that many jobs in such a short period of time.
>>>
>>> We use the Job ID (via qacct) as a primary key for various home-made analytic tools, and this very quick Job ID wrap-around impairs the reliability of those tools.
>>>
>>> This started after a full electricity shutdown during which we halted all our systems, including the SGE master/shadow and its execution hosts.
>>
>> To elaborate on this: when it suddenly jumps to 9999999, what was the highest JOB_ID recorded in the accounting file before that skip?
>>
>> -- Reuti
>>
>>
>>> Perhaps something sets $SGE_ROOT/default/spool/qmaster/jobseqnum to
>>> "9999999", and then something (related or not) restarts SGE with that
>>> job ID.
>>>
>>> Any tips and advice on where to look for the root cause will be greatly appreciated.
>>> Thank You.
>>>
>>>
>>>
>>> Yuri Burmachenko | Sr. Engineer | IT | Mellanox Technologies Ltd.
>>> Work: +972 74 7236386 | Cell: +972 54 7542188 | Fax: +972 4 959 3245
>>> Follow us on Twitter and Facebook
>>>
>>> _______________________________________________
>>> users mailing list
>>> users@gridengine.org
>>> https://gridengine.org/mailman/listinfo/users
>>
>>
>
>

_______________________________________________
users mailing list
users@gridengine.org
https://gridengine.org/mailman/listinfo/users
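For anyone digging into the crash at the top of the thread: the glibc backtrace arrives as fragmented writev() arguments in the strace output, which is hard to read. Below is a small throwaway sketch (not part of SGE - the `frames` data is copied verbatim from the writev() calls above, and `render` is a hypothetical helper) that tabulates those fragments into a conventional call stack, innermost frame first:

```python
# Tabulate the glibc backtrace frames seen in the strace output.
# Each tuple is (binary, symbol-or-None, offset-within-symbol), copied
# from the writev() fragments; frames with no symbol are shown as "??".
frames = [
    ("/lib64/libc.so.6", None, None),
    ("sge_qmaster", "sge_free", "0x16"),
    ("sge_qmaster", "lSetString", "0xd8"),
    ("sge_qmaster", None, None),
    ("sge_qmaster", None, None),
    ("sge_qmaster", "sge_select_parallel_environment", "0x939"),
    ("sge_qmaster", "verify_suitable_queues", "0x3cc"),
    ("sge_qmaster", "sge_job_verify_adjust", "0x2506"),
    ("sge_qmaster", "sge_gdi_add_job", "0x1b7"),
    ("sge_qmaster", None, None),
    ("sge_qmaster", "sge_c_gdi", "0x4ef"),
    ("sge_qmaster", "sge_worker_main", "0x28f"),
    ("/lib64/libpthread.so.0", None, None),
    ("/lib64/libc.so.6", "clone", "0x6d"),
]

def render(frames):
    """Format the frames innermost-first, matching glibc's backtrace order."""
    out = []
    for i, (binary, sym, off) in enumerate(frames):
        where = f"{sym}+{off}" if sym else "??"
        out.append(f"#{i:2d} {where:<40s} in {binary}")
    return "\n".join(out)

print(render(frames))
```

The unnamed frames could presumably be resolved with addr2line against a debug build, e.g. `addr2line -f -e /opt/sge/bin/lx-amd64/sge_qmaster 0x4cbfbf`, using the bracketed addresses from the backtrace; without debug symbols only the exported names above are available.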