Hi all,
I am having a crazy time fixing an issue with 3 qinstances stuck in E state.
[root@rndusljpp2 opt]# qstat -explain E
queuename                      qtype resv/used/tot. load_avg arch          states
---------------------------------------------------------------------------------
allhosts.q@c1                  BIP   0/0/12         0.03     lx-amd64      E
queue allhosts.q marked QERROR as result of job 1546377's failure at host c1
queue allhosts.q marked QERROR as result of job 1546378's failure at host c1
queue allhosts.q marked QERROR as result of job 1546379's failure at host c1
queue allhosts.q marked QERROR as result of job 1546380's failure at host c1
queue allhosts.q marked QERROR as result of job 1546381's failure at host c1
queue allhosts.q marked QERROR as result of job 1546382's failure at host c1
queue allhosts.q marked QERROR as result of job 1546383's failure at host c1
queue allhosts.q marked QERROR as result of job 1546384's failure at host c1
queue allhosts.q marked QERROR as result of job 1546385's failure at host c1
queue allhosts.q marked QERROR as result of job 1546386's failure at host c1
queue allhosts.q marked QERROR as result of job 1546387's failure at host c1
queue allhosts.q marked QERROR as result of job 1546388's failure at host c1
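For what it's worth, qstat's queue-state filter shows just the error-state instances (standard -qs option; E selects queues in error):

qstat -f -qs E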
[root@rndusljpp2 opt]# qacct -j 1546377
==============================================================
qname allhosts.q
hostname c1
group mseierst
owner mseierst
project NONE
department defaultdepartment
jobname macrocycle_2D_s
jobnumber 1546377
taskid undefined
account sge
priority 0
qsub_time Wed Dec 31 16:00:00 1969
start_time -/-
end_time -/-
granted_pe NONE
slots 0
failed 1 : assumedly before job
exit_status 0
ru_wallclock 0s
ru_utime 0.000s
ru_stime 0.000s
ru_maxrss 0.000B
ru_ixrss 0.000B
ru_ismrss 0.000B
ru_idrss 0.000B
ru_isrss 0.000B
ru_minflt 0
ru_majflt 0
ru_nswap 0
ru_inblock 0
ru_oublock 0
ru_msgsnd 0
ru_msgrcv 0
ru_nsignals 0
ru_nvcsw 0
ru_nivcsw 0
cpu 0.000s
mem 0.000GBs
io 0.000GB
iow 0.000s
maxvmem 0.000B
arid undefined
ar_sub_time undefined
category -q allhosts.q
[root@rndusljpp2 opt]#
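To spot-check the other 11 jobs the same way, a quick loop like this (job IDs taken from the QERROR messages above) pulls out the relevant fields:

for j in $(seq 1546377 1546388); do qacct -j $j | grep -E 'failed|exit_status'; done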
From the qmaster messages log (/opt/sge/default/spool/qmaster/messages):
06/02/2016 15:17:14|worker|rndusljpp2|W|job 1546377.1 failed on host c1 general assumedly before job because: can't create directory active_jobs/1546377.1: No such file or directory
06/02/2016 15:17:14|worker|rndusljpp2|W|rescheduling job 1546377.1
06/02/2016 15:17:14|worker|rndusljpp2|E|queue allhosts.q marked QERROR as result of job 1546377's failure at host c1
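To pull all of the QERROR events out of the qmaster log at once, a plain grep over the messages file works (path taken from our spool layout above):

grep "marked QERROR" /opt/sge/default/spool/qmaster/messages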
qstat shows the job has been rescheduled to c5 and is running:
[root@rndusljpp2 common]# qstat -u "*"
job-ID  prior   name       user     state submit/start at     queue         slots ja-task-ID
-----------------------------------------------------------------------------------------------------------------
1386811 0.55500 macrocycle mseierst r     05/27/2016 10:02:53 allhosts.q@c6     1
1545754 0.55500 macrocycle mseierst r     06/02/2016 13:22:31 allhosts.q@c7     1
1545760 0.55500 macrocycle mseierst r     06/02/2016 13:23:08 allhosts.q@c7     1
1545772 0.55500 macrocycle mseierst r     06/02/2016 13:24:04 allhosts.q@c7     1
1545773 0.55500 macrocycle mseierst r     06/02/2016 13:24:10 allhosts.q@c7     1
1545780 0.55500 macrocycle mseierst r     06/02/2016 13:24:29 allhosts.q@c7     1
1545785 0.55500 macrocycle mseierst r     06/02/2016 13:24:57 allhosts.q@c7     1
1545787 0.55500 macrocycle mseierst r     06/02/2016 13:25:03 allhosts.q@c7     1
1545796 0.55500 macrocycle mseierst r     06/02/2016 13:25:45 allhosts.q@c7     1
1545806 0.55500 macrocycle mseierst r     06/02/2016 13:26:36 allhosts.q@c7     1
1545807 0.55500 macrocycle mseierst r     06/02/2016 13:26:36 allhosts.q@c7     1
1545815 0.55500 macrocycle mseierst r     06/02/2016 13:27:08 allhosts.q@c7     1
1545822 0.55500 macrocycle mseierst r     06/02/2016 13:27:54 allhosts.q@c7     1
1546062 0.55500 macrocycle mseierst r     06/02/2016 14:15:29 allhosts.q@c7     1
1546212 0.55500 macrocycle mseierst r     06/02/2016 14:46:40 allhosts.q@c7     1
1546313 0.55500 macrocycle mseierst r     06/02/2016 15:05:06 allhosts.q@c3     1
1546326 0.55500 macrocycle mseierst r     06/02/2016 15:06:38 allhosts.q@c8     1
1546327 0.55500 macrocycle mseierst r     06/02/2016 15:06:44 allhosts.q@c3     1
1546328 0.55500 macrocycle mseierst r     06/02/2016 15:06:47 allhosts.q@c8     1
1546331 0.55500 macrocycle mseierst r     06/02/2016 15:07:43 allhosts.q@c3     1
1546332 0.55500 macrocycle mseierst r     06/02/2016 15:07:43 allhosts.q@c3     1
1546333 0.55500 macrocycle mseierst r     06/02/2016 15:07:49 allhosts.q@c3     1
1546335 0.55500 macrocycle mseierst r     06/02/2016 15:07:55 allhosts.q@c8     1
1546336 0.55500 macrocycle mseierst r     06/02/2016 15:08:11 allhosts.q@c8     1
1546338 0.55500 macrocycle mseierst r     06/02/2016 15:10:07 allhosts.q@c6     1
1546340 0.55500 macrocycle mseierst r     06/02/2016 15:10:34 allhosts.q@c3     1
1546341 0.55500 macrocycle mseierst r     06/02/2016 15:10:34 allhosts.q@c8     1
1546343 0.55500 macrocycle mseierst r     06/02/2016 15:11:13 allhosts.q@c5     1
1546344 0.55500 macrocycle mseierst r     06/02/2016 15:11:19 allhosts.q@c8     1
1546346 0.55500 macrocycle mseierst r     06/02/2016 15:12:23 allhosts.q@c8     1
1546348 0.55500 macrocycle mseierst r     06/02/2016 15:12:43 allhosts.q@c7     1
1546349 0.55500 macrocycle mseierst r     06/02/2016 15:12:46 allhosts.q@c5     1
1546350 0.55500 macrocycle mseierst r     06/02/2016 15:12:46 allhosts.q@c6     1
1546351 0.55500 macrocycle mseierst r     06/02/2016 15:12:46 allhosts.q@c6     1
1546353 0.55500 macrocycle mseierst r     06/02/2016 15:12:52 allhosts.q@c8     1
1546354 0.55500 macrocycle mseierst r     06/02/2016 15:12:58 allhosts.q@c5     1
1546355 0.55500 macrocycle mseierst r     06/02/2016 15:13:01 allhosts.q@c8     1
1546357 0.55500 macrocycle mseierst r     06/02/2016 15:13:10 allhosts.q@c5     1
1546358 0.55500 macrocycle mseierst r     06/02/2016 15:13:22 allhosts.q@c3     1
1546359 0.55500 macrocycle mseierst r     06/02/2016 15:13:22 allhosts.q@c3     1
1546360 0.55500 macrocycle mseierst r     06/02/2016 15:13:22 allhosts.q@c8     1
1546361 0.55500 macrocycle mseierst r     06/02/2016 15:13:22 allhosts.q@c6     1
1546362 0.55500 macrocycle mseierst r     06/02/2016 15:13:28 allhosts.q@c5     1
1546363 0.55500 macrocycle mseierst r     06/02/2016 15:13:45 allhosts.q@c6     1
1546364 0.55500 macrocycle mseierst r     06/02/2016 15:13:55 allhosts.q@c8     1
1546365 0.55500 macrocycle mseierst r     06/02/2016 15:13:58 allhosts.q@c6     1
1546366 0.55500 macrocycle mseierst r     06/02/2016 15:14:04 allhosts.q@c3     1
1546367 0.55500 macrocycle mseierst r     06/02/2016 15:14:04 allhosts.q@c6     1
1546368 0.55500 macrocycle mseierst r     06/02/2016 15:14:55 allhosts.q@c8     1
1546369 0.55500 macrocycle mseierst r     06/02/2016 15:15:02 allhosts.q@c6     1
1546370 0.55500 macrocycle mseierst r     06/02/2016 15:15:05 allhosts.q@c3     1
1546371 0.55500 macrocycle mseierst r     06/02/2016 15:15:13 allhosts.q@c5     1
1546372 0.55500 macrocycle mseierst r     06/02/2016 15:15:13 allhosts.q@c5     1
1546373 0.55500 macrocycle mseierst r     06/02/2016 15:15:13 allhosts.q@c5     1
1546374 0.55500 macrocycle mseierst r     06/02/2016 15:15:13 allhosts.q@c8     1
1546375 0.55500 macrocycle mseierst r     06/02/2016 15:15:16 allhosts.q@c5     1
1546376 0.55500 macrocycle mseierst r     06/02/2016 15:15:36 allhosts.q@c6     1
1546377 0.55500 macrocycle mseierst r     06/02/2016 15:17:41 allhosts.q@c5     1 <--------------------- running
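(A quicker way to pick that one job out of the listing is a plain grep over the same qstat output:

qstat -u "*" | grep 1546377
)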
Messages on c1 (from the execd messages log):
06/02/2016 15:19:11| main|c1|E|can't create directory "active_jobs/1546377.1": No such file or directory
06/02/2016 15:19:11| main|c1|E|can't start job "1546377": can't create directory active_jobs/1546377.1: No such file or directory
06/02/2016 15:19:20| main|c1|E|received task belongs to job 1546377 but that job is not here
06/02/2016 15:19:20| main|c1|E|acknowledge for unknown job 1546377.1/master
06/02/2016 15:19:20| main|c1|E|can't find active jobs directory "active_jobs/1546377.1" for reaping job 1546377
06/02/2016 15:19:20| main|c1|E|unlink(jobs/00/0154/6377.1) failed: No such file or directory
06/02/2016 15:19:20| main|c1|E|can not remove file job spool file: jobs/00/0154/6377.1
06/02/2016 15:19:20| main|c1|E|can not remove file task spool file: No such file or directory
06/02/2016 15:19:20| main|c1|E|can not remove file task spool file: No such file or directory
06/02/2016 15:19:20| main|c1|E|can't remove directory "active_jobs/1546377.1": opendir(active_jobs/1546377.1) failed: No such file or directory
The active_jobs directory is there on c1 (and has 777 permissions):
[root@c1 active_jobs]# pwd
/opt/sge/default/spool/c1/active_jobs
[root@c1 active_jobs]#
[root@c1 c1]# ls -l
total 5980
drwxrwxrwx 32000 sgeadmin sgeadmin  999424 May 30 04:54 active_jobs  <------------------------- 777
-rw-r--r-- 1 sgeadmin sgeadmin 5 Jun 1 21:21 execd.pid
drwxr-xr-x 2 sgeadmin sgeadmin 4096 May 31 14:03 jobs
drwxr-xr-x 2 sgeadmin sgeadmin 4096 May 30 05:07 job_scripts
-rw-r--r-- 1 sgeadmin sgeadmin 5095417 Jun 2 15:19 messages
[root@c1 c1]#
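To double-check that new subdirectories can still be created there, I can count what's already in the directory and run a quick manual test (the mkdir.test name is just an example, and this assumes sgeadmin can run commands via su):

ls /opt/sge/default/spool/c1/active_jobs | wc -l
su sgeadmin -c 'mkdir /opt/sge/default/spool/c1/active_jobs/mkdir.test && rmdir /opt/sge/default/spool/c1/active_jobs/mkdir.test'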
When I stop the execd service on c1, this happens (these appear to be the last jobs that finished on this qinstance before it stopped working).
So SGE was able to write to this directory with no problems before...
[root@c1 c1]# service sgeexecd.LJ_SGE_ClusterA stop
Shutting down Grid Engine execution daemon
Shutting down Grid Engine shepherd of job 1197405.1
Shutting down Grid Engine shepherd of job 1197432.1
Shutting down Grid Engine shepherd of job 1197433.1
Shutting down Grid Engine shepherd of job 1197434.1
So I try to clear the qinstance, or the queue itself...
[root@rndusljpp2 ~]# qmod -c allhosts.q
Queue instance "allhosts.q@c6" is already in the specified state: no error
root@rndusljpp2 changed state of "allhosts.q@c1" (no error)
Queue instance "allhosts.q@c2" is already in the specified state: no error
Queue instance "allhosts.q@c4" is already in the specified state: no error
Queue instance "allhosts.q@c8" is already in the specified state: no error
Queue instance "allhosts.q@c5" is already in the specified state: no error
Queue instance "allhosts.q@c7" is already in the specified state: no error
Queue instance "allhosts.q@c3" is already in the specified state: no error
And the E comes right back after a few seconds...
So I am forced to disable the qinstance...
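For reference, the single-instance forms I'm using (standard qmod syntax):

qmod -c allhosts.q@c1   # clear the error state on just this instance
qmod -d allhosts.q@c1   # disable the instance so nothing new lands on it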
Any help would be appreciated.