Hello,
I see the following behavior on a cluster running SLURM 2.5.7.
[root@e conf.d]# scontrol update jobid=371 name=test
[root@e conf.d]# squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
371 defq test cmsuppor PD 0:00 1 (JobHeldUser)
[root@e conf.d]# scontrol update jobid=371 starttime=2014-01-01T00:00:01
[root@e conf.d]# squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
371 defq test cmsuppor PD 0:00 1 (JobHeldAdmin)
Note the "JobHeldAdmin" reason after changing the start time.
The scheduler used is the backfill scheduler.
srun -H sleep 120
squeue: 6 com1 sleep cmsuppor PD 0:00 1
(JobHeldUser)
scontrol update jobid=6 starttime=2014-01-01T00:00:01
squeue: 6 com1 sleep cmsuppor PD 0:00 1
(JobHeldAdmin)
here is the corresponding log (debug level 9):
[2013-12-03T20:34:03+01:00] debug2: sched: Processing RPC:
REQUEST_RESOURCE_ALLOCATION from uid=1000
[2013-12-03T20:34:03+01:00] debug3: JobDesc: user_id=1000 job_id=-1
partition=(null) name=sleep
[2013-12-03T20:34:03+01:00] debug3: cpus=1-4294967294 pn_min_cpus=-1
[2013-12-03T20:34:03+01:00] debug3: -N min-[max]:
1-[4294967294]:65534:65534:
65534
[2013-12-03T20:34:03+01:00] debug3: pn_min_memory_job=-1
pn_min_tmp_disk=-1
[2013-12-03T20:34:03+01:00] debug3: immediate=0 features=(null)
reservation=(null)
[2013-12-03T20:34:03+01:00] debug3: req_nodes=(null) exc_nodes=(null)
gres=(null)
[2013-12-03T20:34:03+01:00] debug3: time_limit=-1--1 priority=0
contiguous=0 shared=-1
[2013-12-03T20:34:03+01:00] debug3: kill_on_node_fail=-1 script=(null)
[2013-12-03T20:34:03+01:00] debug3: argv="/bin/sleep"
[2013-12-03T20:34:03+01:00] debug3: stdin=(null) stdout=(null)
stderr=(null)
[2013-12-03T20:34:03+01:00] debug3: work_dir=/home/cmsupport
alloc_node:sid=b61a:4124
[2013-12-03T20:34:03+01:00] debug3: resp_host=10.141.255.254
alloc_resp_port=49527 other_port=38020
[2013-12-03T20:34:03+01:00] debug3: dependency=(null) account=(null)
qos=(null) comment=(null)
[2013-12-03T20:34:03+01:00] debug3: mail_type=0 mail_user=(null)
nice=55534 num_tasks=4294967294 open_mode=0 overcommit=-1 acctg_freq=-1
[2013-12-03T20:34:03+01:00] debug3: network=(null) begin=Unknown
cpus_per_task=-1 requeue=-1 licenses=(null)
[2013-12-03T20:34:03+01:00] debug3: end_time=Unknown
signal=0@0wait_all_nodes=-1
[2013-12-03T20:34:03+01:00] debug3: ntasks_per_node=-1
ntasks_per_socket=-1 ntasks_per_core=-1
[2013-12-03T20:34:03+01:00] debug3: cpus_bind=65534:(null)
mem_bind=65534:(null) plane_size:65534
[2013-12-03T20:34:03+01:00] debug3: User (null)(1000) doesn't have a
default account
[2013-12-03T20:34:03+01:00] debug3: User (null)(1000) doesn't have a
default account
[2013-12-03T20:34:03+01:00] debug3: found correct qos
[2013-12-03T20:34:03+01:00] sched: _slurm_rpc_allocate_resources JobId=6
NodeList=(null) usec=2287
[2013-12-03T20:34:03+01:00] debug3: Writing job id 6 to header record of
job_state file
[2013-12-03T20:34:03+01:00] debug: Spawning ping agent for n001
[2013-12-03T20:34:03+01:00] debug2: Spawning RPC agent for msg_type 1008
[2013-12-03T20:34:03+01:00] debug2: got 1 threads to send out
[2013-12-03T20:34:03+01:00] debug2: Tree head got back 0 looking for 1
[2013-12-03T20:34:03+01:00] debug3: Tree sending to n001
[2013-12-03T20:34:03+01:00] debug4: orig_timeout was 10000 we have 0 steps
and a timeout of 10000
[2013-12-03T20:34:03+01:00] debug2: Tree head got back 1
[2013-12-03T20:34:03+01:00] debug2: Tree head got them all
[2013-12-03T20:34:03+01:00] debug2: node_did_resp n001
[2013-12-03T20:34:04+01:00] debug3: Processing RPC: REQUEST_JOB_INFO from
uid=0
[2013-12-03T20:34:05+01:00] debug: Spawning ping agent for n001
[2013-12-03T20:34:05+01:00] debug2: Spawning RPC agent for msg_type 1008
[2013-12-03T20:34:05+01:00] debug2: got 1 threads to send out
[2013-12-03T20:34:05+01:00] debug2: Tree head got back 0 looking for 1
[2013-12-03T20:34:05+01:00] debug3: Tree sending to n001
[2013-12-03T20:34:05+01:00] debug4: orig_timeout was 10000 we have 0 steps
and a timeout of 10000
[2013-12-03T20:34:05+01:00] debug2: Tree head got back 1
[2013-12-03T20:34:05+01:00] debug2: Tree head got them all
[2013-12-03T20:34:05+01:00] debug2: node_did_resp n001
[2013-12-03T20:34:06+01:00] debug3: sched: JobId=6. State=PENDING.
Reason=JobHeldUser. Priority=0.
[2013-12-03T20:34:06+01:00] debug: backfill: no jobs to backfill
[2013-12-03T20:34:06+01:00] debug2: Testing job time limits and checkpoints
[2013-12-03T20:34:06+01:00] debug2: Performing purge of old job records
[2013-12-03T20:34:06+01:00] debug: sched: Running job scheduler
*[2013-12-03T20:34:06+01:00] debug3: sched: JobId=6. State=PENDING.
Reason=JobHeldUser. Priority=0.*
[2013-12-03T20:34:06+01:00] debug2: Processing RPC: REQUEST_UPDATE_JOB from
uid=1000
[2013-12-03T20:34:06+01:00] debug3: JobDesc: user_id=1000 job_id=6
partition=(null) name=(null)
[2013-12-03T20:34:06+01:00] debug3: cpus=-1-4294967294 pn_min_cpus=-1
[2013-12-03T20:34:06+01:00] debug3: -N min-[max]:
4294967294-[4294967294]:65534:65534:65534
[2013-12-03T20:34:06+01:00] debug3: pn_min_memory_job=-1
pn_min_tmp_disk=-1
[2013-12-03T20:34:06+01:00] debug3: immediate=0 features=(null)
reservation=(null)
[2013-12-03T20:34:06+01:00] debug3: req_nodes=(null) exc_nodes=(null)
gres=(null)
[2013-12-03T20:34:06+01:00] debug3: time_limit=-1--1 priority=-1
contiguous=-1 shared=-1
[2013-12-03T20:34:06+01:00] debug3: kill_on_node_fail=-1 script=(null)
[2013-12-03T20:34:06+01:00] debug3: stdin=(null) stdout=(null)
stderr=(null)
[2013-12-03T20:34:06+01:00] debug3: work_dir=(null)
alloc_node:sid=(null):4294967294
[2013-12-03T20:34:06+01:00] debug3: resp_host=(null) alloc_resp_port=0
other_port=0
[2013-12-03T20:34:06+01:00] debug3: dependency=(null) account=(null)
qos=(null) comment=(null)
[2013-12-03T20:34:06+01:00] debug3: mail_type=0 mail_user=(null)
nice=55534 num_tasks=4294967294 open_mode=0 overcommit=-1 acctg_freq=-1
[2013-12-03T20:34:06+01:00] debug3: network=(null)
begin=2014-01-01T00:00:01 cpus_per_task=-1 requeue=-1 licenses=(null)
[2013-12-03T20:34:06+01:00] debug3: end_time=Unknown
signal=0@0wait_all_nodes=-1
[2013-12-03T20:34:06+01:00] debug3: ntasks_per_node=-1
ntasks_per_socket=-1 ntasks_per_core=-1
[2013-12-03T20:34:06+01:00] debug3: cpus_bind=65534:(null)
mem_bind=65534:(null) plane_size:65534
[2013-12-03T20:34:06+01:00] debug3: update before alteration asking for
nodes 4294967294-4294967294 cpus 4294967294-4294967294
[2013-12-03T20:34:06+01:00] debug3: update after alteration asking for
nodes 4294967294-4294967294 cpus 4294967294-4294967294
[2013-12-03T20:34:06+01:00] sched: update_job: setting begin to
2014-01-01T00:00:01 for job_id 6
[2013-12-03T20:34:06+01:00] updating accounting
[2013-12-03T20:34:06+01:00] _slurm_rpc_update_job complete JobId=6 uid=1000
usec=1656
[2013-12-03T20:34:06+01:00] debug: sched: Running job scheduler
*[2013-12-03T20:34:06+01:00] debug3: sched: JobId=6. State=PENDING.
Reason=JobHeldAdmin. Priority=0.*
[2013-12-03T20:34:06+01:00] debug3: Writing job id 6 to header record of
job_state file
[2013-12-03T20:34:07+01:00] debug: Spawning ping agent for n001
[2013-12-03T20:34:07+01:00] debug2: Spawning RPC agent for msg_type 1008
[2013-12-03T20:34:07+01:00] debug2: got 1 threads to send out
[2013-12-03T20:34:07+01:00] debug2: Tree head got back 0 looking for 1
I was wondering whether this behavior is expected or whether it is actually a bug.
Best Regards,
Panos Labropoulos