Hello!

We have Slurm 2.5.4 installed on SL6. Only one partition is defined; the Slurm configuration is attached below.

We have a bunch of jobs in pending state:
# sacct --state=pending | wc
   1614   11298  125892
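
In case it helps, the pending jobs can also be broken down by reason with squeue (a sketch of the command only; we have not pasted its output):

# squeue --state=PENDING --noheader --format="%r" | sort | uniq -c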

Although several nodes are idle, they do not accept new jobs:
# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
all*         up 14-00:00:0     65    mix wn[001,003-020,022-031,033-036,038-045,052-053,056-057,062-066,068-069],wnib[001-013]
all*         up 14-00:00:0     10   idle wn[046-051,054-055,058-059]
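
If it matters, the features defined on these idle nodes can be listed like this (command sketch only; node list copied from the sinfo output above):

# sinfo --nodes="wn[046-051,054-055,058-059]" --format="%N %f"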


For example, this job is in the pending state:
# scontrol show job 3155750
JobId=3155750 Name=test
   UserId=gen009(25009) GroupId=gridgen(25000)
   Priority=53477 Account=gridgen QOS=normal
   JobState=PENDING Reason=Priority Dependency=(null)
   Requeue=0 Restarts=0 BatchFlag=1 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=16:40:00 TimeMin=N/A
   SubmitTime=2013-04-16T09:40:03 EligibleTime=2013-04-16T09:40:03
   StartTime=Unknown EndTime=Unknown
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   Partition=all AllocNode:Sid=jost:10672
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1-2000 NumCPUs=1 CPUs/Task=1 ReqS:C:T=*:*:*
   MinCPUsNode=1 MinMemoryCPU=2000M MinTmpDiskNode=0
   Features=(null) Gres=(null) Reservation=(null)
   Shared=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=/tmp/SLURM_job_script.nFVWYS
   WorkDir=/d04/session/T3kKDmrfFmhnmmR0Xox1SiGmABFKDmABFKDmJDLKDmABFKDmE8SBRn
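
For completeness, the individual priority factors for this job can be shown with sprio (we have not included that output here):

# sprio --long --jobs=3155750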

It seems that all the jobs are scheduled on a FIFO basis. Perhaps I should mention that the jobs that were submitted to the cluster first have a "feature" specification, i.e. they were submitted with sbatch -C (--constraint).
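
Those earlier jobs were submitted roughly like this; the feature name and script name are placeholders, not the real ones:

# sbatch -C somefeature job.sh   # "somefeature" and job.sh are placeholders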

Any help would be appreciated.

Thanks,
Barbara

Configuration data as of 2013-04-16T14:04:56
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = associations,limits
AccountingStorageHost   = hostname
AccountingStorageLoc    = slurm_db
AccountingStoragePort   = 3306
AccountingStorageType   = accounting_storage/mysql
AccountingStorageUser   = slurm
AccountingStoreJobComment = NO
AcctGatherEnergyType    = acct_gather_energy/none
AcctGatherNodeFreq      = 0 sec
AuthType                = auth/munge
BackupAddr              = (null)
BackupController        = (null)
BatchStartTimeout       = 10 sec
BOOT_TIME               = 2013-04-16T14:04:06
CacheGroups             = 0
CheckpointType          = checkpoint/none
ClusterName             = arnes
CompleteWait            = 0 sec
ControlAddr             = IP
ControlMachine          = hostname
CryptoType              = crypto/munge
DebugFlags              = (null)
DefMemPerNode           = UNLIMITED
DisableRootJobs         = NO
EnforcePartLimits       = NO
Epilog                  = /etc/slurm/epilog
EpilogMsgTime           = 10000 usec
EpilogSlurmctld         = (null)
FastSchedule            = 2
FirstJobId              = 2300000
GetEnvTimeout           = 2 sec
GresTypes               = (null)
GroupUpdateForce        = 0
GroupUpdateTime         = 600 sec
HASH_VAL                = Match
HealthCheckInterval     = 30 sec
HealthCheckProgram      = /etc/slurm/healthcheck
InactiveLimit           = 0 sec
JobAcctGatherFrequency  = 0 sec
JobAcctGatherType       = jobacct_gather/linux
JobCheckpointDir        = /var/slurm/checkpoint
JobCompHost             = localhost
JobCompLoc              = /var/log/slurm_jobcomp.log
JobCompPort             = 0
JobCompType             = jobcomp/none
JobCompUser             = root
JobCredentialPrivateKey = (null)
JobCredentialPublicCertificate = (null)
JobFileAppend           = 0
JobRequeue              = 1
JobSubmitPlugins        = (null)
KillOnBadExit           = 0
KillWait                = 30 sec
LaunchType              = launch/slurm
Licenses                = (null)
LicensesUsed            = (null)
MailProg                = /bin/mail
MaxJobCount             = 20000
MaxJobId                = 4294901760
MaxMemPerNode           = UNLIMITED
MaxStepCount            = 40000
MaxTasksPerNode         = 128
MessageTimeout          = 60 sec
MinJobAge               = 300 sec
MpiDefault              = openmpi
MpiParams               = ports=12000-12500
NEXT_JOB_ID             = 3158442
OverTimeLimit           = 0 min
PluginDir               = /usr/lib64/slurm
PlugStackConfig         = /etc/slurm/plugstack.conf
PreemptMode             = OFF
PreemptType             = preempt/none
PriorityDecayHalfLife   = 7-00:00:00
PriorityCalcPeriod      = 00:05:00
PriorityFavorSmall      = 0
PriorityFlags           = 0
PriorityMaxAge          = 7-00:00:00
PriorityUsageResetPeriod = NONE
PriorityType            = priority/multifactor
PriorityWeightAge       = 10000
PriorityWeightFairShare = 100000
PriorityWeightJobSize   = 1000
PriorityWeightPartition = 0
PriorityWeightQOS       = 0
PrivateData             = none
ProctrackType           = proctrack/cgroup
Prolog                  = /etc/slurm/prolog
PrologSlurmctld         = (null)
PropagatePrioProcess    = 0
PropagateResourceLimits = (null)
PropagateResourceLimitsExcept = MEMLOCK
RebootProgram           = (null)
ReconfigFlags           = (null)
ResumeProgram           = (null)
ResumeRate              = 300 nodes/min
ResumeTimeout           = 60 sec
ResvOverRun             = 0 min
ReturnToService         = 0
SallocDefaultCommand    = (null)
SchedulerParameters     = bf_max_job_user=10,bf_interval=10,max_job_bf=500,bf_resolution=3600,default_queue_depth=300
SchedulerPort           = 7321
SchedulerRootFilter     = 1
SchedulerTimeSlice      = 30 sec
SchedulerType           = sched/backfill
SelectType              = select/cons_res
SelectTypeParameters    = CR_CORE_MEMORY
SlurmUser               = slurm(106)
SlurmctldDebug          = info
SlurmctldLogFile        = /var/log/slurm/slurmctld.log
SlurmSchedLogFile       = (null)
SlurmctldPort           = 6817
SlurmctldTimeout        = 300 sec
SlurmdDebug             = info
SlurmdLogFile           = /var/log/slurm/slurmd.%n.log.%h
SlurmdPidFile           = /var/run/slurmd.%n.pid
SlurmdPort              = 6818
SlurmdSpoolDir          = /var/spool/slurm
SlurmdTimeout           = 1000 sec
SlurmdUser              = root(0)
SlurmSchedLogLevel      = 0
SlurmctldPidFile        = /var/run/slurmctld.pid
SLURM_CONF              = /etc/slurm/slurm.conf
SLURM_VERSION           = 2.5.4
SrunEpilog              = (null)
SrunProlog              = (null)
StateSaveLocation       = /var/spool/slurmstate
SuspendExcNodes         = (null)
SuspendExcParts         = (null)
SuspendProgram          = (null)
SuspendRate             = 60 nodes/min
SuspendTime             = NONE
SuspendTimeout          = 30 sec
SwitchType              = switch/none
TaskEpilog              = (null)
TaskPlugin              = task/cgroup
TaskPluginParam         = (null type)
TaskProlog              = /etc/slurm/taskprolog
TmpFS                   = /tmp
TopologyPlugin          = topology/none
TrackWCKey              = 0
TreeWidth               = 50
UsePam                  = 1
UnkillableStepProgram   = (null)
UnkillableStepTimeout   = 60 sec
VSizeFactor             = 200 percent
WaitTime                = 0 sec
