It doesn't appear that your slurmctld is running or responsive. "Zero Bytes were transmitted or received" generally means sinfo's connection to the controller (SlurmctldPort=6817 in your config) was closed before any reply came back, which points at slurmctld being down, hung, or unable to answer. I'd check the controller on kosmos first; a few things to try are sketched below.
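A minimal checklist, assuming the Debian-style slurm-llnl packaging your paths suggest (adjust service and log names if your install differs):

root@kosmos:~# scontrol ping                 # does the ControlMachine answer at all?
root@kosmos:~# ps -ef | grep [s]lurmctld     # is a controller process alive?
root@kosmos:~# netstat -tlnp | grep 6817     # is anything listening on SlurmctldPort?
root@kosmos:~# tail -n 50 /var/log/slurm-llnl/slurmctld.log
root@kosmos:~# slurmctld -D -vvvv            # run in the foreground with verbose logging

If slurmctld refuses to start, the foreground run will usually say why. With your config, the usual suspects are /var/run/slurm-llnl or /var/lib/slurm-llnl/slurmctld missing or not writable by SlurmUser=slurm, or the slurm user not existing on the controller.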

[email protected] wrote:
>
>Hello,
>
>I obtain the following error message when I try to use SLURM.
>
>root@kosmos:~# sinfo
>sinfo: error: slurm_receive_msg: Zero Bytes were transmitted or received
>slurm_load_partitions: Zero Bytes were transmitted or received
>
>
>
>Here is the output of the same command with an increased level of verbosity:
>
>root@kosmos:~# sinfo -vv
>-----------------------------
>dead        = false
>exact       = 0
>filtering   = false
>format      = %9P %.5a %.10l %.6D %.6t %N
>iterate     = 0
>long        = false
>no_header   = false
>node_field  = false
>node_format = false
>nodes       = n/a
>part_field  = true
>partition   = n/a
>responding  = false
>states      = (null)
>sort        = (null)
>summarize   = false
>verbose     = 2
>-----------------------------
>all_flag        = false
>avail_flag      = true
>bg_flag         = false
>cpus_flag       = false
>default_time_flag = false
>disk_flag       = false
>features_flag   = false
>groups_flag     = false
>gres_flag       = false
>job_size_flag   = false
>max_time_flag   = true
>memory_flag     = false
>partition_flag  = true
>priority_flag   = false
>reason_flag     = false
>reason_timestamp_flag = false
>reason_user_flag = false
>reservation_flag = false
>root_flag       = false
>share_flag      = false
>state_flag      = true
>weight_flag     = false
>-----------------------------
>
>sinfo: debug:  Reading slurm.conf file: /etc/slurm-llnl/slurm.conf
>Tue Oct  8 15:30:10 2013
>sinfo: auth plugin for Munge (http://code.google.com/p/munge/) loaded
>sinfo: debug:  _slurm_recv_timeout at 0 of 4, recv zero bytes
>sinfo: error: slurm_receive_msg: Zero Bytes were transmitted or received
>slurm_load_partitions: Zero Bytes were transmitted or received
>
>
>
>Everything seems fine with Munge:
>
>root@kosmos:~# munge -n|ssh k01 unmunge
>STATUS:           Success (0)
>ENCODE_HOST:      kosmos (10.3.1.80)
>ENCODE_TIME:      2013-10-08 15:30:48 (1381239048)
>DECODE_TIME:      2013-10-08 15:30:48 (1381239048)
>TTL:              300
>CIPHER:           aes128 (4)
>MAC:              sha1 (3)
>ZIP:              none (0)
>UID:              root (0)
>GID:              root (0)
>LENGTH:           0
>
>
>
>Here is the slurm.conf:
>
>root@kosmos:~# cat /etc/slurm-llnl/slurm.conf
># slurm.conf file generated by configurator.html.
># Put this file on all nodes of your cluster.
># See the slurm.conf man page for more information.
>#
>ControlMachine=kosmos
>#ControlAddr=
>#BackupController=
>#BackupAddr=
>#
>#AuthType=auth/none
>AuthType=auth/munge
>CacheGroups=0
>#CheckpointType=checkpoint/none
>CryptoType=crypto/munge
>#CryptoType=crypto/openssl
>#DisableRootJobs=NO
>#EnforcePartLimits=NO
>#Epilog=
>#PrologSlurmctld=
>#FirstJobId=1
>#MaxJobId=999999
>#GresTypes=
>#GroupUpdateForce=0
>#GroupUpdateTime=600
>JobCheckpointDir=/var/lib/slurm-llnl/checkpoint
>#JobCredentialPrivateKey=
>#JobCredentialPublicCertificate=
>#JobCredentialPrivateKey=/home/slurm/ssl/id_rsa
>#JobCredentialPublicCertificate=/home/slurm/ssl/id_rsa.pub
>#JobFileAppend=0
>#JobRequeue=1
>#JobSubmitPlugins=1
>#KillOnBadExit=0
>#Licenses=foo*4,bar
>#MailProg=/usr/bin/mail
>#MaxJobCount=5000
>#MaxStepCount=40000
>#MaxTasksPerNode=128
>MpiDefault=none
>#MpiParams=ports=#-#
>#PluginDir=
>#PlugStackConfig=
>#PrivateData=jobs
>ProctrackType=proctrack/pgid
>#Prolog=
>#PrologSlurmctld=
>#PropagatePrioProcess=0
>#PropagateResourceLimits=
>#PropagateResourceLimitsExcept=
>ReturnToService=1
>#SallocDefaultCommand=
>SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
>SlurmctldPort=6817
>SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid
>SlurmdPort=6818
>SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd
>SlurmUser=slurm
>#SrunEpilog=
>#SrunProlog=
>StateSaveLocation=/var/lib/slurm-llnl/slurmctld
>SwitchType=switch/none
>#TaskEpilog=
>TaskPlugin=task/none
>#TaskPluginParam=
>#TaskProlog=
>#TopologyPlugin=topology/tree
>#TmpFs=/tmp
>#TrackWCKey=no
>#TreeWidth=
>#UnkillableStepProgram=
>#UsePAM=0
>#
>#
># TIMERS
>#BatchStartTimeout=10
>#CompleteWait=0
>#EpilogMsgTime=2000
>#GetEnvTimeout=2
>#HealthCheckInterval=0
>#HealthCheckProgram=
>InactiveLimit=0
>KillWait=30
>#MessageTimeout=10
>#ResvOverRun=0
>MinJobAge=300
>#OverTimeLimit=0
>SlurmctldTimeout=120
>SlurmdTimeout=300
>#UnkillableStepTimeout=60
>#VSizeFactor=0
>Waittime=0
>#
>#
># SCHEDULING
>#DefMemPerCPU=0
>FastSchedule=1
>#MaxMemPerCPU=0
>#SchedulerRootFilter=1
>#SchedulerTimeSlice=30
>SchedulerType=sched/backfill
>SchedulerPort=7321
>SelectType=select/cons_res
>SelectTypeParameters=CR_Core_Memory
>#
>#
># JOB PRIORITY
>#PriorityType=priority/basic
>#PriorityDecayHalfLife=
>#PriorityCalcPeriod=
>#PriorityFavorSmall=
>#PriorityMaxAge=
>#PriorityUsageResetPeriod=
>#PriorityWeightAge=
>#PriorityWeightFairshare=
>#PriorityWeightJobSize=
>#PriorityWeightPartition=
>#PriorityWeightQOS=
>#
>#
># LOGGING AND ACCOUNTING
>#AccountingStorageEnforce=0
>#AccountingStorageHost=
>AccountingStorageLoc=/var/log/slurm/accounting.txt
>#AccountingStoragePass=
>#AccountingStoragePort=
>AccountingStorageType=accounting_storage/filetxt
>#AccountingStorageUser=
>AccountingStoreJobComment=YES
>ClusterName=cluster
>#DebugFlags=
>#JobCompHost=
>JobCompLoc=/var/log/slurm/slurm.log
>#JobCompPass=
>#JobCompPort=
>JobCompType=jobcomp/filetxt
>#JobCompUser=
>JobAcctGatherFrequency=30
>JobAcctGatherType=jobacct_gather/linux
>SlurmctldDebug=3
>SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
>SlurmdDebug=3
>SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
>#SlurmSchedLogFile=
>#SlurmSchedLogLevel=
>#
>#
># POWER SAVE SUPPORT FOR IDLE NODES (optional)
>#SuspendProgram=
>#ResumeProgram=
>#SuspendTimeout=
>#ResumeTimeout=
>#ResumeRate=
>#SuspendExcNodes=
>#SuspendExcParts=
>#SuspendRate=
>#SuspendTime=
>#
>#
># COMPUTE NODES
>NodeName=k0[1-3] CPUs=32 RealMemory=129186 Sockets=2 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN
>PartitionName=uag Nodes=k0[1-3] Default=YES MaxTime=INFINITE State=UP
>
>NodeName=kosmos CPUs=32 RealMemory=129186 Sockets=2 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN
>PartitionName=uag Nodes=kosmos Default=YES MaxTime=INFINITE State=UP
>
>
>
>NTP is running on all the nodes and the clocks are in sync.
>
>Thank you for your help!
>
>Best regards,
>
>Philippe
