Hello,

Thank you for your reply.

It seems to be running:

root@kosmos:~# /etc/init.d/slurm-llnl status
slurmctld (pid 6093) is running...
slurmd (pid 6221) is running...
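
In case it helps, a running pid does not always mean the controller is
answering requests. A quick responsiveness check, using the log path
from the slurm.conf quoted below, would be:

root@kosmos:~# scontrol ping
root@kosmos:~# tail -n 20 /var/log/slurm-llnl/slurmctld.log

scontrol ping reports whether the primary slurmctld actually responds,
and the log usually says why connections are being refused if it does
not.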



----- Original Message -----
> From: "Danny Auble" <[email protected]>
> To: "slurm-dev" <[email protected]>
> Sent: Tuesday, October 8, 2013 16:42:11
> Subject: [slurm-dev] Re: sinfo: error: slurm_receive_msg: Zero Bytes were 
> transmitted or received
> 
> It doesn't appear your slurmctld is running or responsive.
> 
> 
> [email protected] wrote:
> 
> 
> Hello,
> 
> I get the following error message when I try to use SLURM.
> 
> root@kosmos:~# sinfo
> sinfo: error: slurm_receive_msg: Zero Bytes were transmitted or
> received
> slurm_load_partitions: Zero Bytes were transmitted or received
> 
> 
> 
> Here is the output of the same command with an increased level of
> verbosity:
> 
> root@kosmos:~# sinfo -vv
> -----------------------------
> dead        = false
> exact       = 0
> filtering   = false
> format      = %9P %.5a %.10l %.6D %.6t %N
> iterate     = 0
> long        = false
> no_header   = false
> node_field  = false
> node_format = false
> nodes       = n/a
> part_field  = true
> partition   = n/a
> responding  = false
> states      = (null)
> sort        = (null)
> summarize   = false
> verbose     = 2
> -----------------------------
> all_flag        = false
> avail_flag      = true
> bg_flag         = false
> cpus_flag       = false
> default_time_flag = false
> disk_flag       = false
> features_flag   = false
> groups_flag     = false
> gres_flag       = false
> job_size_flag   = false
> max_time_flag   = true
> memory_flag     = false
> partition_flag  = true
> priority_flag   = false
> reason_flag     = false
> reason_timestamp_flag = false
> reason_user_flag = false
> reservation_flag = false
> root_flag       = false
> share_flag      = false
> state_flag      = true
> weight_flag     = false
> -----------------------------
> 
> sinfo: debug:  Reading slurm.conf file: /etc/slurm-llnl/slurm.conf
> Tue Oct  8 15:30:10 2013
> sinfo: auth plugin for Munge (http://code.google.com/p/munge/) loaded
> sinfo: debug:  _slurm_recv_timeout at 0 of 4, recv zero bytes
> sinfo: error: slurm_receive_msg: Zero Bytes were transmitted or
> received
> slurm_load_partitions: Zero Bytes were transmitted or received
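> 
> (For reference, one quick way to confirm that something is actually
> listening on the controller port, 6817 per the SlurmctldPort setting
> in the slurm.conf below, would be:
> 
> root@kosmos:~# netstat -tlnp | grep 6817
> 
> No output there would mean slurmctld is not accepting connections at
> all.)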
> 
> 
> 
> Everything seems fine with Munge:
> 
> root@kosmos:~# munge -n|ssh k01 unmunge
> STATUS:           Success (0)
> ENCODE_HOST:      kosmos ( 10.3.1.80 )
> ENCODE_TIME:      2013-10-08 15:30:48 (1381239048)
> DECODE_TIME:      2013-10-08 15:30:48 (1381239048)
> TTL:              300
> CIPHER:           aes128 (4)
> MAC:              sha1 (3)
> ZIP:              none (0)
> UID:              root (0)
> GID:              root (0)
> LENGTH:           0
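> 
> (The reverse direction can be checked the same way if needed, for
> example:
> 
> root@kosmos:~# ssh k01 munge -n | unmunge
> 
> since an authentication failure in either direction can produce the
> same "Zero Bytes" error.)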
> 
> 
> 
> Here is the slurm.conf:
> 
> root@kosmos:~# cat /etc/slurm-llnl/slurm.conf
> # slurm.conf file generated by configurator.html.
> # Put this file on all nodes of your cluster.
> # See the slurm.conf man page for more information.
> #
> ControlMachine=kosmos
> #ControlAddr=
> #BackupController=
> #BackupAddr=
> #
> #AuthType=auth/none
> AuthType=auth/munge
> CacheGroups=0
> #CheckpointType=checkpoint/none
> CryptoType=crypto/munge
> #CryptoType=crypto/openssl
> #DisableRootJobs=NO
> #EnforcePartLimits=NO
> #Epilog=
> #PrologSlurmctld=
> #FirstJobId=1
> #MaxJobId=999999
> #GresTypes=
> #GroupUpdateForce=0
> #GroupUpdateTime=600
> JobCheckpointDir=/var/lib/slurm-llnl/checkpoint
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> #JobCredentialPrivateKey=/home/slurm/ssl/id_rsa
> #JobCredentialPublicCertificate=/home/slurm/ssl/id_rsa.pub
> #JobFileAppend=0
> #JobRequeue=1
> #JobSubmitPlugins=1
> #KillOnBadExit=0
> #Licenses=foo*4,bar
> #MailProg=/usr/bin/mail
> #MaxJobCount=5000
> #MaxStepCount=40000
> #MaxTasksPerNode=128
> MpiDefault=none
> #MpiParams=ports=#-#
> #PluginDir=
> #PlugStackConfig=
> #PrivateData=jobs
> ProctrackType=proctrack/pgid
> #Prolog=
> #PrologSlurmctld=
> #PropagatePrioProcess=0
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> ReturnToService=1
> #SallocDefaultCommand=
> SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
> SlurmctldPort=6817
> SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid
> SlurmdPort=6818
> SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd
> SlurmUser=slurm
> #SrunEpilog=
> #SrunProlog=
> StateSaveLocation=/var/lib/slurm-llnl/slurmctld
> SwitchType=switch/none
> #TaskEpilog=
> TaskPlugin=task/none
> #TaskPluginParam=
> #TaskProlog=
> #TopologyPlugin=topology/tree
> #TmpFs=/tmp
> #TrackWCKey=no
> #TreeWidth=
> #UnkillableStepProgram=
> #UsePAM=0
> #
> #
> # TIMERS
> #BatchStartTimeout=10
> #CompleteWait=0
> #EpilogMsgTime=2000
> #GetEnvTimeout=2
> #HealthCheckInterval=0
> #HealthCheckProgram=
> InactiveLimit=0
> KillWait=30
> #MessageTimeout=10
> #ResvOverRun=0
> MinJobAge=300
> #OverTimeLimit=0
> SlurmctldTimeout=120
> SlurmdTimeout=300
> #UnkillableStepTimeout=60
> #VSizeFactor=0
> Waittime=0
> #
> #
> # SCHEDULING
> #DefMemPerCPU=0
> FastSchedule=1
> #MaxMemPerCPU=0
> #SchedulerRootFilter=1
> #SchedulerTimeSlice=30
> SchedulerType=sched/backfill
> SchedulerPort=7321
> SelectType=select/cons_res
> SelectTypeParameters=CR_Core_Memory
> #
> #
> # JOB PRIORITY
> #PriorityType=priority/basic
> #PriorityDecayHalfLife=
> #PriorityCalcPeriod=
> #PriorityFavorSmall=
> #PriorityMaxAge=
> #PriorityUsageResetPeriod=
> #PriorityWeightAge=
> #PriorityWeightFairshare=
> #PriorityWeightJobSize=
> #PriorityWeightPartition=
> #PriorityWeightQOS=
> #
> #
> # LOGGING AND ACCOUNTING
> #AccountingStorageEnforce=0
> #AccountingStorageHost=
> AccountingStorageLoc=/var/log/slurm/accounting.txt
> #AccountingStoragePass=
> #AccountingStoragePort=
> AccountingStorageType=accounting_storage/filetxt
> #AccountingStorageUser=
> AccountingStoreJobComment=YES
> ClusterName=cluster
> #DebugFlags=
> #JobCompHost=
> JobCompLoc=/var/log/slurm/slurm.log
> #JobCompPass=
> #JobCompPort=
> JobCompType=jobcomp/filetxt
> #JobCompUser=
> JobAcctGatherFrequency=30
> JobAcctGatherType=jobacct_gather/linux
> SlurmctldDebug=3
> SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
> SlurmdDebug=3
> SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
> #SlurmSchedLogFile=
> #SlurmSchedLogLevel=
> #
> #
> # POWER SAVE SUPPORT FOR IDLE NODES (optional)
> #SuspendProgram=
> #ResumeProgram=
> #SuspendTimeout=
> #ResumeTimeout=
> #ResumeRate=
> #SuspendExcNodes=
> #SuspendExcParts=
> #SuspendRate=
> #SuspendTime=
> #
> #
> # COMPUTE NODES
> NodeName=k0[1-3] CPUs=32 RealMemory=129186 Sockets=2 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN
> PartitionName=uag Nodes=k0[1-3] Default=YES MaxTime=INFINITE State=UP
> 
> NodeName=kosmos CPUs=32 RealMemory=129186 Sockets=2 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN
> PartitionName=uag Nodes=kosmos Default=YES MaxTime=INFINITE State=UP
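> 
> (If it is useful, the configuration the running daemons actually
> loaded can be compared against this file with, for example:
> 
> root@kosmos:~# scontrol show config | grep -i port
> 
> to rule out a stale or mismatched slurm.conf.)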
> 
> 
> 
> NTP is running on all the nodes and the clocks are in sync.
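> 
> (This can be verified with, for example:
> 
> root@kosmos:~# ntpq -p
> 
> on each node.)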
> 
> Thank you for your help!
> 
> Best regards,
> 
> Philippe
> 
