And port 6817 as well.
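
For a quick check that both daemons are actually listening (assuming the ss tool from iproute2 is available):

  # 6817 = slurmctld, 6818 = slurmd; no output for a port means
  # the corresponding daemon is not listening on it
  ss -tln | grep -E ':(6817|6818)'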

On 09.10.2014 21:04, Monica Marathe wrote:
> Hey Michael,
> I did build my configuration file:
> 
> # slurm.conf file generated by configurator.html.
> # Put this file on all nodes of your cluster.
> # See the slurm.conf man page for more information.
> #
> ControlMachine=control-machine
> #ControlAddr=
> #BackupController=
> #BackupAddr=
> #
> AuthType=auth/munge
> CacheGroups=0
> #CheckpointType=checkpoint/none
> CryptoType=crypto/munge
> #DisableRootJobs=NO
> #EnforcePartLimits=NO
> #Epilog=
> #EpilogSlurmctld=
> #FirstJobId=1
> #MaxJobId=999999
> #GresTypes=
> #GroupUpdateForce=0
> #GroupUpdateTime=600
> #JobCheckpointDir=/var/slurm/checkpoint
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> #JobFileAppend=0
> #JobRequeue=1
> #JobSubmitPlugins=1
> #KillOnBadExit=0
> #Licenses=foo*4,bar
> #MailProg=/bin/mail
> #MaxJobCount=5000
> #MaxStepCount=40000
> #MaxTasksPerNode=128
> MpiDefault=none
> #MpiParams=ports=#-#
> #PluginDir=
> #PlugStackConfig=
> #PrivateData=jobs
> ProctrackType=proctrack/pgid
> #Prolog=
> #PrologSlurmctld=
> #PropagatePrioProcess=0
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> ReturnToService=1
> #SallocDefaultCommand=
> SlurmctldPidFile=/var/run/slurmctld.pid
> SlurmctldPort=6817
> SlurmdPidFile=/var/run/slurmd.pid
> SlurmdPort=6818
> SlurmdSpoolDir=/tmp/slurmd
> SlurmUser=slurm
> #SlurmdUser=root
> #SrunEpilog=
> #SrunProlog=
> StateSaveLocation=/tmp
> SwitchType=switch/none
> #TaskEpilog=
> TaskPlugin=task/none
> #TaskPluginParam=
> #TaskProlog=
> #TopologyPlugin=topology/tree
> #TmpFs=/tmp
> #TrackWCKey=no
> #TreeWidth=
> #UnkillableStepProgram=
> #UsePAM=0
> #
> #
> # TIMERS
> #BatchStartTimeout=10
> #CompleteWait=0
> #EpilogMsgTime=2000
> #GetEnvTimeout=2
> #HealthCheckInterval=0
> #HealthCheckProgram=
> InactiveLimit=0
> KillWait=30
> #MessageTimeout=10
> #ResvOverRun=0
> MinJobAge=300
> #OverTimeLimit=0
> SlurmctldTimeout=120
> SlurmdTimeout=300
> #UnkillableStepTimeout=60
> #VSizeFactor=0
> Waittime=0
> #
> #
> # SCHEDULING
> #DefMemPerCPU=0
> FastSchedule=1
> #MaxMemPerCPU=0
> #SchedulerRootFilter=1
> #SchedulerTimeSlice=30
> SchedulerType=sched/backfill
> SchedulerPort=7321
> SelectType=select/linear
> #SelectTypeParameters=
> #
> #
> # JOB PRIORITY
> #PriorityType=priority/basic
> #PriorityDecayHalfLife=
> #PriorityCalcPeriod=
> #PriorityFavorSmall=
> #PriorityMaxAge=
> #PriorityUsageResetPeriod=
> #PriorityWeightAge=
> #PriorityWeightFairshare=
> #PriorityWeightJobSize=
> #PriorityWeightPartition=
> #PriorityWeightQOS=
> #
> #
> # LOGGING AND ACCOUNTING
> #AccountingStorageEnforce=0
> #AccountingStorageHost=
> #AccountingStorageLoc=
> #AccountingStoragePass=
> #AccountingStoragePort=
> AccountingStorageType=accounting_storage/none
> #AccountingStorageUser=
> AccountingStoreJobComment=YES
> ClusterName=cluster
> #DebugFlags=
> #JobCompHost=
> #JobCompLoc=
> #JobCompPass=
> #JobCompPort=
> JobCompType=jobcomp/none
> #JobCompUser=
> JobAcctGatherFrequency=30
> JobAcctGatherType=jobacct_gather/none
> SlurmctldDebug=3
> #SlurmctldLogFile=
> SlurmdDebug=3
> #SlurmdLogFile=
> #SlurmSchedLogFile=
> #SlurmSchedLogLevel=
> #
> #
> # POWER SAVE SUPPORT FOR IDLE NODES (optional)
> #SuspendProgram=
> #ResumeProgram=
> #SuspendTimeout=
> #ResumeTimeout=
> #ResumeRate=
> #SuspendExcNodes=
> #SuspendExcParts=
> #SuspendRate=
> #SuspendTime=
> #
> #
> # COMPUTE NODES
> NodeName=control-machine CPUs=1 State=UNKNOWN
> PartitionName=debug Nodes=control-machine Default=YES MaxTime=INFINITE
> State=UP
> 
> But I get the following errors when I run slurmctld:
> 
> slurmctld: error: Configured MailProg is invalid
> 
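The MailProg message just means the default /bin/mail does not exist on your machine; it is harmless for testing. To silence it, install a mail client (mailx/mailutils, package names vary by distribution) or point MailProg at a binary that does exist, e.g.:

  MailProg=/usr/bin/mail   # adjust to wherever a mail binary actually lives
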
> slurmctld: error: ################################################
> slurmctld: error: ###       SEVERE SECURITY VULERABILTY        ###
> slurmctld: error: ### StateSaveLocation DIRECTORY IS WORLD WRITABLE ###
> slurmctld: error: ###         CORRECT FILE PERMISSIONS         ###
> slurmctld: error: ################################################
> slurmctld: error: Could not open node state file /tmp/node_state: No
> such file or directory
> slurmctld: error: NOTE: Trying backup state save file. Information may
> be lost!
> slurmctld: No node state file (/tmp/node_state.old) to recover
> slurmctld: error: Incomplete node data checkpoint file
> slurmctld: Recovered state of 0 nodes
> slurmctld: error: Could not open job state file /tmp/job_state: No such
> file or directory
> slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
> slurmctld: No job state file (/tmp/job_state.old) to recover
> slurmctld: debug:  Updating partition uid access list
> slurmctld: error: Could not open reservation state file /tmp/resv_state:
> No such file or directory
> slurmctld: error: NOTE: Trying backup state save file. Reservations may
> be lost
> slurmctld: No reservation state file (/tmp/resv_state.old) to recover
> slurmctld: Recovered state of 0 reservations
> slurmctld: error: Could not open trigger state file /tmp/trigger_state:
> No such file or directory
> slurmctld: error: NOTE: Trying backup state save file. Triggers may be lost!
> slurmctld: No trigger state file (/tmp/trigger_state.old) to recover
> slurmctld: error: Incomplete trigger data checkpoint file
> slurmctld: State of 0 triggers recovered
> slurmctld: read_slurm_conf: backup_controller not specified.
> slurmctld: Reinitializing job accounting state
> slurmctld: Running as primary controller
> 
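The security warning and the "Could not open ... state file" messages above both come from StateSaveLocation=/tmp: /tmp is world-writable, and on a first start the state files simply do not exist yet (that part is harmless). Give slurmctld a dedicated directory owned by SlurmUser instead, for example (the paths here are only a suggestion):

  mkdir -p /var/spool/slurmctld
  chown slurm:slurm /var/spool/slurmctld
  chmod 700 /var/spool/slurmctld

and in slurm.conf:

  StateSaveLocation=/var/spool/slurmctld

SlurmdSpoolDir=/tmp/slurmd deserves the same treatment, e.g. /var/spool/slurmd.
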
> slurmctld: debug2: slurmctld listening on 0.0.0.0:6817
> slurmctld: debug:  Spawning registration agent for control-machine 1 hosts
> slurmctld: debug2: Spawning RPC agent for msg_type
> REQUEST_NODE_REGISTRATION_STATUS
> slurmctld: debug2: got 1 threads to send out
> slurmctld: debug2: Tree head got back 0 looking for 1
> slurmctld: debug3: Tree sending to control-machine
> slurmctld: debug2: _slurm_connect failed: Connection refused
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> slurmctld: debug3: connect refused, retrying
> slurmctld: debug2: _slurm_connect failed: Connection refused
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> slurmctld: debug2: _slurm_connect failed: Connection refused
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> 
> slurmctld: debug3: problems with control-machine
> slurmctld: debug2: Tree head got back 1
> slurmctld: agent/is_node_resp: node:control-machine rpc:1001 :
> Communication connection failure
> slurmctld: error: Nodes control-machine not responding
> 
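"Connection refused" on 10.47.65.195:6818 means nothing is listening on the slurmd port, i.e. slurmd is not running on control-machine. On a single machine you still need both daemons: slurmctld (as SlurmUser) and slurmd (normally as root). Running slurmd in the foreground with verbose logging is the easiest way to see why it fails to start, if it does:

  # as root
  slurmd -D -vvv

Once slurmd is up and has registered, the "Nodes control-machine not responding" errors should stop.
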
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> slurmctld: debug2: _slurm_connect failed: Connection refused
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> ^Cslurmctld: Terminate signal (SIGINT or SIGTERM) received
> slurmctld: debug:  sched: slurmctld terminating
> slurmctld: debug3: _slurmctld_rpc_mgr shutting down
> slurmctld: Saving all slurm state
> slurmctld: error: Could not open job state file /tmp/job_state: No such
> file or directory
> slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
> slurmctld: No job state file (/tmp/job_state.old) found
> slurmctld: debug3: Writing job id 1 to header record of job_state file
> slurmctld: debug4: unable to create link for /tmp/job_state ->
> /tmp/job_state.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/node_state ->
> /tmp/node_state.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/part_state ->
> /tmp/part_state.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/resv_state ->
> /tmp/resv_state.old: No such file or directory
> slurmctld: debug2: _slurm_connect failed: Connection refused
> slurmctld: debug2: Error connecting slurm stream socket at
> 10.47.65.195:6818: Connection refused
> slurmctld: debug3: problems with control-machine
> slurmctld: debug2: Tree head got back 1
> slurmctld: debug4: unable to create link for /tmp/trigger_state ->
> /tmp/trigger_state.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/assoc_mgr_state ->
> /tmp/assoc_mgr_state.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/assoc_usage ->
> /tmp/assoc_usage.old: No such file or directory
> slurmctld: debug4: unable to create link for /tmp/qos_usage ->
> /tmp/qos_usage.old: No such file or directory
> slurmctld: debug3: _slurmctld_background shutting down
> slurmctld: Unable to remove pidfile '/var/run/slurmctld.pid': Permission
> denied
> 
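The pidfile error at shutdown is the same class of problem: /var/run is owned by root, but slurmctld runs as SlurmUser=slurm. One way out is to point the pidfiles at a directory the slurm user can write to, for example:

  mkdir -p /var/run/slurm
  chown slurm:slurm /var/run/slurm

and in slurm.conf:

  SlurmctldPidFile=/var/run/slurm/slurmctld.pid
  SlurmdPidFile=/var/run/slurm/slurmd.pid
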
> I am trying to run this on a single machine.
> Any suggestions?
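
One more thing to check, since you have AuthType=auth/munge: munged must be running (with a valid key) before either Slurm daemon starts. A quick sanity check:

  munge -n | unmunge    # should print STATUS: Success (0)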
> 
> Thanks!
> -Monica
> 
> 
> 
> On Thu, Oct 9, 2014 at 2:17 PM, Michael Jennings <[email protected]> wrote:
> 
>     Have you tried generating your own on the web?
> 
>     http://slurm.schedmd.com/configurator.easy.html
>     http://slurm.schedmd.com/configurator.html
> 
>     You might get more appropriate results that way.  :-)
> 
>     Michael
> 
> 
>     On Thu, Oct 9, 2014 at 11:11 AM, Monica Marathe
>     <[email protected]> wrote:
> 
>         Hi,
> 
>         Can anyone send me a sample slurm.conf file? I am trying to
>         configure SLURM on a single machine only.
> 
>         Thanks!
>         -Monica
> 
>         -- 
>         - Monica Marathe
> 
> 
> 
> 
>     -- 
>     Michael Jennings <[email protected]>
>     Senior HPC Systems Engineer
>     High-Performance Computing Services
>     Lawrence Berkeley National Laboratory
>     Bldg 50B-3209E        W: 510-495-2687
>     MS 050B-3209          F: 510-486-8615
> 
> 
> 
> 
> -- 
> - Monica Marathe
