Hi,
I am trying to figure out how to make the following work on our cluster using Torque/Maui:


I'd like a user to be able to submit as many jobs as they like. However, they should only be allowed up to 32 CPUs or 32 GB of memory. Beyond that, if there are idle resources, the rest of their jobs can be backfilled onto idle nodes.

If another user submits jobs, they should get the same policy, and their jobs should preempt any backfilled jobs (if that's required to meet the 32-CPU or memory limit).

So basically, I think this should be a fairly common setup. I want to run as many jobs as possible on idle resources but only guarantee the jobs that fall under the MAXPROC/MAXMEM policy. I've implemented the MAXPROC/MAXMEM policy, but it appears backfill won't work for the remaining jobs, so I am assuming backfill has to abide by the MAXPROC/MAXMEM policy I have in place. Can anyone give me some pointers to the proper way to implement this? Thanks in advance!

-Steve


[root@ maui]# cat maui.cfg (edited for some content)
# maui.cfg 3.2.6p14


# Resource Manager Definition

RMCFG[JAKE] TYPE=PBS


RMPOLLINTERVAL        00:00:30

SERVERPORT            42559
SERVERMODE            NORMAL

# Admin: http://clusterresources.com/mauidocs/a.esecurity.html


LOGDIR                /var/log/maui
LOGFILE               maui.log
LOGFILEMAXSIZE        100000000
#LOGLEVEL              3
LOGLEVEL              2
LOGFILEROLLDEPTH      5
STATDIR               /var/log/maui/stats
SERVERHOMEDIR         /usr/maui/
TOOLSDIR              /usr/maui/tools/
LOGDIR                /var/log/maui/
STATDIR               /usr/maui/stats/
#LOCKFILE              /usr/maui/maui.pid
SERVERCONFIGFILE      /usr/maui/maui.cfg
CHECKPOINTFILE        /var/log/maui/maui.ck

# Misc configs

ENABLEMULTINODEJOBS     TRUE
JOBMAXOVERRUN           00:01:00
#SYSTEMDEFAULTJOBWALLTIME       1:00:00:00
USEMACHINESPEED         ON
#PREEMPTPOLICY          CHECKPOINT
PREEMPTPOLICY           SUSPEND
CREDWEIGHT             1
CLASSWEIGHT            1
QOSWEIGHT              1
RESCTLPOLICY            ANY

# Job Priority: http://clusterresources.com/mauidocs/5.1jobprioritization.html

QUEUETIMEWEIGHT       1

# FairShare: http://clusterresources.com/mauidocs/6.3fairshare.html

FSPOLICY              DEDICATEDPS
FSDEPTH               7
FSINTERVAL            86400
FSDECAY               0.80

# Throttling Policies: http://clusterresources.com/mauidocs/6.2throttlingpolicies.html

# NONE SPECIFIED

# Backfill: http://clusterresources.com/mauidocs/8.2backfill.html

BACKFILLPOLICY        BESTFIT
RESERVATIONPOLICY     CURRENTHIGHEST
#RESERVATIONPOLICY    NEVER
RESERVATIONDEPTH      50
RESDEPTH              32

# Node Allocation: http://clusterresources.com/mauidocs/5.2nodeallocation.html

NODEACCESSPOLICY        SHARED
#NODEALLOCATIONPOLICY   MINRESOURCE
#NODEALLOCATIONPOLICY   MAXBALANCE
NODEALLOCATIONPOLICY    FASTEST
#NODEAVAILABILITYPOLICY         UTILIZED
NODEAVAILABILITYPOLICY  COMBINED
NODEMAXLOAD             1.0
NODELOADPOLICY          ADJUSTSTATE


# QOS: http://clusterresources.com/mauidocs/7.3qos.html


QOSCFG[qm] PRIORITY=100 QFLAGS=PREEMPTEE
QOSCFG[md] PRIORITY=100 QFLAGS=PREEMPTEE
QOSCFG[faculty] PRIORITY=1000 QFLAGS=PREEMPTOR
QOSFEATURES[qm] hamilton g03
QOSFEATURES[md] hamilton

# Standing Reservations: http://clusterresources.com/mauidocs/7.1.3standingreservations.html

# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test]   17:00:00
# SRDAYS[test]      MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test]   0:30:00

# Creds: http://clusterresources.com/mauidocs/6.1fairnessoverview.html

# USERCFG[DEFAULT]      FSTARGET=25.0
# USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
# GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
#
# Groups
#
GROUPCFG[faculty]       PRIORITY=1000 QLIST=faculty QDEF=faculty
GROUPCFG[hamilton]      PRIORITY=10
GROUPCFG[users]         PRIORITY=10
#
# Classes (queues)
#
#CLASSCFG[main]         QLIST=md:qm
CLASSCFG[main] QLIST=md:qm:mercury MAXPROC=32,64 MAXMEM=32768,65536
CLASSCFG[hamilton]      QLIST=md:qm



torque config
-------------------

[root@ maui]# qmgr
Max open servers: 4
Qmgr: print server
#
# Create queues and set their attributes.
#
#
# Create and define queue main
#
create queue main
set queue main queue_type = Execution
set queue main Priority = 100
set queue main resources_default.neednodes = main
set queue main resources_default.walltime = 24:00:00
set queue main enabled = True
set queue main started = True
#
# Create and define queue hamilton
#
create queue hamilton
set queue hamilton queue_type = Execution
set queue hamilton resources_default.neednodes = hamilton
set queue hamilton resources_default.walltime = 24:00:00
set queue hamilton enabled = True
set queue hamilton started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = main
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server resources_default.walltime = 24:00:00
set server scheduler_iteration = 60
set server node_check_rate = 150
set server tcp_timeout = 6
set server job_nanny = True
_______________________________________________
mauiusers mailing list
mauiusers@supercluster.org
http://www.supercluster.org/mailman/listinfo/mauiusers

Reply via email to