Hi,

We are using Open Grid Scheduler/Grid Engine version 2011.11p1.
We currently have two queues with identical settings except for priority.

all.q  -- default queue
high.q    ---- higher priority

The scheduler is set to a least-nodes-used policy. All our nodes have
identical resources: 30 cores and 120GB RAM. The scheduler works as expected,
honoring queue priorities, when jobs with low resource requests are submitted.
But when a high-memory job (50+GB) is submitted to high.q, it gets stuck in
the queue waiting forever: low-memory jobs from the default queue (all.q) are
started whenever a resource becomes available, so the scheduler is never able
to fulfill the high-memory job's requirements even though it has higher
priority. How can I make all jobs in all.q wait until the higher-priority
jobs finish?


Thanks,

Here are the details of our GE config:

root@master1: gridengine#qconf -ssconf
algorithm                         default
schedule_interval                 0:0:05
maxujobs                          0
queue_sort_method                 load
job_load_adjustments              np_load_avg=1.75
load_adjustment_decay_time        0:7:30
load_formula                      np_load_avg
schedd_job_info                   true
flush_submit_sec                  0
flush_finish_sec                  0
params                            none
reprioritize_interval             0:0:0
halftime                          168
usage_weight_list                 cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor               5.000000
weight_user                       0.250000
weight_project                    0.250000
weight_department                 0.250000
weight_job                        0.250000
weight_tickets_functional         0
weight_tickets_share              0
share_override_tickets            TRUE
share_functional_shares           TRUE
max_functional_jobs_to_schedule   200
report_pjob_tickets               TRUE
max_pending_tasks_per_job         50
halflife_decay_list               none
policy_hierarchy                  OFS
weight_ticket                     0.010000
weight_waiting_time               0.000000
weight_deadline                   3600000.000000
weight_urgency                    0.100000
weight_priority                   1.000000
max_reservation                   64
default_duration                  360:00:00

root@master1: gridengine#qconf -sq high.q
qname                 high.q
hostlist              @allhosts
seq_no                0
load_thresholds       np_load_avg=3.0
suspend_thresholds    NONE
nsuspend              1
suspend_interval      00:05:00
priority              -10
min_cpu_interval      00:05:00
processors            UNDEFINED
qtype                 BATCH INTERACTIVE
ckpt_list             NONE
pe_list               make mpich mpi orte smp threaded
rerun                 FALSE
slots                 1,[]
tmpdir                /tmp
shell                 /bin/bash
prolog                NONE
epilog                NONE
shell_start_mode      posix_compliant
starter_method        NONE
suspend_method        NONE
resume_method         NONE
terminate_method      NONE
notify                00:00:60
owner_list            NONE
user_lists            NONE
xuser_lists           NONE
subordinate_list      NONE
complex_values        NONE
projects              NONE
xprojects             NONE
calendar              NONE
initial_state         default
s_rt                  INFINITY
h_rt                  INFINITY
s_cpu                 INFINITY
h_cpu                 INFINITY
s_fsize               INFINITY
h_fsize               INFINITY
s_data                INFINITY
h_data                INFINITY
s_stack               20971520
h_stack               104857600
s_core                INFINITY
h_core                0
s_rss                 INFINITY
h_rss                 INFINITY
s_vmem                INFINITY
h_vmem                INFINITY

root@master1: gridengine#qconf -sq all.q
qname                 all.q
hostlist              @allhosts
seq_no                0
load_thresholds       np_load_avg=3.0
suspend_thresholds    NONE
nsuspend              1
suspend_interval      00:05:00
priority              0
min_cpu_interval      00:05:00
processors            UNDEFINED
qtype                 BATCH INTERACTIVE
ckpt_list             NONE
pe_list               make mpich mpi orte smp threaded
rerun                 FALSE
slots                 1,[]
tmpdir                /tmp
shell                 /bin/bash
prolog                NONE
epilog                NONE
shell_start_mode      posix_compliant
starter_method        NONE
suspend_method        NONE
resume_method         NONE
terminate_method      NONE
notify                00:00:60
owner_list            NONE
user_lists            NONE
xuser_lists           NONE
subordinate_list      NONE
complex_values        NONE
projects              NONE
xprojects             NONE
calendar              NONE
initial_state         default
s_rt                  INFINITY
h_rt                  INFINITY
s_cpu                 INFINITY
h_cpu                 INFINITY
s_fsize               INFINITY
h_fsize               INFINITY
s_data                INFINITY
h_data                INFINITY
s_stack               20971520
h_stack               104857600
s_core                INFINITY
h_core                0
s_rss                 INFINITY
h_rss                 INFINITY
s_vmem                INFINITY
h_vmem                INFINITY

root@master1: gridengine#qconf -se compute-2-1
hostname              compute-2-1.local
load_scaling          NONE
complex_values        slots=30,h_vmem=120G,io_slots=30
load_values           arch=linux-x64,num_proc=32,mem_total=129169.750000M, \
                      swap_total=31983.871094M,virtual_total=161153.621094M, \
                      load_avg=21.680000,load_short=21.950000, \
                      load_medium=21.680000,load_long=21.480000, \
                      mem_free=102849.832031M,swap_free=31983.871094M, \
                      virtual_free=134833.703125M,mem_used=26319.917969M, \
                      swap_used=0.000000M,virtual_used=26319.917969M, \
                      cpu=65.300000, \
                      m_topology=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
                      m_topology_inuse=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
                      m_socket=2,m_core=16,np_load_avg=0.677500, \
                      np_load_short=0.685937,np_load_medium=0.677500, \
                      np_load_long=0.671250
processors            32
user_lists            NONE
xuser_lists           NONE
projects              NONE
xprojects             NONE
usage_scaling         NONE
report_variables      NONE

root@squid: master1#qconf -sp threaded
pe_name            threaded
slots              9999
user_lists         NONE
xuser_lists        NONE
start_proc_args    /bin/true
stop_proc_args     /bin/true
allocation_rule    $pe_slots
control_slaves     FALSE
job_is_first_task  TRUE
urgency_slots      min
accounting_summary FALSE
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to