Hi,
We are using Open Grid Scheduler/Grid Engine version 2011.11p1.
We currently have two queues, with identical settings except for priority:
all.q -- default queue
high.q ---- higher priority
The scheduler is set to a least-nodes-used policy. All our nodes have
identical resources: 30 cores and 120GB RAM. The scheduler works as expected,
honoring the queue priorities, when jobs with low resource requests are
submitted. But when a high-memory job (50+GB) is submitted to high.q, it gets
stuck waiting in the queue forever: low-memory jobs from the default queue
(all.q) are started whenever a resource becomes available, so the scheduler is
never able to fulfill the high-memory job's requirements even though it has
higher priority. How can I make all jobs in all.q wait until the
higher-priority jobs finish?
Thanks,
Here are the details of our GE config:
root@master1: gridengine#qconf -ssconf
algorithm default
schedule_interval 0:0:05
maxujobs 0
queue_sort_method load
job_load_adjustments np_load_avg=1.75
load_adjustment_decay_time 0:7:30
load_formula np_load_avg
schedd_job_info true
flush_submit_sec 0
flush_finish_sec 0
params none
reprioritize_interval 0:0:0
halftime 168
usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor 5.000000
weight_user 0.250000
weight_project 0.250000
weight_department 0.250000
weight_job 0.250000
weight_tickets_functional 0
weight_tickets_share 0
share_override_tickets TRUE
share_functional_shares TRUE
max_functional_jobs_to_schedule 200
report_pjob_tickets TRUE
max_pending_tasks_per_job 50
halflife_decay_list none
policy_hierarchy OFS
weight_ticket 0.010000
weight_waiting_time 0.000000
weight_deadline 3600000.000000
weight_urgency 0.100000
weight_priority 1.000000
max_reservation 64
default_duration 360:00:00
root@master1: gridengine#qconf -sq high.q
qname high.q
hostlist @allhosts
seq_no 0
load_thresholds np_load_avg=3.0
suspend_thresholds NONE
nsuspend 1
suspend_interval 00:05:00
priority -10
min_cpu_interval 00:05:00
processors UNDEFINED
qtype BATCH INTERACTIVE
ckpt_list NONE
pe_list make mpich mpi orte smp threaded
rerun FALSE
slots 1,[]
tmpdir /tmp
shell /bin/bash
prolog NONE
epilog NONE
shell_start_mode posix_compliant
starter_method NONE
suspend_method NONE
resume_method NONE
terminate_method NONE
notify 00:00:60
owner_list NONE
user_lists NONE
xuser_lists NONE
subordinate_list NONE
complex_values NONE
projects NONE
xprojects NONE
calendar NONE
initial_state default
s_rt INFINITY
h_rt INFINITY
s_cpu INFINITY
h_cpu INFINITY
s_fsize INFINITY
h_fsize INFINITY
s_data INFINITY
h_data INFINITY
s_stack 20971520
h_stack 104857600
s_core INFINITY
h_core 0
s_rss INFINITY
h_rss INFINITY
s_vmem INFINITY
h_vmem INFINITY
root@master1: gridengine#qconf -sq all.q
qname all.q
hostlist @allhosts
seq_no 0
load_thresholds np_load_avg=3.0
suspend_thresholds NONE
nsuspend 1
suspend_interval 00:05:00
priority 0
min_cpu_interval 00:05:00
processors UNDEFINED
qtype BATCH INTERACTIVE
ckpt_list NONE
pe_list make mpich mpi orte smp threaded
rerun FALSE
slots 1,[]
tmpdir /tmp
shell /bin/bash
prolog NONE
epilog NONE
shell_start_mode posix_compliant
starter_method NONE
suspend_method NONE
resume_method NONE
terminate_method NONE
notify 00:00:60
owner_list NONE
user_lists NONE
xuser_lists NONE
subordinate_list NONE
complex_values NONE
projects NONE
xprojects NONE
calendar NONE
initial_state default
s_rt INFINITY
h_rt INFINITY
s_cpu INFINITY
h_cpu INFINITY
s_fsize INFINITY
h_fsize INFINITY
s_data INFINITY
h_data INFINITY
s_stack 20971520
h_stack 104857600
s_core INFINITY
h_core 0
s_rss INFINITY
h_rss INFINITY
s_vmem INFINITY
h_vmem INFINITY
root@master1: gridengine#qconf -se compute-2-1
hostname compute-2-1.local
load_scaling NONE
complex_values slots=30,h_vmem=120G,io_slots=30
load_values arch=linux-x64,num_proc=32,mem_total=129169.750000M, \
swap_total=31983.871094M,virtual_total=161153.621094M, \
load_avg=21.680000,load_short=21.950000, \
load_medium=21.680000,load_long=21.480000, \
mem_free=102849.832031M,swap_free=31983.871094M, \
virtual_free=134833.703125M,mem_used=26319.917969M, \
swap_used=0.000000M,virtual_used=26319.917969M, \
cpu=65.300000, \
m_topology=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
m_topology_inuse=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
m_socket=2,m_core=16,np_load_avg=0.677500, \
np_load_short=0.685937,np_load_medium=0.677500, \
np_load_long=0.671250
processors 32
user_lists NONE
xuser_lists NONE
projects NONE
xprojects NONE
usage_scaling NONE
report_variables NONE
root@squid: master1#qconf -sp threaded
pe_name threaded
slots 9999
user_lists NONE
xuser_lists NONE
start_proc_args /bin/true
stop_proc_args /bin/true
allocation_rule $pe_slots
control_slaves FALSE
job_is_first_task TRUE
urgency_slots min
accounting_summary FALSE
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users