Hi all,
I recently implemented a fair share policy using share tickets. I've been
monitoring the cluster for a couple of days using qstat -pri -ext -u "*" in
order to see how the functional tickets are working and it seems to have the
intended effect. There are some anomalies where some running jobs have 0
tickets but still get scheduled since there are free resources; I assume this
is normal.
I'll admit that I don't fully understand the scheduling as it's somewhat
complex. So, I'm hoping someone can review the configuration to see if they
can find any glaring issues such as conflicting options.
I created a share-tree and gave all users an equal value of 10:
$ qconf -sstree
id=0
name=Root
type=0
shares=1
childnodes=1
id=1
name=default
type=0
shares=10
childnodes=NONE
I modified the scheduling by setting weight_tickets_share to 1000000. I
reduced weight_waiting_time, weight_priority, and weight_urgency to well below
weight_ticket (what are good values?).
$ qconf -ssconf
algorithm default
schedule_interval 0:0:15
maxujobs 0
queue_sort_method seqno
job_load_adjustments np_load_avg=0.50
load_adjustment_decay_time 0:7:30
load_formula np_load_avg
schedd_job_info false
flush_submit_sec 0
flush_finish_sec 0
params none
reprioritize_interval 0:0:0
halftime 168
usage_weight_list cpu=0.700000,mem=0.200000,io=0.100000
compensation_factor 5.000000
weight_user 0.250000
weight_project 0.250000
weight_department 0.250000
weight_job 0.250000
weight_tickets_functional 0
weight_tickets_share 1000000
share_override_tickets TRUE
share_functional_shares TRUE
max_functional_jobs_to_schedule 200
report_pjob_tickets TRUE
max_pending_tasks_per_job 50
halflife_decay_list none
policy_hierarchy OFS
weight_ticket 0.500000
weight_waiting_time 0.000010
weight_deadline 3600000.000000
weight_urgency 0.010000
weight_priority 0.010000
max_reservation 0
default_duration INFINITY
I modified all the users to set the fshare to 1000:
$ qconf -muser XXX
I modified the general conf to set auto_user_fshare to 1000 and
auto_user_delete_time to 7776000 (90 days). Halftime is set to the default of
168 hours (7 days); I assume I should increase this. I don't know if
auto_user_delete_time even matters.
$ qconf -sconf
#global:
execd_spool_dir /opt/gridengine/default/spool
mailer /opt/gridengine/default/commond/mail_wrapper.py
xterm /usr/bin/xterm
load_sensor none
prolog none
epilog none
shell_start_mode posix_compliant
login_shells sh,bash
min_uid 100
min_gid 100
user_lists none
xuser_lists none
projects none
xprojects none
enforce_project false
enforce_user auto
load_report_time 00:00:40
max_unheard 00:05:00
reschedule_unknown 00:00:00
loglevel log_info
administrator_mail none
set_token_cmd none
pag_cmd none
token_extend_time none
shepherd_cmd none
qmaster_params none
execd_params ENABLE_BINDING=true ENABLE_ADDGRP_KILL=true \
H_DESCRIPTORS=16K
reporting_params accounting=true reporting=true \
flush_time=00:00:15 joblog=true sharelog=00:00:00
finished_jobs 100
gid_range 20000-20100
qlogin_command /opt/gridengine/bin/rocks-qlogin.sh
qlogin_daemon /usr/sbin/sshd -i
rlogin_command builtin
rlogin_daemon builtin
rsh_command builtin
rsh_daemon builtin
max_aj_instances 2000
max_aj_tasks 75000
max_u_jobs 0
max_jobs 0
max_advance_reservations 0
auto_user_oticket 0
auto_user_fshare 1000
auto_user_default_project none
auto_user_delete_time 7776000
delegated_file_staging false
reprioritize 0
jsv_url none
jsv_allowed_mod ac,h,i,e,o,j,M,N,p,w
Thanks for your assistance.
Cheers
Iyad Kandalaft
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users