If your jobs or queues have an h_rt specified, you can look into resource
reservation and submitting large memory jobs with -R y. You will likely want to
look into tweaking the max_reservation and default_duration parameters via qconf
-mconf/-msconf. Utilizing resource reservation puts more load on the
qmaster/scheduler but allows it to prevent smaller jobs from flooding out large
jobs when only small portions of nodes become available.

Other options are using qhold or disabling the all.q queue instance on many 
nodes when there is a backlog of high.q jobs.

Also, if you haven't already, you may want to look into making mem a consumable
resource (based on your qconf -se output you may have already done this).

Best,
Chris




On 4/14/16, 8:10 PM, "[email protected] on behalf of Happy Monk" 
<[email protected] on behalf of [email protected]> wrote:

>Hi,
>
>
>We are using Open Grid Scheduler/Grid Engine version 2011.11p1
>
>Currently have two queues, with identical settings except priority.
>
>
>all.q  -- default queue
>
>high.q    ---- higher priority
>
>
>The scheduler is set to least nodes used policy. All our nodes have identical 
>resources, 30 cores, 120GB RAM. Scheduler is working as expected when 
>submitting jobs with low resource requests as per queue priorities. But when a 
>high mem job (50+GB) is submitted in high.q, it gets stuck in the queue waiting
>forever, as low mem jobs from default.q are executed whenever a resource is
>available and the scheduler is not able to fulfill the high mem job's
>requirements even though it is of higher priority. How can I make all jobs in
>default.q wait until higher priority jobs finish?
>
>
>
>Thanks,
>
>
>Here are the details of our GE config,
>
>
>root@master1: gridengine#qconf -ssconf
>algorithm                         default
>schedule_interval                 0:0:05
>maxujobs                          0
>queue_sort_method                 load
>job_load_adjustments              np_load_avg=1.75
>load_adjustment_decay_time        0:7:30
>load_formula                      np_load_avg
>schedd_job_info                   true
>flush_submit_sec                  0
>flush_finish_sec                  0
>params                            none
>reprioritize_interval             0:0:0
>halftime                          168
>usage_weight_list                 cpu=1.000000,mem=0.000000,io=0.000000
>compensation_factor               5.000000
>weight_user                       0.250000
>weight_project                    0.250000
>weight_department                 0.250000
>weight_job                        0.250000
>weight_tickets_functional         0
>weight_tickets_share              0
>share_override_tickets            TRUE
>share_functional_shares           TRUE
>max_functional_jobs_to_schedule   200
>report_pjob_tickets               TRUE
>max_pending_tasks_per_job         50
>halflife_decay_list               none
>policy_hierarchy                  OFS
>weight_ticket                     0.010000
>weight_waiting_time               0.000000
>weight_deadline                   3600000.000000
>weight_urgency                    0.100000
>weight_priority                   1.000000
>max_reservation                   64
>default_duration                  360:00:00
>
>root@master1: gridengine#qconf -sq high.q
>qname                 high.q
>hostlist              @allhosts
>seq_no                0
>load_thresholds       np_load_avg=3.0
>suspend_thresholds    NONE
>nsuspend              1
>suspend_interval      00:05:00
>priority              -10
>min_cpu_interval      00:05:00
>processors            UNDEFINED
>qtype                 BATCH INTERACTIVE
>ckpt_list             NONE
>pe_list               make mpich mpi orte smp threaded
>rerun                 FALSE
>slots                 1,[]
>tmpdir                /tmp
>shell                 /bin/bash
>prolog                NONE
>epilog                NONE
>shell_start_mode      posix_compliant
>starter_method        NONE
>suspend_method        NONE
>resume_method         NONE
>terminate_method      NONE
>notify                00:00:60
>owner_list            NONE
>user_lists            NONE
>xuser_lists           NONE
>subordinate_list      NONE
>complex_values        NONE
>projects              NONE
>xprojects             NONE
>calendar              NONE
>initial_state         default
>s_rt                  INFINITY
>h_rt                  INFINITY
>s_cpu                 INFINITY
>h_cpu                 INFINITY
>s_fsize               INFINITY
>h_fsize               INFINITY
>s_data                INFINITY
>h_data                INFINITY
>s_stack               20971520
>h_stack               104857600
>s_core                INFINITY
>h_core                0
>s_rss                 INFINITY
>h_rss                 INFINITY
>s_vmem                INFINITY
>h_vmem                INFINITY
>
>root@master1: gridengine#qconf -sq all.q
>qname                 all.q
>hostlist              @allhosts
>seq_no                0
>load_thresholds       np_load_avg=3.0
>suspend_thresholds    NONE
>nsuspend              1
>suspend_interval      00:05:00
>priority              0
>min_cpu_interval      00:05:00
>processors            UNDEFINED
>qtype                 BATCH INTERACTIVE
>ckpt_list             NONE
>pe_list               make mpich mpi orte smp threaded
>rerun                 FALSE
>slots                 1,[]
>tmpdir                /tmp
>shell                 /bin/bash
>prolog                NONE
>epilog                NONE
>shell_start_mode      posix_compliant
>starter_method        NONE
>suspend_method        NONE
>resume_method         NONE
>terminate_method      NONE
>notify                00:00:60
>owner_list            NONE
>user_lists            NONE
>xuser_lists           NONE
>subordinate_list      NONE
>complex_values        NONE
>projects              NONE
>xprojects             NONE
>calendar              NONE
>initial_state         default
>s_rt                  INFINITY
>h_rt                  INFINITY
>s_cpu                 INFINITY
>h_cpu                 INFINITY
>s_fsize               INFINITY
>h_fsize               INFINITY
>s_data                INFINITY
>h_data                INFINITY
>s_stack               20971520
>h_stack               104857600
>s_core                INFINITY
>h_core                0
>s_rss                 INFINITY
>h_rss                 INFINITY
>s_vmem                INFINITY
>h_vmem                INFINITY
>
>root@master1: gridengine#qconf -se compute-2-1
>hostname              compute-2-1.local
>load_scaling          NONE
>complex_values        slots=30,h_vmem=120G,io_slots=30
>load_values           arch=linux-x64,num_proc=32,mem_total=129169.750000M, \
>                      swap_total=31983.871094M,virtual_total=161153.621094M, \
>                      load_avg=21.680000,load_short=21.950000, \
>                      load_medium=21.680000,load_long=21.480000, \
>                      mem_free=102849.832031M,swap_free=31983.871094M, \
>                      virtual_free=134833.703125M,mem_used=26319.917969M, \
>                      swap_used=0.000000M,virtual_used=26319.917969M, \
>                      cpu=65.300000, \
>                      m_topology=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
>                      m_topology_inuse=SCTTCTTCTTCTTCTTCTTCTTCTTSCTTCTTCTTCTTCTTCTTCTTCTT, \
>                      m_socket=2,m_core=16,np_load_avg=0.677500, \
>                      np_load_short=0.685937,np_load_medium=0.677500, \
>                      np_load_long=0.671250
>processors            32
>user_lists            NONE
>xuser_lists           NONE
>projects              NONE
>xprojects             NONE
>usage_scaling         NONE
>report_variables      NONE
>
>root@squid: master1#qconf -sp threaded
>pe_name            threaded
>slots              9999
>user_lists         NONE
>xuser_lists        NONE
>start_proc_args    /bin/true
>stop_proc_args     /bin/true
>allocation_rule    $pe_slots
>control_slaves     FALSE
>job_is_first_task  TRUE
>urgency_slots      min
>accounting_summary FALSE
>
>
>
>
>
>
>
This electronic message is intended for the use of the named recipient only, 
and may contain information that is confidential, privileged or protected from 
disclosure under applicable law. If you are not the intended recipient, or an 
employee or agent responsible for delivering this message to the intended 
recipient, you are hereby notified that any reading, disclosure, dissemination, 
distribution, copying or use of the contents of this message including any of 
its attachments is strictly prohibited. If you have received this message in 
error or are not the named recipient, please notify us immediately by 
contacting the sender at the electronic mail address noted above, and destroy 
all copies of this message. Please note, the recipient should check this email 
and any attachments for the presence of viruses. The organization accepts no 
liability for any damage caused by any virus transmitted by this email.

_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to