Hello,

I have a problem submitting parallel jobs, e.g.:

qsub -pe orte 4 -cwd -V <<< "mpiexec -n 4 a.out"

No files are opened by this program, only output to stdout.

The master has 8 nodes, with 24 to 64 cpus each (see output of qhost in
the attachment).
My problem: When I submit with qsub, I can run on 100 CPUs, but 300 breaks
it all (the job crashes). I wonder why.
In other words, it seems to work at small scale, but if I use too many CPUs
I get errors. It happens on random nodes. The jobs get stuck when this
happens, with this error message:

[karun02:48241] [[59899,0],0] ORTE_ERROR_LOG: The system limit on number
of pipes a process can open was reached in file base/iof_base_setup.c at
line 99
[karun02:48241] [[59899,0],0] ORTE_ERROR_LOG: The system limit on number
of pipes a process can open was reached in file odls_default_module.c at
line 895
--------------------------------------------------------------------------
WARNING: The accept(3) system call failed on a TCP socket.  While this
should generally never happen on a well-configured HPC system, the
most common causes when it does occur are:

  * The process ran out of file descriptors
  * The operating system ran out of file descriptors
  * The operating system ran out of memory

Your Open MPI job will likely hang until the failure resason is fixed
(e.g., more file descriptors and/or memory becomes available), and may
eventually timeout / abort.

  Local host:     karun02
  Errno:          24 (Too many open files)
  Probable cause: Out of file descriptors
--------------------------------------------------------------------------


I have no idea what I have overlooked. (Yes, I googled around, but did not
find a solution.)
Does anybody have an idea what I did wrong?

With kind regards, ulrich

Here is my configuration (output of 'qhost -q' in the attachment):


My configuration:
~> qconf -sq all.q| grep pe_list
pe_list               make smp mpi orte

~> qconf -sp orte
pe_name            orte
slots              99999
user_lists         NONE
xuser_lists        NONE
start_proc_args    /bin/true
stop_proc_args     /bin/true
allocation_rule    $fill_up
control_slaves     TRUE
job_is_first_task  FALSE
urgency_slots      min
accounting_summary TRUE

~> qconf -ssconf
algorithm                         default
schedule_interval                 0:0:15
maxujobs                          0
queue_sort_method                 load
job_load_adjustments              np_load_avg=5.50
load_adjustment_decay_time        0:7:30
load_formula                      np_load_avg
schedd_job_info                   false
flush_submit_sec                  0
flush_finish_sec                  0
params                            none
reprioritize_interval             1:0:0
halftime                          168
usage_weight_list                 cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor               5.000000
weight_user                       1.000000
weight_project                    0.000000
weight_department                 0.000000
weight_job                        0.000000
weight_tickets_functional         100000
weight_tickets_share              100000
share_override_tickets            TRUE
share_functional_shares           TRUE
max_functional_jobs_to_schedule   2000
report_pjob_tickets               TRUE
max_pending_tasks_per_job         500
halflife_decay_list               none
policy_hierarchy                  OFS
weight_ticket                     0.010000
weight_waiting_time               0.000000
weight_deadline                   3600000.000000
weight_urgency                    0.100000
weight_priority                   1.000000
max_reservation                   0
default_duration                  INFINITY




~> qconf -sconf
#global:
execd_spool_dir              /home/sgeadmin/sge/GE2011.11p1/default/spool
mailer                       /bin/mail
xterm                        /usr/bin/X11/xterm
load_sensor                  none
prolog                       none
epilog                       none
shell_start_mode             posix_compliant
login_shells                 sh,bash,ksh,csh,tcsh
min_uid                      0
min_gid                      0
user_lists                   none
xuser_lists                  none
projects                     none
xprojects                    none
enforce_project              false
enforce_user                 auto
load_report_time             00:00:40
max_unheard                  00:05:00
reschedule_unknown           02:00:00
loglevel                     log_warning
administrator_mail           [email protected]
set_token_cmd                none
pag_cmd                      none
token_extend_time            none
shepherd_cmd                 none
qmaster_params               none
execd_params                 none
reporting_params             accounting=true reporting=false \
                             flush_time=00:00:15 joblog=false sharelog=00:00:00
finished_jobs                100
gid_range                    20000-20100
qlogin_command               builtin
qlogin_daemon                builtin
rlogin_command               builtin
rlogin_daemon                builtin
rsh_command                  builtin
rsh_daemon                   builtin
max_aj_instances             2000
max_aj_tasks                 75000
max_u_jobs                   0
max_jobs                     0
max_advance_reservations     0
auto_user_oticket            0
auto_user_fshare             0
auto_user_default_project    none
auto_user_delete_time        86400
delegated_file_staging       false
reprioritize                 1
jsv_url                      none
jsv_allowed_mod              ac,h,i,e,o,j,M,N,p,w
user_sort                    true



HOSTNAME                ARCH         NCPU  LOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS
-------------------------------------------------------------------------------
global                  -               -     -       -       -       -       -
karun01                 linux-x64      64  0.01  504.9G    2.0G   10.0G     0.0
   all.q                BIP   0/0/64        
karun02                 linux-x64      64  1.00  504.8G   11.8G   10.0G     0.0
   all.q                BIP   0/64/64       
karun03                 linux-x64      64  0.01  504.9G    2.0G   10.0G     0.0
   all.q                BIP   0/64/64       
karun04                 linux-x64      64  0.01  504.9G    2.0G   10.0G     0.0
   all.q                BIP   0/64/64       
karun05                 linux-x64      64  0.01  504.9G    2.0G   10.0G     0.0
   all.q                BIP   0/21/64       
karun06                 linux-x64      40  0.01  314.8G    1.1G   32.0G     0.0
   all.q                BIP   0/40/40       
karun07                 linux-x64      24  0.01  188.8G  637.3M   30.0G     0.0
   all.q                BIP   0/24/24       
karun08                 linux-x64      24  0.01  188.8G  648.7M   30.0G     0.0
   all.q                BIP   0/24/24       
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to