Hi,

I am a long time happy user of mpi_comm_spawn() routine. But so far I
used it only with the MPI_COMM_WORLD communicator. Now I want to
execute more mpi_comm_spawn() routines, by creating and using group
communicators. However this seems to have some problems. I can get it
to run about 50% times on my laptop, but on some more "speedy"
machines it just produces the following message:

$ mpirun -n 4 a.out
[ala:31406] [[45304,0],0] ORTE_ERROR_LOG: Not found in file 
base/plm_base_launch_support.c at line 758
--------------------------------------------------------------------------
mpirun was unable to start the specified application as it encountered an error.
More information may be available above.
--------------------------------------------------------------------------

I am attaching the 2 programs needed to test the behavior. Compile:
$ mpif90 -o sps sps.f08 # spawned program
$ mpif90 mspbug.f08     # program with problems
$ mpirun -n 4 a.out

The compiler is gfortran-4.4.4, and openmpi is 1.4.2.

Needless to say it runs with mpich2, but mpich2 doesn't know how to
deal with stdin on a spawned process, so it's useless for my project :-(

Any ideas?

-------------------------------------------------
program sps
  use mpi
  implicit none
  integer :: ier,nproc,me,pcomm,meroot,mi,on
  integer, dimension(1:10) :: num

  call mpi_init(ier)

  mi=mpi_integer
  call mpi_comm_rank(mpi_comm_world,me,ier)
  meroot=0

  on=1

  call mpi_comm_get_parent(pcomm,ier)

  call mpi_bcast(num,on,mi,meroot,pcomm,ier)
  write(*,*)'sps>me,num=',me,num(on)

  call mpi_finalize(ier)

end program sps
-------------------------------------------------

program groupspawn

  use mpi

  implicit none
  ! in the case use mpi does not work (eg Ubuntu) use the include below
  ! include 'mpif.h'
  integer :: ier,intercom,nproc,meroot,info,mpierrs(1),mcw
  integer :: i,myrepsiz,me,np,mcg,repdgrp,repdcom,on,mi,op
  integer, dimension(1:10) :: myrepgrp
  character(len=5) :: sarg(1),prog
  integer, dimension(1:10) :: num,sm
  integer :: newme,ngrp,igrp

  call mpi_init(ier)

  prog='sps'
  sarg(1) = ''
  nproc=2
  on=1
  meroot=0
  mcw=mpi_comm_world
  info=mpi_info_null
  mi=mpi_integer
  op=mpi_sum
  mpierrs(1)=mpi_errcodes_ignore(1)

  call mpi_comm_rank(mcw,me,ier)
  call mpi_comm_size(mcw,np,ier)

  ngrp=2  ! lets have some groups
  myrepsiz=np/ngrp
  igrp=me/myrepsiz
  do i = 1, myrepsiz
        myrepgrp(i)=i+me-mod(me,myrepsiz)-1
  enddo

  call mpi_comm_group(mcw,mcg,ier)
  call mpi_group_incl(mcg,myrepsiz,myrepgrp,repdgrp,ier)
  call mpi_comm_create(mcw,repdgrp,repdcom,ier)

!  call mpi_comm_spawn(prog,sarg,nproc,info,meroot,mcw,intercom,mpierrs,ier)
  call mpi_comm_spawn(prog,sarg,nproc,info,meroot,repdcom,intercom,mpierrs,ier)

  ! send a number to spawned ones...

  call mpi_comm_rank(intercom,newme,ier)
  write(*,*)'me,intercom,newme=',me,intercom,newme
  num(1)=111*(igrp+1)

  meroot=mpi_proc_null
  if(newme == 0) meroot=mpi_root ! to send data

  call mpi_bcast(num,on,mi,meroot,intercom,ier)
  ! sometimes there is no output from sps programs, so we wait here: WEIRD :-(
  !call sleep(1)

  call mpi_finalize(ier)

end program groupspawn

Reply via email to