Hi all!

We have a small test cluster of 44 compute nodes, 4 of which are equipped
with 2 MICs each.

So, I would like to share with you my latest version of the mpirun-mic
script. It is based on the script originally created by Mr. Olli-Pekka
Lehto. Our users are happy with the script, and if you would like, you
could include it in the next Slurm release, provided Mr. Lehto agrees with
this. You can use it or modify it as you like :)

Here is the script (it is also attached to this message):

{{{
#!/bin/bash
# **************************************************************************
# Function:  Wrapper that helps launching Intel MPI jobs within SLURM
#            using MICs in native mode.
#            mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version:   0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing Centre - Forschungszentrum Juelich
# Initial script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************

# Usage message
USAGE="
USAGE
  $(basename "$0") [ [-h] | [-v] [-x <host num tasks> -c <host binary>] [-z <mic num tasks> -m <mic binary>] ]

OPTIONS
  -h       Print this message.
  -c       Binary that will run on host nodes. If it is not set then only the MICs will be used.
  -m       Binary that will run inside the MICs.
  -x       Number of tasks (MPI ranks) for the host nodes. Default value is 1.
  -z       Number of tasks (MPI ranks) for the MICs. Default value is 1.
  -v       Show more info for this script.

MORE INFO
  The user MUST export the following environment variables:
    MIC_NUM_PER_HOST     Number of MICs on each host that will be used by mpiexec. Available options: 0, 1, 2. Default 2.
    OMP_NUM_THREADS      Number of OpenMP threads per task on the hosts. This MUST be exported when OpenMP is used!
    MIC_OMP_NUM_THREADS  Number of OpenMP threads per task on the MICs. If not defined, it is set to the same value as OMP_NUM_THREADS.

  Also the user MAY pass additional flags to mpiexec by exporting the following env vars:
    MPIEXEC_PREFIX       Wrap the execution of mpiexec with another tool (e.g. totalview).
    MPIEXEC_FLAGS_HOST   Flags that will be passed to the hosts.
    MPIEXEC_FLAGS_MIC    Flags that will be passed to the MICs.
  -- Examples:
       export MPIEXEC_PREFIX=\"totalview -args\"
       export MPIEXEC_PREFIX=\"totalviewcli -args\"
       export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
       export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"

EXAMPLES
  Batch Script1 - Only hosts:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=0
    export OMP_NUM_THREADS=32

    mpirun-mic -x 1 -c ./impi_native_hybrid
    ---

  Batch Script2 - Only mics:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export MIC_OMP_NUM_THREADS=240

    mpirun-mic -z 1 -m ./impi_native_hybrid.mic
    ---

  Batch Script3 - Hosts and MICs:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 2
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export OMP_NUM_THREADS=2
    export MIC_OMP_NUM_THREADS=4

    mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m ./impi_native_hybrid.mic
    ---
";

# check script arguments
if [ $# -lt 1 ] ; then
   echo "$USAGE" >&2
   exit 1
fi

# get script arguments
while getopts "vhc:m:x:z:" OPTION
do
  case $OPTION in
    h)
      echo "$USAGE";
      exit 0;
      ;;
    c)
      HOST_BINARY=$OPTARG
      ;;
    m)
      MIC_BINARY=$OPTARG
      ;;
    x)
      HOST_PPN=$OPTARG
      ;;
    z)
      MIC_PPN=$OPTARG
      ;;
    v)
      MPIRUN_MIC_VERBOSE=1
      ;;
    \?)
      echo "$USAGE";
      exit 1;
      ;;
    esac
done


### prepare the environment
# If not under SLURM, just run on the local system; we must still be on a compute node.
if [[ -z "$SLURM_PROCID" ]] ; then
    SLURM_PROCID=0
fi
if [[ -z "$SLURM_NODELIST" ]] ; then
    SLURM_NODELIST=`hostname`
fi

# give default values
if [[ -z "$MIC_PPN" ]] ; then
    MIC_PPN=1
fi
if [[ -z "$HOST_PPN" ]] ; then
    HOST_PPN=1
fi

if [[ -z "$MIC_NUM_PER_HOST" ]] ; then
    MIC_NUM_PER_HOST=2
fi


# We will use OMP_NUM_THREADS to decide if the user will run a Hybrid MPI+OpenMP job
# Here set default value for MIC_OMP_NUM_THREADS
if [[ -n "$OMP_NUM_THREADS" ]] ; then
  if [[ -z "$MIC_OMP_NUM_THREADS" ]] ; then
    MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
  fi
fi

# check the important values
if [[ -z "$HOST_BINARY" ]] &&  [[ -z "$MIC_BINARY" ]] ; then
  echo "$USAGE" >&2
  exit 1;
fi

# create the command line
#MPI_EXEC=mpirun
MPI_EXEC=mpiexec.hydra
EXEC_ARGS=""
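# Note: mpiexec.hydra accepts colon-separated MPMD argument sets, so EXEC_ARGS
# below is built as one ": ... -n <tasks> -host <node> <binary>" group per
# host node and per MIC.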

# create the list of the nodes that are configured to have MICs
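# ("%N %G" prints the nodelist with its generic resources; a node whose GRES
# string contains "mic" is taken to be one of the MIC-equipped hosts)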
LLIST_HOSTS_WITH_MICS="";
SLIST_HOSTS_WITH_MICS=`sinfo -h -o "%N %G" | grep mic | awk '{ print $1; }'`;
for host in `scontrol show hostname $SLIST_HOSTS_WITH_MICS` ; do
  LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS} ${host}";
done

# create the lists of HOSTS AND MICS!
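# (this assumes each coprocessor is reachable via the usual MPSS hostnames
# <host>-mic0 and <host>-mic1)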
HOST_NODELIST="";
MIC_NODELIST="";
for host in `scontrol show hostname $SLURM_NODELIST` ; do
  echo $LLIST_HOSTS_WITH_MICS | grep $host  &> /dev/null
  if [ $? -eq 0 ] ; then
    if [ $MIC_NUM_PER_HOST -eq 1 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0";
    elif [ $MIC_NUM_PER_HOST -eq 2 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0 ${host}-mic1";
    fi
  fi
  HOST_NODELIST="${HOST_NODELIST} ${host}";
done


# create the arguments
# args for hosts here
# run job on hosts if host binary is not null
if [[ -n "$HOST_BINARY" ]] ; then
  if [[ -n "$HOST_NODELIST" ]] ; then
    for n in $HOST_NODELIST ; do
      if [[ -n "$OMP_NUM_THREADS" ]] ; then
        # with OpenMP
        EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $OMP_NUM_THREADS $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY";
      else
        # without OpenMP
        EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY";
      fi
    done
  fi
fi
# args for mics here
# run job on mics if mic binary is not null and MIC_NUM_PER_HOST is 1 or 2
if [[ -n "$MIC_NODELIST" ]] ; then
  for n in $MIC_NODELIST ; do
    if [[ -n "$MIC_OMP_NUM_THREADS" ]] ; then
      # with OpenMP
      EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY";
      #EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY";
    else
      # NO OpenMP
      EXEC_ARGS="${EXEC_ARGS} : -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY";
      #EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY";
    fi
  done
fi

RUNCMD="$MPI_EXEC $EXEC_ARGS";

if [[ -n "$MPIEXEC_PREFIX" ]] ; then
  RUNCMD="$MPIEXEC_PREFIX $RUNCMD";
fi

# extra important env (local system dependent)
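# I_MPI_MIC=1 enables MIC targets in Intel MPI; the DAPL provider value is
# site-specific. I_MPI_PMI_LIBRARY is unset so that mpiexec.hydra uses its own
# process manager instead of Slurm's PMI library (which is only needed for srun).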
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export I_MPI_MIC=1
export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
unset I_MPI_DEVICE
unset I_MPI_PMI_LIBRARY

# start the job
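# Only the first Slurm task actually launches mpiexec: if this wrapper is
# started via srun with several tasks, all tasks with SLURM_PROCID != 0 skip
# the launch.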
if [ $SLURM_PROCID -eq 0 ] ; then

    if [[ -n "$MPIRUN_MIC_VERBOSE" ]] ; then
        echo
        echo "########################################################################"
        echo "MPI Tasks per host:          $HOST_PPN"
        echo "Threads per host MPI task:   $OMP_NUM_THREADS"
        echo "Binary for the hosts:        $HOST_BINARY"
        echo "MPI Tasks per MIC:           $MIC_PPN"
        echo "Threads per MIC MPI task:    $MIC_OMP_NUM_THREADS"
        echo "Binary for the mics:         $MIC_BINARY"
        echo "MIC_NUM_PER_HOST:            $MIC_NUM_PER_HOST"
        echo
        echo "MPIEXEC_PREFIX:              $MPIEXEC_PREFIX"
        echo "MPIEXEC_FLAGS_HOST:          $MPIEXEC_FLAGS_HOST"
        echo "MPIEXEC_FLAGS_MIC:           $MPIEXEC_FLAGS_MIC"
        echo ""
        echo "Run command: "
        echo "$RUNCMD"
        echo "########################################################################"
        echo
    fi

    $RUNCMD

fi

}}}

Best Regards,
Chrysovalantis Paschoulas

Juelich Supercomputing Centre
Forschungszentrum Juelich


