Hello,

When running the attached program with 128 processes, I get the following errors on most runs:

_openmpi/1.8.4 (built with --enable-debug --enable-mem-debug) with NAG 6.0:_

opal_list_remove_item - the item 0x195d820 is not on the list 0x2b05da5101e0
connect/btl_openib_connect_udcm.c:2132: udcm_send_timeout: Assertion `((0xdeafbeedULL << 32) + 0xdeafbeedULL) == ((opal_object_t *) (msg))->obj_magic_id' failed.

_openmpi/1.8.1 with NAG 6.0:_

deadlock

_openmpi/1.8.4 with Intel15 (also with NAG 6.0):_

Program terminated with signal 11, Segmentation fault.
#0  0x000000000070a885 in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0  0x000000000070a885 in opal_memory_ptmalloc2_int_malloc ()
#1  0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2  0x00000000004afc29 in ompi_free_list_grow ()
#3  0x0000000000535fde in match_one ()
#4  0x0000000000533e16 in mca_pml_ob1_recv_frag_callback_match ()
#5  0x0000000000431e92 in btl_openib_handle_incoming ()
#6  0x0000000000431413 in btl_openib_component_progress ()
#7  0x00000000006862f6 in opal_progress ()
#8  0x000000000041f222 in ompi_request_default_test_any ()
#9  0x0000000000428ca9 in PMPI_Testany ()
#10 0x000000000040775f in get_free_send_request_handle ()
    at ***:34
#11 0x0000000000407819 in isend (send_buffer=0x7fffe9672c14, count=1, dest=70, tag=1)
    at ***:52
#12 0x0000000000407b82 in main () at ***:112

or

Program terminated with signal 7, Bus error.
#0  0x000000000070a89a in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0  0x000000000070a89a in opal_memory_ptmalloc2_int_malloc ()
#1  0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2  0x00000000004afc29 in ompi_free_list_grow ()
#3  0x0000000000535fde in match_one ()
#4  0x0000000000533e16 in mca_pml_ob1_recv_frag_callback_match ()
#5  0x0000000000431e92 in btl_openib_handle_incoming ()
#6  0x0000000000431413 in btl_openib_component_progress ()
#7  0x00000000006862f6 in opal_progress ()
#8  0x000000000041f222 in ompi_request_default_test_any ()
#9  0x0000000000428ca9 in PMPI_Testany ()
#10 0x000000000040775f in get_free_send_request_handle ()
    at ***:34
#11 0x0000000000407819 in isend (send_buffer=0x7fffabba7b64, count=1, dest=7, tag=1)
    at ***:52
#12 0x0000000000407b82 in main () at ***:112

or

Program terminated with signal 11, Segmentation fault.
#0  0x000000000070a951 in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0  0x000000000070a951 in opal_memory_ptmalloc2_int_malloc ()
#1  0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2  0x00002b9f7b05b248 in mlx4_create_ah (pd=0x1212680, attr=0xb26988) at src/verbs.c:728
#3  0x00002b9f70eafaed in __ibv_create_ah (pd=0xb26640, attr=0xb26988) at src/verbs.c:508
#4  0x000000000043c9df in udcm_module_start_connect ()
#5  0x0000000000445bcd in mca_btl_openib_endpoint_send ()
#6  0x0000000000546050 in mca_pml_ob1_send_request_start_copy ()
#7  0x00000000005310e4 in mca_pml_ob1_isend ()
#8  0x0000000000428689 in PMPI_Isend ()
#9 0x0000000000407876 in isend (send_buffer=0x7fff0e12cd94, count=1, dest=82, tag=0)
    at ***:54
#10 0x0000000000407aae in main () at ***:95

_openmpi/1.6.5 with NAG 5.3:_

no error

With fewer processes the program runs fine most of the time. It looks like an issue with request handling in Open MPI. Can someone confirm this?
(I am running my tests on an Intel Xeon cluster.)
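
For reference, this is roughly how I launch the reproducer (the executable name here is just a placeholder; batch-system specifics are omitted):

mpirun -np 128 ./a.out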

Thanks!
Moritz
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>

// send stuff
static MPI_Request * isend_requests = NULL;
static int num_isend_requests;

//taken from http://beige.ucs.indiana.edu/I590/node85.html
static void mpi_err_handler(int error_code) {

   if (error_code != MPI_SUCCESS) {

      char error_string[1024];
      int length_of_error_string, error_class;
      int global_rank; //!< rank of process in MPI_COMM_WORLD

      MPI_Comm_rank(MPI_COMM_WORLD, &global_rank);
      MPI_Error_class(error_code, &error_class);
      MPI_Error_string(error_class, error_string, &length_of_error_string);
      fprintf(stderr, "%3d: %s\n", global_rank, error_string);
      MPI_Error_string(error_code, error_string, &length_of_error_string);
      fprintf(stderr, "%3d: %s\n", global_rank, error_string);
      MPI_Abort(MPI_COMM_WORLD, error_code);
   }
}

static int get_free_send_request_handle () {

  int flag, idx;
  MPI_Status lstatus;

  // test whether any send request has been fulfilled
  mpi_err_handler(MPI_Testany(num_isend_requests, isend_requests, &idx, &flag,
                              &lstatus));
  if (flag && idx != MPI_UNDEFINED) return idx;

  // look for free send request
  for (int i = 0; i < num_isend_requests; i++)
    if (isend_requests[i] == MPI_REQUEST_NULL)
      return i;

  fputs("ERROR: Did not find free request handle in "
        "get_free_send_request_handle.\n", stderr);
  exit(EXIT_FAILURE);

  return -1;
}

static void isend(void const * send_buffer, int count, int dest, int tag) {

  int request_idx = get_free_send_request_handle();

  mpi_err_handler(MPI_Isend(send_buffer, count, MPI_INT, dest, tag,
                            MPI_COMM_WORLD, &isend_requests[request_idx]));
}

int main () {

  MPI_Init(NULL, NULL);

  int domain_id;
  int global_size, global_rank;

  int is_domain_root;
  int local_domain_size;
  int remote_domain_size;

  int dummy = -1;

  mpi_err_handler(MPI_Comm_rank(MPI_COMM_WORLD, &global_rank));
  mpi_err_handler(MPI_Comm_size(MPI_COMM_WORLD, &global_size));
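  // split MPI_COMM_WORLD into two domains: the lower half of the ranks forms
  // domain 0, the rest form domain 1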
  domain_id = global_rank >= (global_size / 2);
  local_domain_size = global_size / 2;
  remote_domain_size = global_size - local_domain_size;
  if (domain_id) {
    int temp = local_domain_size;
    local_domain_size = remote_domain_size;
    remote_domain_size = temp;
  }
  is_domain_root = (global_rank == 0) || (global_rank == global_size / 2);

  // request slots for isends: one per remote-domain rank, plus one per
  // local-domain rank if this rank is a domain root
  num_isend_requests = remote_domain_size;
  if (is_domain_root) num_isend_requests += local_domain_size;
  isend_requests = (MPI_Request *) malloc (num_isend_requests * sizeof(MPI_Request));
  for (int i = 0; i < num_isend_requests; i++) isend_requests[i] = MPI_REQUEST_NULL;

  {
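    // phase 1 (tag 0): each domain root sends one message to every rank of
    // its own domain (including itself); every rank then receives one
    // message from its domain root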
    int tag = 0;

    if (is_domain_root)
      // send a message to every process of the local domain (including self)
      for (int i = 0; i < local_domain_size; ++i)
         isend(&dummy, 1, i + ((domain_id)?(global_size/2):(0)), tag);

    // receive message from domain root
    mpi_err_handler(MPI_Recv(&dummy, 1, MPI_INT, MPI_ANY_SOURCE,
                             tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
  }

// #define NOBUG
#ifdef NOBUG
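  // a barrier between the two communication phases (enabled by defining NOBUG)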
  MPI_Barrier(MPI_COMM_WORLD);
#endif

  {
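    // phase 2 (tag 1): every rank sends one message to every rank of the
    // other domain and then receives one message from each of them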
    int tag = 1;

    // send a message to every process of the remote domain
    for (int i = 0; i < remote_domain_size; ++i)
      isend(&dummy, 1, i + ((domain_id)?(0):(global_size/2)), tag);

    // receive a message from every process of the remote domain
    for (int i = 0; i < remote_domain_size; ++i) {
      mpi_err_handler(MPI_Recv(&dummy, 1, MPI_INT, i + ((domain_id)?(0):(global_size/2)),
                                tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
    }
  }

  mpi_err_handler(MPI_Waitall(num_isend_requests, isend_requests,
                              MPI_STATUSES_IGNORE));
  free(isend_requests);

  MPI_Finalize();

  return EXIT_SUCCESS;
}
