Hello,
When running the attached program with 128 processes I get the following
errors for most runs:
_openmpi/1.8.4 with --enable-debug --enable-mem-debug with NAG 6.0:_
opal_list_remove_item - the item 0x195d820 is not on the list 0x2b05da5101e0
connect/btl_openib_connect_udcm.c:2132: udcm_send_timeout: Assertion
`((0xdeafbeedULL << 32) + 0xdeafbeedULL) == ((opal_object_t *)
(msg))->obj_magic_id' failed.
_openmpi/1.8.1 with NAG 6.0:_
deadlock
_openmpi/1.8.4 with Intel15 (also with NAG 6.0):_
Program terminated with signal 11, Segmentation fault.
#0 0x000000000070a885 in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0 0x000000000070a885 in opal_memory_ptmalloc2_int_malloc ()
#1 0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2 0x00000000004afc29 in ompi_free_list_grow ()
#3 0x0000000000535fde in match_one ()
#4 0x0000000000533e16 in mca_pml_ob1_recv_frag_callback_match ()
#5 0x0000000000431e92 in btl_openib_handle_incoming ()
#6 0x0000000000431413 in btl_openib_component_progress ()
#7 0x00000000006862f6 in opal_progress ()
#8 0x000000000041f222 in ompi_request_default_test_any ()
#9 0x0000000000428ca9 in PMPI_Testany ()
#10 0x000000000040775f in get_free_send_request_handle ()
at ***:34
#11 0x0000000000407819 in isend (send_buffer=0x7fffe9672c14, count=1,
dest=70, tag=1)
at ***:52
#12 0x0000000000407b82 in main () at ***:112
or
Program terminated with signal 7, Bus error.
#0 0x000000000070a89a in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0 0x000000000070a89a in opal_memory_ptmalloc2_int_malloc ()
#1 0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2 0x00000000004afc29 in ompi_free_list_grow ()
#3 0x0000000000535fde in match_one ()
#4 0x0000000000533e16 in mca_pml_ob1_recv_frag_callback_match ()
#5 0x0000000000431e92 in btl_openib_handle_incoming ()
#6 0x0000000000431413 in btl_openib_component_progress ()
#7 0x00000000006862f6 in opal_progress ()
#8 0x000000000041f222 in ompi_request_default_test_any ()
#9 0x0000000000428ca9 in PMPI_Testany ()
#10 0x000000000040775f in get_free_send_request_handle ()
at ***:34
#11 0x0000000000407819 in isend (send_buffer=0x7fffabba7b64, count=1,
dest=7, tag=1)
at ***:52
#12 0x0000000000407b82 in main () at ***:112
or
Program terminated with signal 11, Segmentation fault.
#0 0x000000000070a951 in opal_memory_ptmalloc2_int_malloc ()
(gdb) bt
#0 0x000000000070a951 in opal_memory_ptmalloc2_int_malloc ()
#1 0x000000000070a05f in opal_memory_linux_memalign_hook ()
#2 0x00002b9f7b05b248 in mlx4_create_ah (pd=0x1212680, attr=0xb26988)
at src/verbs.c:728
#3 0x00002b9f70eafaed in __ibv_create_ah (pd=0xb26640, attr=0xb26988)
at src/verbs.c:508
#4 0x000000000043c9df in udcm_module_start_connect ()
#5 0x0000000000445bcd in mca_btl_openib_endpoint_send ()
#6 0x0000000000546050 in mca_pml_ob1_send_request_start_copy ()
#7 0x00000000005310e4 in mca_pml_ob1_isend ()
#8 0x0000000000428689 in PMPI_Isend ()
#9 0x0000000000407876 in isend (send_buffer=0x7fff0e12cd94, count=1,
dest=82, tag=0)
at ***:54
#10 0x0000000000407aae in main () at ***:95
_openmpi/1.6.5 with NAG 5.3:_
no error
With fewer processes the program runs fine most of the time. It looks
like an issue with requests in Open MPI. Can someone confirm this?
(I am running my tests on an Intel Xeon cluster.)
Thanks!
Moritz
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
// send stuff
// Pool of outstanding non-blocking send requests; allocated in main().
// A slot holding MPI_REQUEST_NULL is free and may be reused by isend().
static MPI_Request * isend_requests = NULL;
// Number of entries in isend_requests (remote domain size, plus the local
// domain size on the two domain-root ranks).
static int num_isend_requests;
//taken from http://beige.ucs.indiana.edu/I590/node85.html
/**
 * Checks an MPI return code. On success this is a no-op; on failure it
 * prints the error class and the specific error message to stderr
 * (prefixed with this process's world rank) and aborts the whole job.
 */
static void mpi_err_handler(int error_code) {
    if (error_code == MPI_SUCCESS) return;

    char msg[1024];
    int msg_len;
    int err_class;
    int rank; //!< rank of process in MPI_COMM_WORLD

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // report the generic error class first ...
    MPI_Error_class(error_code, &err_class);
    MPI_Error_string(err_class, msg, &msg_len);
    fprintf(stderr, "%3d: %s\n", rank, msg);

    // ... then the specific error code
    MPI_Error_string(error_code, msg, &msg_len);
    fprintf(stderr, "%3d: %s\n", rank, msg);

    MPI_Abort(MPI_COMM_WORLD, error_code);
}
/**
 * Returns the index of a free slot in isend_requests: either one whose
 * pending send has just completed (detected via MPI_Testany, which resets
 * the request to MPI_REQUEST_NULL) or one that is already MPI_REQUEST_NULL.
 * Aborts the process if no slot is available.
 */
static int get_free_send_request_handle () {
    int flag, idx;
    MPI_Status lstatus;
    // test whether any send request has been fulfilled
    mpi_err_handler(MPI_Testany(num_isend_requests, isend_requests, &idx, &flag,
                                &lstatus));
    // NOTE: flag is also set when ALL requests are inactive, in which case
    // idx == MPI_UNDEFINED — only a real index identifies a completed slot
    if (flag && idx != MPI_UNDEFINED) return idx;
    // look for a slot that was never started or has already been freed
    for (int i = 0; i < num_isend_requests; i++)
        if (isend_requests[i] == MPI_REQUEST_NULL)
            return i;
    // fixed: the message previously lacked a trailing newline
    fputs("ERROR: Did not find free request handle in "
          "get_free_send_request_handle.\n", stderr);
    exit(EXIT_FAILURE);
    // removed unreachable `return -1;` — exit() does not return
}
/**
 * Starts a non-blocking send of `count` MPI_INTs from send_buffer to rank
 * `dest` on MPI_COMM_WORLD with the given tag. The request handle is stored
 * in a free slot of the global isend_requests pool.
 */
static void isend(void const * send_buffer, int count, int dest, int tag) {
    int const slot = get_free_send_request_handle();
    mpi_err_handler(MPI_Isend(send_buffer, count, MPI_INT, dest, tag,
                              MPI_COMM_WORLD, isend_requests + slot));
}
/**
 * Reproducer: splits MPI_COMM_WORLD into two halves ("domains").
 * Phase 1 (tag 0): each domain root sends one message to every rank of its
 * own domain (including itself); every rank receives one such message.
 * Phase 2 (tag 1): every rank exchanges one message with every rank of the
 * other domain. All sends are non-blocking and completed by MPI_Waitall.
 */
int main () {
    MPI_Init (0, NULL);
    int domain_id;            // 0 = lower half of ranks, 1 = upper half
    int global_size, global_rank;
    int is_domain_root;       // nonzero on rank 0 and rank global_size/2
    int local_domain_size;    // number of ranks in this process's domain
    int remote_domain_size;   // number of ranks in the other domain
    // fixed: the original used ONE variable `dummy` both as the buffer of
    // still-pending MPI_Isend operations and as the MPI_Recv target. The
    // MPI standard forbids accessing/modifying a non-blocking send buffer
    // before the request completes, so separate buffers are required.
    int send_dummy = -1;      // payload; untouched until MPI_Waitall returns
    int recv_dummy = -1;      // receive buffer
    mpi_err_handler(MPI_Comm_rank(MPI_COMM_WORLD, &global_rank));
    mpi_err_handler(MPI_Comm_size(MPI_COMM_WORLD, &global_size));
    domain_id = global_rank >= (global_size / 2);
    local_domain_size = global_size / 2;
    remote_domain_size = global_size - local_domain_size;
    if (domain_id) {
        int temp = local_domain_size;
        local_domain_size = remote_domain_size;
        remote_domain_size = temp;
    }
    is_domain_root = (global_rank == 0) || (global_rank == global_size / 2);
    // request pool for isend(): one slot per remote rank, plus one per
    // local rank on the domain roots (they also send within their domain)
    num_isend_requests = remote_domain_size;
    if (is_domain_root) num_isend_requests += local_domain_size;
    // fixed: check the malloc result; dropped the unnecessary cast and
    // tied the size to the pointee type (sizeof *ptr)
    isend_requests = malloc (num_isend_requests * sizeof *isend_requests);
    if (isend_requests == NULL) {
        fputs("ERROR: Failed to allocate request handles.\n", stderr);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    for (int i = 0; i < num_isend_requests; i++) isend_requests[i] = MPI_REQUEST_NULL;
    {
        int tag = 0;
        if (is_domain_root)
            // send a message to every process of the local domain (including self)
            for (int i = 0; i < local_domain_size; ++i)
                isend(&send_dummy, 1, i + ((domain_id)?(global_size/2):(0)), tag);
        // receive the message from the domain root
        mpi_err_handler(MPI_Recv(&recv_dummy, 1, MPI_INT, MPI_ANY_SOURCE,
                                 tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
    }
    // #define NOBUG
#ifdef NOBUG
    MPI_Barrier(MPI_COMM_WORLD);
#endif
    {
        int tag = 1;
        // send a message to every process of the remote domain
        for (int i = 0; i < remote_domain_size; ++i)
            isend(&send_dummy, 1, i + ((domain_id)?(0):(global_size/2)), tag);
        // receive a message from every process of the remote domain
        for (int i = 0; i < remote_domain_size; ++i) {
            mpi_err_handler(MPI_Recv(&recv_dummy, 1, MPI_INT, i + ((domain_id)?(0):(global_size/2)),
                                     tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
        }
    }
    // complete all outstanding non-blocking sends before freeing the pool
    mpi_err_handler(MPI_Waitall(num_isend_requests, isend_requests,
                                MPI_STATUSES_IGNORE));
    free(isend_requests);
    MPI_Finalize();
    return EXIT_SUCCESS;
}