Source: openmpi
Version: 4.0.2-4
Severity: serious
Control: block 946422 with -1
Control: block 946582 with -1
Hi,
this bug was initially observed as a silx autopkgtest failure (#946422)
in pocl (#946582). I've reduced it to a python-free OpenCL example in C
and now I'm convinced that it is caused by OpenMPI 4.
Attached you can find a trivial OpenCL example (yes, that needs a bit of
boilerplate code) that is linked against MPI and calls MPI_Init().
This works fine with OpenMPI 3 in buster, with MPICH in sid, and without
any MPI at all, but fails with OpenMPI 4 in sid.
You need opencl-dev and libopenmpi-dev to build it and pocl-opencl-icd
to run it.
You build it with
mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL
and get this when running successfully:
# ./test_mpi_ocl
Success
but the failure with OpenMPI 4 is
# ./test_mpi_ocl
pocl error: lt_dlopen("(null)") or lt_dlsym() failed with 'can't close resident
module'.
note: missing symbols in the kernel binary might be reported as 'file not
found' errors.
Aborted
OpenMPI 4 seems to change some state causing subsequent lt_dlopen() to fail.
In gdb we have
(gdb) bt
#0 0x00007ffff7d10081 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff7cfb535 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007fffed2b66d1 in pocl_check_kernel_dlhandle_cache
(cmd=cmd@entry=0x55555562b730, initial_refcount=initial_refcount@entry=1) at
./lib/CL/devices/common.c:1097
#3 0x00007fffed2bc327 in pocl_pthread_prepare_kernel (cmd=0x55555562b730,
data=0x5555556cb5e0) at ./lib/CL/devices/pthread/pthread_scheduler.c:413
#4 pocl_pthread_exec_command (td=0x5555556cd200, cmd=0x55555562b730) at
./lib/CL/devices/pthread/pthread_scheduler.c:450
#5 pocl_pthread_driver_thread (p=<optimized out>) at
./lib/CL/devices/pthread/pthread_scheduler.c:496
#6 0x00007ffff79aafb7 in start_thread () from
/lib/x86_64-linux-gnu/libpthread.so.0
#7 0x00007ffff7dd02df in clone () from /lib/x86_64-linux-gnu/libc.so.6
The error stems from line 1062 of lib/CL/devices/common.c:
ci->dlhandle = lt_dlopen (module_fn);
where module_fn =
"//.cache/pocl/kcache/IL/PFLJNNHLKAHONADOJOEENLMDLFHDJKOMFJHEO/foo/1-1-1/foo.so"
OK, we can further minimize this testcase if we just take foo.so
(amd64 version attached) and lt_dlopen() it:
// mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl
#include <stdio.h>
#include <ltdl.h>
#include <mpi.h>
/*
 * Minimal reproducer: initialise MPI first, then libltdl, try to
 * lt_dlopen() ./foo.so and print the returned handle together with the
 * pending libltdl error string ("(null)" when there is none).
 */
int main(int argc, char **argv)
{
/* MPI is initialised before any libltdl call is made. */
MPI_Init(&argc, &argv);
lt_dlinit();
lt_dlhandle handle = lt_dlopen ("./foo.so");
/*
 * The error string is queried unconditionally, i.e. also when
 * lt_dlopen() returned a handle.
 * NOTE(review): the failing run in this report prints a non-NULL handle
 * together with an error message; confirm against the libltdl
 * documentation whether lt_dlerror() is meaningful after a successful
 * lt_dlopen().
 */
const char * dl_error = lt_dlerror ();
printf("%p %s\n", handle, dl_error ? dl_error : "(null)");
}
Without OpenMPI 4 we succeed:
# ./test_lt_dlopen
0x55c4accfc480 (null)
but with OpenMPI 4 we run into the same problem:
# ./test_lt_dlopen
0x559b5c923250 can't close resident module
Andreas
// install packages // apt-get install libopenmpi-dev opencl-dev pocl-opencl-icd // compile with // mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL // based on https://wiki.aalto.fi/display/HPEC/OpenCL+tutorial #define CL_TARGET_OPENCL_VERSION 100 // 0a_trivial.c #include <stdio.h> #include <stdlib.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <mpi.h> /* A kernel which does nothing */ const char * source_str = "__kernel void foo(void)" "{" "" "}"; int main(int argc, char** argv) { MPI_Init(&argc, &argv); /* Get platform and device information */ cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint num_devices; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices); /* Create an OpenCL context */ cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* Create a command queue */ cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create a program from the kernel source */ cl_program program = clCreateProgramWithSource(context, 1, &source_str, NULL, &ret); /* Build the program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create the OpenCL kernel */ cl_kernel kernel = clCreateKernel(program, "foo", &ret); /* Execute the OpenCL kernel */ size_t global_item_size = 1; size_t local_item_size = 1; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); ret = clFlush(command_queue); ret = clFinish(command_queue); if (ret == CL_SUCCESS) printf("Success\n"); else printf("OpenCL error executing kernel: %d\n", ret); /* Clean up */ ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
// mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl
#include <stdio.h>
#include <ltdl.h>
#include <mpi.h>
/*
 * Minimal reproducer: initialise MPI first, then libltdl, try to
 * lt_dlopen() ./foo.so and print the returned handle together with the
 * pending libltdl error string ("(null)" when there is none).
 */
int main(int argc, char **argv)
{
/* MPI is initialised before any libltdl call is made. */
MPI_Init(&argc, &argv);
lt_dlinit();
lt_dlhandle handle = lt_dlopen ("./foo.so");
/*
 * The error string is queried unconditionally, i.e. also when
 * lt_dlopen() returned a handle.
 * NOTE(review): confirm against the libltdl documentation whether
 * lt_dlerror() is meaningful after a successful lt_dlopen().
 */
const char * dl_error = lt_dlerror ();
printf("%p %s\n", handle, dl_error ? dl_error : "(null)");
}
foo.so
Description: application/sharedlib

