On 11/19/20 11:20 PM, Rebecca N. Palmer wrote: > This isn't testable in a qemu-armhf chroot, as pocl doesn't work there.
It works for me in a armhf pbuilder chroot driven via qemu-user-static on an amd64 host. Trying to reproduce on abel.d.o, a real armhf porterbox: * install in the chroot: pocl-opencl-icd libpocl2-dbgsym libllvm10-dbgsym libclblas2-dbgsym libgpuarray3-dbgsym libc6-dbg python3-nose python3-pygpu-dbg python3-scipy libclblas-dev gdb $ ulimit -c unlimited $ DEVICE=opencl0:0 python3.9 -m nose -v pygpu.tests.test_blas *** Testing for pthread-0x584 pygpu.tests.test_blas.test_dot(1, 'float32', True, True, True, False) ... Segmentation fault (core dumped) $ gdb /usr/bin/python3.9 core [...] bt #0 getEmissionKind () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/include/llvm/IR/DebugInfoMetadata.h:1244 #1 initialize () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LexicalScopes.cpp:53 #2 0xafa922f0 in computeIntervals () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:979 #3 runOnMachineFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:996 #4 runOnMachineFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:1023 #5 0xafb076c8 in runOnFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/MachineFunctionPass.cpp:73 #6 0xaf981494 in runOnFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1481 #7 0xaf981750 in runOnModule () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1517 #8 0xaf981ba8 in runOnModule () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1582 #9 run () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1694 #10 0xb54e4c82 in pocl_llvm_codegen (Device=Device@entry=0x1cdea20, Modp=0x4b88240, Output=Output@entry=0xbee61784, OutputSize=OutputSize@entry=0xbee61798) at ./lib/CL/pocl_llvm_wg.cc:624 #11 0xb54a91de in llvm_codegen (output=output@entry=0x4b7f530 "/home/anbe/.cache/pocl/kcache/LE/ELACJMEDJOLOPBPKFGKJKDCIPBPEMINDFHLHI/Sdot_kernel/0-0-0/Sdot_kernel.so", device_i=device_i@entry=0, kernel=kernel@entry=0xbee63158, device=0x1cdea20, command=command@entry=0xbee63190, specialize=specialize@entry=0) at ./lib/CL/devices/common.c:158 #12 0xb54aae44 in pocl_check_kernel_disk_cache (command=command@entry=0xbee63190, specialized=specialized@entry=0) at ./lib/CL/devices/common.c:958 #13 0xb54ab262 in pocl_check_kernel_dlhandle_cache (command=0xbee63190, initial_refcount=0, specialize=0) at ./lib/CL/devices/common.c:1081 #14 0xb54833d4 in program_compile_dynamic_wg_binaries (program=program@entry=0x460d1e0) at ./lib/CL/pocl_build.c:179 #15 0xb5493f20 in get_binary_sizes (sizes=0xbee63288, program=0x460d1e0) at ./lib/CL/clGetProgramInfo.c:36 #16 POclGetProgramInfo (program=0x460d1e0, param_name=4453, param_value_size=4, param_value=0xbee63288, param_value_size_ret=0x0) at ./lib/CL/clGetProgramInfo.c:115 #17 0xa1ea1722 in getSingleBinaryFromProgram (binary=std::vector of length 0, capacity 0, program=0x460d1e0) at ./src/library/blas/generic/binary_lookup.cc:392 #18 BinaryLookup::populateCache (this=this@entry=0xbee63308) at ./src/library/blas/generic/binary_lookup.cc:466 #19 0xa1e9d738 in makeKernelCached (device=0x1cdea20, context=0x1cdf230, sid=sid@entry=320, key=key@entry=0xbee6348c, kernelGenerator=kernelGenerator@entry=0xa1ec8ad9 <generator(char*, size_t, SubproblemDim const*, PGranularity const*, void*)>, dims=0x440e670, pgran=pgran@entry=0x440e6ac, extra=extra@entry=0xbee634d4, buildOpts=buildOpts@entry=0xbee6372c "-g -DINCX_NONUNITY -DINCY_NONUNITY", error=error@entry=0xbee63410) at ./src/library/blas/generic/common2.cc:90 #20 0xa1ea0662 in makeSolutionSeq (funcID=funcID@entry=CLBLAS_DOT, args=args@entry=0xbee639f0, numCommandQueues=numCommandQueues@entry=1, commandQueues=commandQueues@entry=0x1a2aee0, numEventsInWaitList=numEventsInWaitList@entry=0, eventWaitList=eventWaitList@entry=0x0, events=events@entry=0xbee63894, seq=seq@entry=0xbee63898) at ./src/library/blas/generic/solution_seq_make.c:587 #21 0xa1e8c9b6 in doDot (kargs=kargs@entry=0xbee639f0, N=1, dotProduct=<optimized out>, offDP=0, X=0x1d4c118, offx=1, incx=2, Y=0x43d7f90, offy=1, incy=2, scratchBuff=0x440e370, doConj=0, numCommandQueues=1, commandQueues=0x1a2aee0, numEventsInWaitList=0, eventWaitList=0x0, events=0xbee63b44) at ./src/library/blas/xdot.c:132 #22 0xa1e8cac8 in clblasSdot (N=<optimized out>, dotProduct=<optimized out>, offDP=<optimized out>, X=0x1d4c118, offx=1, incx=2, Y=0x43d7f90, offy=1, incy=2, scratchBuff=0x440e370, numCommandQueues=1, commandQueues=0x1a2aee0, numEventsInWaitList=0, eventWaitList=0x0, events=0xbee63b44) at ./src/library/blas/xdot.c:193 #23 0xb62314c2 in sdot (N=<optimized out>, X=0x1d3ce20, offX=1, incX=2, Y=0x1899b38, offY=1, incY=2, Z=0x4403428, offZ=0) at ./src/gpuarray_blas_opencl_clblas.c:212 #24 0xb621e25c in GpuArray_rdot (X=X@entry=0xa3510f34, Y=Y@entry=0xa3510ef4, Z=Z@entry=0xa3510f74, nocopy=nocopy@entry=0) at ./src/gpuarray_array_blas.c:77 #25 0xad10d7d4 in __pyx_f_5pygpu_4blas_pygpu_blas_rdot (__pyx_v_X=__pyx_v_X@entry=0xa3510f28, __pyx_v_Y=__pyx_v_Y@entry=0xa3510ee8, __pyx_v_Z=__pyx_v_Z@entry=0xa3510f68, __pyx_v_nocopy=__pyx_v_nocopy@entry=0) at pygpu/blas.c:1931 #26 0xad10ddb4 in __pyx_pf_5pygpu_4blas_dot (__pyx_self=<optimized out>, __pyx_v_overwrite_z=<optimized out>, __pyx_v_Z=0xa3510f68, __pyx_v_Y=<optimized out>, __pyx_v_X=<optimized out>) at pygpu/blas.c:2871 #27 __pyx_pw_5pygpu_4blas_1dot (__pyx_self=<optimized out>, __pyx_args=<optimized out>, __pyx_kwds=<optimized out>) at pygpu/blas.c:2757 #28 0x0009fff4 in cfunction_call (func=<built-in function dot>, args=<optimized out>, kwargs={'overwrite_z': True}) at ../Objects/methodobject.c:539 #29 0x00084ef8 in _PyObject_MakeTpCall (tstate=0x1890930, callable=<built-in function dot>, args=0xb55e34b4, nargs=<optimized out>, keywords=<optimized out>) at ../Objects/call.c:191 #30 0x0007e618 in _PyObject_VectorcallTstate (kwnames=('overwrite_z',), nargsf=<optimized out>, args=<optimized out>, callable=<built-in function dot>, tstate=0x1890930) at ../Include/cpython/abstract.h:116 #31 _PyObject_VectorcallTstate (kwnames=('overwrite_z',), nargsf=<optimized out>, args=<optimized out>, callable=<built-in function dot>, tstate=0x1890930) at ../Include/cpython/abstract.h:103 #32 PyObject_Vectorcall (kwnames=('overwrite_z',), nargsf=<optimized out>, args=<optimized out>, callable=<built-in function dot>) at ../Include/cpython/abstract.h:127 [...] $ POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1 POCL_DEBUG=all DEVICE=opencl0:0 python3.9 -m nose -v pygpu.tests.test_blas [...] [2020-10-21 01:44:25.555831328]POCL: in fn compile_and_link_program at line 570: | LLVM | building program with options -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -DINCX_NONUNITY -DINCY_NONUNITY [2020-10-21 01:44:25.555985087]POCL: in fn compile_and_link_program at line 634: | GENERAL | building from sources for device 0 [2020-10-21 01:44:25.556798560]POCL: in fn int pocl_llvm_build_program(cl_program, unsigned int, const char*, char*, cl_uint, _cl_program* const*, const char**, int) at line 365: | LLVM | all build options: -Dcl_khr_byte_addressable_store -Dcl_khr_global_int32_base_atomics -Dcl_khr_global_int32_extended_atomics -Dcl_khr_local_int32_base_atomics -Dcl_khr_local_int32_extended_atomics -Dcl_khr_3d_image_writes -Dcl_khr_fp16 -Dcl_khr_fp64 -cl-ext=-all,+cl_khr_byte_addressable_store,+cl_khr_global_int32_base_atomics,+cl_khr_global_int32_extended_atomics,+cl_khr_local_int32_base_atomics,+cl_khr_local_int32_extended_atomics,+cl_khr_3d_image_writes,+cl_khr_fp16,+cl_khr_fp64 -Dcl_khr_int64 -DPOCL_DEVICE_ADDRESS_BITS=32 -D__USE_CLANG_OPENCL_C_H -xcl -Dinline= -I. -cl-kernel-arg-info -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -DINCX_NONUNITY -DINCY_NONUNITY -D__ENDIAN_LITTLE__=1 -D__IMAGE_SUPPORT__=1 -DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=0 -D__OPENCL_VERSION__=120 -cl-std=CL1.2 -D__OPENCL_C_VERSION__=120 -fno-builtin -triple=armv7-unknown-linux-gnueabihf -target-cpu arm1156t2f-s [2020-10-21 01:44:27.973547965]POCL: in fn int pocl_llvm_build_program(cl_program, unsigned int, const char*, char*, cl_uint, _cl_program* const*, const char**, int) at line 598: | LLVM | Writing program.bc to /home/anbe/.cache/pocl/kcache/DJ/NEHIGCOLOHNAFHJKDGHCIKCLCFLEMKPKFDCKC/program.bc. [2020-10-21 01:44:27.997969828]POCL: in fn int pocl_llvm_get_kernels_metadata(cl_program, unsigned int) at line 525: | LLVM | Automatic local detected in kernel Sdot_kernel: Sdot_kernel.t309a [2020-10-21 01:44:29.034043005]POCL: in fn int pocl_llvm_generate_workgroup_function_nowrite(unsigned int, cl_device_id, cl_kernel, _cl_command_node*, void**, int) at line 439: | TIMING | >>> 1.030578520 s API: llvm_workgroup_ir_func_gen [2020-10-21 01:44:29.034120404]POCL: in fn llvm_codegen at line 135: | LLVM | Writing parallel.bc to /home/anbe/.cache/pocl/kcache/DJ/NEHIGCOLOHNAFHJKDGHCIKCLCFLEMKPKFDCKC/Sdot_kernel/0-0-0/parallel.bc. [2020-10-21 01:44:29.052400801]POCL: in fn int pocl_llvm_codegen(cl_device_id, void*, char**, uint64_t*) at line 620: | LLVM | Generating an object file directly. Segmentation fault (core dumped) Kernel source dump is attached. > Did you have libllvm10-dbgsym installed? If not, does installing that > give a more detailed backtrace? (I suspect an invalid 'this', given > that the crashing line accesses only a class member.) Yes, that's with debug symbols installed. Otherwise the backtrace ends immediately with "Backtrace stopped: previous frame identical to this frame (corrupt stack?)" Andreas
#ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif __kernel void Sdot_kernel( __global float *_X, __global float *_Y, __global float *scratchBuff, uint N, uint offx, int incx, uint offy, int incy, int doConj ) { __global float *X = _X + offx; __global float *Y = _Y + offy; float dotP = (float) 0.0; if ( incx < 0 ) { X = X + (N - 1) * abs(incx); } if ( incy < 0 ) { Y = Y + (N - 1) * abs(incy); } int gOffset; for( gOffset=(get_global_id(0) * 4); (gOffset + 4 - 1)<N; gOffset+=( get_global_size(0) * 4 ) ) { float4 vReg1, vReg2, res; #ifdef INCX_NONUNITY vReg1 = (float4)( (X + (gOffset*incx))[0 + ( incx * 0)], (X + (gOffset*incx))[0 + ( incx * 1)], (X + (gOffset*incx))[0 + ( incx * 2)], (X + (gOffset*incx))[0 + ( incx * 3)]); #else vReg1 = vload4( 0, (__global float *) (X + gOffset) ); #endif #ifdef INCY_NONUNITY vReg2 = (float4)( (Y + (gOffset*incy))[0 + ( incy * 0)], (Y + (gOffset*incy))[0 + ( incy * 1)], (Y + (gOffset*incy))[0 + ( incy * 2)], (Y + (gOffset*incy))[0 + ( incy * 3)]); #else vReg2 = vload4( 0, (__global float *) (Y + gOffset) ); #endif ; res = vReg1 * vReg2 ; dotP += res .S0 + res .S1 + res .S2 + res .S3; ; // Add-up elements in the vector to give a scalar } // Loop for the last thread to handle the tail part of the vector // Using the same gOffset used above for( ; gOffset<N; gOffset++ ) { float sReg1, sReg2, res; sReg1 = X[gOffset * incx]; sReg2 = Y[gOffset * incy]; ; res = sReg1 * sReg2 ; dotP = dotP + res ; } // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier // dotP of work-item 0 will have the final reduced item of the work-group __local float t309a [ 64 ]; uint AGnQN = get_local_id(0); t309a [ AGnQN ] = dotP ; barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN < 32 ) { t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 32 ]; } barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN < 16 ) { t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 16 ]; } barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN < 8 ) { t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 8 ]; } barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN < 4 ) { t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 4 ]; } barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN < 2 ) { t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 2 ]; } barrier(CLK_LOCAL_MEM_FENCE); if( AGnQN == 0 ) { dotP = t309a [0] + t309a [1]; } if( (get_local_id(0)) == 0 ) { scratchBuff[ get_group_id(0) ] = dotP; } }