On 11/19/20 11:20 PM, Rebecca N. Palmer wrote:
> This isn't testable in a qemu-armhf chroot, as pocl doesn't work there.

It works for me in a armhf pbuilder chroot driven via qemu-user-static 
on an amd64 host.

Trying to reproduce on abel.d.o, a real armhf porterbox:

* install in the chroot:
    pocl-opencl-icd libpocl2-dbgsym libllvm10-dbgsym libclblas2-dbgsym 
libgpuarray3-dbgsym libc6-dbg python3-nose python3-pygpu-dbg python3-scipy 
libclblas-dev gdb

$ ulimit -c unlimited

$ DEVICE=opencl0:0 python3.9 -m nose -v pygpu.tests.test_blas
*** Testing for pthread-0x584
pygpu.tests.test_blas.test_dot(1, 'float32', True, True, True, False) ... 
Segmentation fault (core dumped)

$ gdb /usr/bin/python3.9 core

[...]
bt
#0  getEmissionKind () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/include/llvm/IR/DebugInfoMetadata.h:1244
#1  initialize () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LexicalScopes.cpp:53
#2  0xafa922f0 in computeIntervals () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:979
#3  runOnMachineFunction () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:996
#4  runOnMachineFunction () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:1023
#5  0xafb076c8 in runOnFunction () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/MachineFunctionPass.cpp:73
#6  0xaf981494 in runOnFunction () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1481
#7  0xaf981750 in runOnModule () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1517
#8  0xaf981ba8 in runOnModule () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1582
#9  run () at 
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1694
#10 0xb54e4c82 in pocl_llvm_codegen (Device=Device@entry=0x1cdea20, 
Modp=0x4b88240, Output=Output@entry=0xbee61784, 
OutputSize=OutputSize@entry=0xbee61798) at ./lib/CL/pocl_llvm_wg.cc:624
#11 0xb54a91de in llvm_codegen (output=output@entry=0x4b7f530 
"/home/anbe/.cache/pocl/kcache/LE/ELACJMEDJOLOPBPKFGKJKDCIPBPEMINDFHLHI/Sdot_kernel/0-0-0/Sdot_kernel.so",
 device_i=device_i@entry=0, kernel=kernel@entry=0xbee63158, device=0x1cdea20, 
    command=command@entry=0xbee63190, specialize=specialize@entry=0) at 
./lib/CL/devices/common.c:158
#12 0xb54aae44 in pocl_check_kernel_disk_cache 
(command=command@entry=0xbee63190, specialized=specialized@entry=0) at 
./lib/CL/devices/common.c:958
#13 0xb54ab262 in pocl_check_kernel_dlhandle_cache (command=0xbee63190, 
initial_refcount=0, specialize=0) at ./lib/CL/devices/common.c:1081
#14 0xb54833d4 in program_compile_dynamic_wg_binaries 
(program=program@entry=0x460d1e0) at ./lib/CL/pocl_build.c:179
#15 0xb5493f20 in get_binary_sizes (sizes=0xbee63288, program=0x460d1e0) at 
./lib/CL/clGetProgramInfo.c:36
#16 POclGetProgramInfo (program=0x460d1e0, param_name=4453, param_value_size=4, 
param_value=0xbee63288, param_value_size_ret=0x0) at 
./lib/CL/clGetProgramInfo.c:115
#17 0xa1ea1722 in getSingleBinaryFromProgram (binary=std::vector of length 0, 
capacity 0, program=0x460d1e0) at 
./src/library/blas/generic/binary_lookup.cc:392
#18 BinaryLookup::populateCache (this=this@entry=0xbee63308) at 
./src/library/blas/generic/binary_lookup.cc:466
#19 0xa1e9d738 in makeKernelCached (device=0x1cdea20, context=0x1cdf230, 
sid=sid@entry=320, key=key@entry=0xbee6348c, 
kernelGenerator=kernelGenerator@entry=0xa1ec8ad9 <generator(char*, size_t, 
SubproblemDim const*, PGranularity const*, void*)>, 
    dims=0x440e670, pgran=pgran@entry=0x440e6ac, extra=extra@entry=0xbee634d4, 
buildOpts=buildOpts@entry=0xbee6372c "-g -DINCX_NONUNITY -DINCY_NONUNITY", 
error=error@entry=0xbee63410) at ./src/library/blas/generic/common2.cc:90
#20 0xa1ea0662 in makeSolutionSeq (funcID=funcID@entry=CLBLAS_DOT, 
args=args@entry=0xbee639f0, numCommandQueues=numCommandQueues@entry=1, 
commandQueues=commandQueues@entry=0x1a2aee0, 
numEventsInWaitList=numEventsInWaitList@entry=0, 
    eventWaitList=eventWaitList@entry=0x0, events=events@entry=0xbee63894, 
seq=seq@entry=0xbee63898) at ./src/library/blas/generic/solution_seq_make.c:587
#21 0xa1e8c9b6 in doDot (kargs=kargs@entry=0xbee639f0, N=1, 
dotProduct=<optimized out>, offDP=0, X=0x1d4c118, offx=1, incx=2, Y=0x43d7f90, 
offy=1, incy=2, scratchBuff=0x440e370, doConj=0, numCommandQueues=1, 
commandQueues=0x1a2aee0, numEventsInWaitList=0, 
    eventWaitList=0x0, events=0xbee63b44) at ./src/library/blas/xdot.c:132
#22 0xa1e8cac8 in clblasSdot (N=<optimized out>, dotProduct=<optimized out>, 
offDP=<optimized out>, X=0x1d4c118, offx=1, incx=2, Y=0x43d7f90, offy=1, 
incy=2, scratchBuff=0x440e370, numCommandQueues=1, commandQueues=0x1a2aee0, 
numEventsInWaitList=0, 
    eventWaitList=0x0, events=0xbee63b44) at ./src/library/blas/xdot.c:193
#23 0xb62314c2 in sdot (N=<optimized out>, X=0x1d3ce20, offX=1, incX=2, 
Y=0x1899b38, offY=1, incY=2, Z=0x4403428, offZ=0) at 
./src/gpuarray_blas_opencl_clblas.c:212
#24 0xb621e25c in GpuArray_rdot (X=X@entry=0xa3510f34, Y=Y@entry=0xa3510ef4, 
Z=Z@entry=0xa3510f74, nocopy=nocopy@entry=0) at ./src/gpuarray_array_blas.c:77
#25 0xad10d7d4 in __pyx_f_5pygpu_4blas_pygpu_blas_rdot 
(__pyx_v_X=__pyx_v_X@entry=0xa3510f28, __pyx_v_Y=__pyx_v_Y@entry=0xa3510ee8, 
__pyx_v_Z=__pyx_v_Z@entry=0xa3510f68, __pyx_v_nocopy=__pyx_v_nocopy@entry=0) at 
pygpu/blas.c:1931
#26 0xad10ddb4 in __pyx_pf_5pygpu_4blas_dot (__pyx_self=<optimized out>, 
__pyx_v_overwrite_z=<optimized out>, __pyx_v_Z=0xa3510f68, __pyx_v_Y=<optimized 
out>, __pyx_v_X=<optimized out>) at pygpu/blas.c:2871
#27 __pyx_pw_5pygpu_4blas_1dot (__pyx_self=<optimized out>, 
__pyx_args=<optimized out>, __pyx_kwds=<optimized out>) at pygpu/blas.c:2757
#28 0x0009fff4 in cfunction_call (func=<built-in function dot>, args=<optimized 
out>, kwargs={'overwrite_z': True}) at ../Objects/methodobject.c:539
#29 0x00084ef8 in _PyObject_MakeTpCall (tstate=0x1890930, callable=<built-in 
function dot>, args=0xb55e34b4, nargs=<optimized out>, keywords=<optimized 
out>) at ../Objects/call.c:191
#30 0x0007e618 in _PyObject_VectorcallTstate (kwnames=('overwrite_z',), 
nargsf=<optimized out>, args=<optimized out>, callable=<built-in function dot>, 
tstate=0x1890930) at ../Include/cpython/abstract.h:116
#31 _PyObject_VectorcallTstate (kwnames=('overwrite_z',), nargsf=<optimized 
out>, args=<optimized out>, callable=<built-in function dot>, tstate=0x1890930) 
at ../Include/cpython/abstract.h:103
#32 PyObject_Vectorcall (kwnames=('overwrite_z',), nargsf=<optimized out>, 
args=<optimized out>, callable=<built-in function dot>) at 
../Include/cpython/abstract.h:127
[...]

$ POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1 POCL_DEBUG=all DEVICE=opencl0:0 
python3.9 -m nose -v pygpu.tests.test_blas

[...]
[2020-10-21 01:44:25.555831328]POCL: in fn compile_and_link_program at line 570:
  |      LLVM |  building program with options -dwarf-column-info 
-debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -DINCX_NONUNITY 
-DINCY_NONUNITY
[2020-10-21 01:44:25.555985087]POCL: in fn compile_and_link_program at line 634:
  |   GENERAL |  building from sources for device 0
[2020-10-21 01:44:25.556798560]POCL: in fn int 
pocl_llvm_build_program(cl_program, unsigned int, const char*, char*, cl_uint, 
_cl_program* const*, const char**, int) at line 365:
  |      LLVM |  all build options: -Dcl_khr_byte_addressable_store 
-Dcl_khr_global_int32_base_atomics -Dcl_khr_global_int32_extended_atomics 
-Dcl_khr_local_int32_base_atomics -Dcl_khr_local_int32_extended_atomics 
-Dcl_khr_3d_image_writes -Dcl_khr_fp16 -Dcl_khr_fp64 
-cl-ext=-all,+cl_khr_byte_addressable_store,+cl_khr_global_int32_base_atomics,+cl_khr_global_int32_extended_atomics,+cl_khr_local_int32_base_atomics,+cl_khr_local_int32_extended_atomics,+cl_khr_3d_image_writes,+cl_khr_fp16,+cl_khr_fp64
 -Dcl_khr_int64 -DPOCL_DEVICE_ADDRESS_BITS=32 -D__USE_CLANG_OPENCL_C_H -xcl 
-Dinline= -I. -cl-kernel-arg-info -dwarf-column-info -debug-info-kind=limited 
-dwarf-version=4 -debugger-tuning=gdb -DINCX_NONUNITY -DINCY_NONUNITY 
-D__ENDIAN_LITTLE__=1 -D__IMAGE_SUPPORT__=1 
-DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=0 -D__OPENCL_VERSION__=120 -cl-std=CL1.2 
-D__OPENCL_C_VERSION__=120 -fno-builtin -triple=armv7-unknown-linux-gnueabihf 
-target-cpu arm1156t2f-s 
[2020-10-21 01:44:27.973547965]POCL: in fn int 
pocl_llvm_build_program(cl_program, unsigned int, const char*, char*, cl_uint, 
_cl_program* const*, const char**, int) at line 598:
  |      LLVM |  Writing program.bc to 
/home/anbe/.cache/pocl/kcache/DJ/NEHIGCOLOHNAFHJKDGHCIKCLCFLEMKPKFDCKC/program.bc.
[2020-10-21 01:44:27.997969828]POCL: in fn int 
pocl_llvm_get_kernels_metadata(cl_program, unsigned int) at line 525:
  |      LLVM |  Automatic local detected in kernel Sdot_kernel: 
Sdot_kernel.t309a
[2020-10-21 01:44:29.034043005]POCL: in fn int 
pocl_llvm_generate_workgroup_function_nowrite(unsigned int, cl_device_id, 
cl_kernel, _cl_command_node*, void**, int) at line 439:
  |    TIMING |       >>>         1.030578520  s    API: 
llvm_workgroup_ir_func_gen
[2020-10-21 01:44:29.034120404]POCL: in fn llvm_codegen at line 135:
  |      LLVM |  Writing parallel.bc to 
/home/anbe/.cache/pocl/kcache/DJ/NEHIGCOLOHNAFHJKDGHCIKCLCFLEMKPKFDCKC/Sdot_kernel/0-0-0/parallel.bc.
[2020-10-21 01:44:29.052400801]POCL: in fn int pocl_llvm_codegen(cl_device_id, 
void*, char**, uint64_t*) at line 620:
  |      LLVM |  Generating an object file directly.
Segmentation fault (core dumped)

Kernel source dump is attached.

> Did you have libllvm10-dbgsym installed?  If not, does installing that
> give a more detailed backtrace?  (I suspect an invalid 'this', given
> that the crashing line accesses only a class member.)

Yes, that's with debug symbols installed. Otherwise the backtrace ends
immediately with
"Backtrace stopped: previous frame identical to this frame (corrupt stack?)"

Andreas
#ifdef DOUBLE_PRECISION
    #ifdef cl_khr_fp64
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #else
    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
    #endif
#endif

__kernel void Sdot_kernel( __global float *_X, __global float *_Y, __global 
float *scratchBuff,
                                        uint N, uint offx, int incx, uint offy, 
int incy, int doConj )
{
    __global float *X = _X + offx;
    __global float *Y = _Y + offy;
    float dotP = (float) 0.0;

    if ( incx < 0 ) {
        X = X + (N - 1) * abs(incx);
    }
    if ( incy < 0 ) {
        Y = Y + (N - 1) * abs(incy);
    }

    int gOffset;
    for( gOffset=(get_global_id(0) * 4); (gOffset + 4 - 1)<N; gOffset+=( 
get_global_size(0) * 4 ) )
    {
        float4 vReg1, vReg2, res;

        #ifdef INCX_NONUNITY
             vReg1 = (float4)(  (X + (gOffset*incx))[0 + ( incx * 0)],  (X + 
(gOffset*incx))[0 + ( incx * 1)],  (X + (gOffset*incx))[0 + ( incx * 2)],  (X + 
(gOffset*incx))[0 + ( incx * 3)]);
        #else
            vReg1 = vload4(  0, (__global float *) (X + gOffset) );
        #endif

        #ifdef INCY_NONUNITY
             vReg2 = (float4)(  (Y + (gOffset*incy))[0 + ( incy * 0)],  (Y + 
(gOffset*incy))[0 + ( incy * 1)],  (Y + (gOffset*incy))[0 + ( incy * 2)],  (Y + 
(gOffset*incy))[0 + ( incy * 3)]);
        #else
            vReg2 = vload4(  0, (__global float *) (Y + gOffset) );
        #endif

        ;
         res =  vReg1 *  vReg2 ;
        dotP +=  res .S0 +  res .S1 +  res .S2 +  res .S3;
;          // Add-up elements in the vector to give a scalar
    }

    // Loop for the last thread to handle the tail part of the vector
    // Using the same gOffset used above
    for( ; gOffset<N; gOffset++ )
    {
        float sReg1, sReg2, res;
        sReg1 = X[gOffset * incx];
        sReg2 = Y[gOffset * incy];

        ;
             res =  sReg1 *  sReg2 ;
             dotP =  dotP +  res ;
        }

    // Note: this has to be called outside any if-conditions- because REDUCTION 
uses barrier
    // dotP of work-item 0 will have the final reduced item of the work-group
    __local float t309a [ 64 ];
        uint AGnQN = get_local_id(0);
         t309a [ AGnQN ] =  dotP ;
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN < 32 ) {
                 t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 32 ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN < 16 ) {
                 t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 16 ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN < 8 ) {
                 t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 8 ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN < 4 ) {
                 t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 4 ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN < 2 ) {
                 t309a [ AGnQN ] = t309a [ AGnQN ] + t309a [ AGnQN + 2 ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if( AGnQN == 0 ) {
         dotP  = t309a [0] + t309a [1];
        }

    if( (get_local_id(0)) == 0 ) {
        scratchBuff[ get_group_id(0) ] = dotP;
    }
}

Reply via email to