Hello guys,

If it is not too much trouble, could someone with Mavericks run these
two scripts (with a GPU device) and tell me the output? I'm getting a
strange results on 10.8, and the expected behavior on Linux+Tesla, and
I've read that OpenCL in 10.9 got updated.

In case you are interested what they do:
- t_reduce.py runs a reduction of an array of 3 elements of a nested
struct dtype. For some reason, if I initialize the zero element as {0,
{0}, 0}, I get wrong results, and if I write 0 separately to each of
its fields, the results are correct.
- t_cbrng.py uses a counter-based RNG to generate 2M normally
distributed floats. If I compile it with '-cl-fast-relaxed-math'
option, the mean&std are correct (-2 and 10), and if I compile it with
default options, both mean and std are off.

Thank you in advance.

Best regards,
Bogdan
import numpy
import pyopencl as cl
import pyopencl.array as array

ctx = cl.create_some_context()

src = """

    // taken from pyopencl._cluda
    #define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE)

    // 'static' helps to avoid the "no previous prototype for function" warning
    #if __OPENCL_VERSION__ >= 120
    #define WITHIN_KERNEL static
    #else
    #define WITHIN_KERNEL
    #endif

    #define KERNEL __kernel
    #define GLOBAL_MEM __global
    #define LOCAL_MEM __local
    #define LOCAL_MEM_DYNAMIC __local
    #define LOCAL_MEM_ARG __local
    #define INLINE inline
    #define SIZE_T size_t
    #define VSIZE_T size_t

    // used to align fields in structures
    #define ALIGN(bytes) __attribute__ ((aligned(bytes)))

    #if defined(cl_khr_fp64)
    #pragma OPENCL EXTENSION cl_khr_fp64: enable
    #elif defined(cl_amd_fp64)
    #pragma OPENCL EXTENSION cl_amd_fp64: enable
    #endif


    #define COMPLEX_CTR(T) (T)


WITHIN_KERNEL VSIZE_T virtual_local_id(unsigned int dim)
{
    if (dim == 0)
    {
        SIZE_T flat_id =
            get_local_id(0) * 1 +
            0;

        return (flat_id / 1);
    }

    return 0;
}

WITHIN_KERNEL VSIZE_T virtual_local_size(unsigned int dim)
{
    if (dim == 0)
    {
        return 256;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_group_id(unsigned int dim)
{
    if (dim == 0)
    {
        SIZE_T flat_id =
            get_group_id(0) * 1 +
            0;

        return (flat_id / 1);
    }

    return 0;
}

WITHIN_KERNEL VSIZE_T virtual_num_groups(unsigned int dim)
{
    if (dim == 0)
    {
        return 40;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_global_id(unsigned int dim)
{
    return virtual_local_id(dim) + virtual_group_id(dim) * virtual_local_size(dim);
}

WITHIN_KERNEL VSIZE_T virtual_global_size(unsigned int dim)
{
    if(dim == 0)
    {
        return 10000;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_global_flat_id()
{
    return
        virtual_global_id(0) * 1 +
        0;
}

WITHIN_KERNEL VSIZE_T virtual_global_flat_size()
{
    return
        virtual_global_size(0) *
        1;
}


WITHIN_KERNEL bool virtual_skip_local_threads()
{

    return false;
}

WITHIN_KERNEL bool virtual_skip_groups()
{

    return false;
}

WITHIN_KERNEL bool virtual_skip_global_threads()
{
    if (virtual_global_id(0) >= 10000)
        return true;

    return false;
}

#define VIRTUAL_SKIP_THREADS if(virtual_skip_local_threads() || virtual_skip_groups() || virtual_skip_global_threads()) return

typedef struct _module1__ {
    unsigned long  v[2];
} ALIGN(8) _module1_;



typedef struct _module2__ {
    unsigned long  v[4];
} ALIGN(8) _module2_;




#define _module0_KEY_WORDS 2
#define _module0_COUNTER_WORDS 4
#define _module0_Word unsigned long
#define _module0_Key _module1_
#define _module0_Counter _module2_

WITHIN_KERNEL _module2_ _module0_make_counter_from_int(int x)
{
    _module2_ result;
    result.v[0] = 0;
    result.v[1] = 0;
    result.v[2] = 0;
    result.v[3] = x;
    return result;
}


WITHIN_KERNEL INLINE unsigned long _module0_mulhilo(
    unsigned long *hip, unsigned long a, unsigned long b)
{
#ifdef CUDA
    *hip = __umul64hi(a, b);
#else
    *hip = mul_hi(a, b);
#endif
    return a * b;
}

WITHIN_KERNEL _module2_ _module0_bijection(
    const _module1_ key, const _module2_ counter)
{
    _module2_ X = counter;

    unsigned long key0 = key.v[0];
    unsigned long key1 = key.v[1];

    unsigned long hi0, lo0, hi1, lo1;

    // round 0

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 1

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 2

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 3

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 4

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 5

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 6

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 7

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 8

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;

    // bump key
    key0 += 11400714819323198485;
    key1 += 13503953896175478587;

    // round 9

        lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
        lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
        X.v[0] = hi1 ^ X.v[1] ^ key0;
        X.v[1] = lo1;
        X.v[2] = hi0 ^ X.v[3] ^ key1;
        X.v[3] = lo0;



    return X;
}




typedef unsigned int _module0_uint32;
typedef unsigned long _module0_uint64;

typedef struct _module0_
{
    _module1_ key;
    _module2_ counter;
    union {
        _module2_ buffer;
        unsigned int buffer_uint32[8];
    };
    int buffer_uint32_cursor;
} _module0_State;


WITHIN_KERNEL void _module0_bump_counter(_module0_State *state)
{
    state->counter.v[3] += 1;
    if (state->counter.v[3] == 0)
    {
    state->counter.v[2] += 1;
    if (state->counter.v[2] == 0)
    {
    state->counter.v[1] += 1;
    if (state->counter.v[1] == 0)
    {
    state->counter.v[0] += 1;
    }
    }
    }
}

WITHIN_KERNEL _module2_ _module0_get_next_unused_counter(_module0_State state)
{
    if (state.buffer_uint32_cursor > 0)
    {
        _module0_bump_counter(&state);
    }
    return state.counter;
}

WITHIN_KERNEL void _module0_refill_buffer(_module0_State *state)
{
    state->buffer = _module0_bijection(state->key, state->counter);
}

WITHIN_KERNEL _module0_State _module0_make_state(_module1_ key, _module2_ counter)
{
    _module0_State state;
    state.key = key;
    state.counter = counter;
    state.buffer_uint32_cursor = 0;
    _module0_refill_buffer(&state);
    return state;
}

WITHIN_KERNEL unsigned int _module0_get_raw_uint32(_module0_State *state)
{
    if (state->buffer_uint32_cursor == 8)
    {
        _module0_bump_counter(state);
        state->buffer_uint32_cursor = 0;
        _module0_refill_buffer(state);
    }

    int cur = state->buffer_uint32_cursor;
    state->buffer_uint32_cursor += 1;
    return state->buffer_uint32[cur];
}

WITHIN_KERNEL unsigned long _module0_get_raw_uint64(_module0_State *state)
{
    if (state->buffer_uint32_cursor >= 8 - 1)
    {
        _module0_bump_counter(state);
        state->buffer_uint32_cursor = 0;
        _module0_refill_buffer(state);
    }

    int cur = state->buffer_uint32_cursor;
    state->buffer_uint32_cursor += 2;
    return state->buffer.v[cur / 2];
}





            WITHIN_KERNEL _module0_Key _module3_key_from_int(int idx)
            {
                _module0_Key result;

                result.v[0] = 8618484947917433856
                ;
                result.v[1] = 0
                    + idx
                ;

                return result;
            }





#define _module5_Value float;
#define _module5_RANDOMS_PER_CALL 1

typedef struct
{
    float v[1];
} _module5_Result;


WITHIN_KERNEL _module5_Result _module5_sample(_module0_State *state)
{
    _module5_Result result;
    float normalized = (float)_module0_get_raw_uint32(state) / 4294967296.0f;
    result.v[0] = normalized * (1.0f) + (0.0f);
    return result;
}




WITHIN_KERNEL float2 _module6_(float theta)
{
    float2 res;

    #ifdef CUDA
        sincosf(theta, &(res.y), &(res.x));
    #else
    #ifdef COMPILE_FAST_MATH
        res.x = native_cos(theta);
        res.y = native_sin(theta);
    #else
        float tmp;
        res.y = sincos(theta, &tmp);
        res.x = tmp;
    #endif
    #endif

    return res;
}




#define _module4_Value float;
#define _module4_RANDOMS_PER_CALL 2

typedef struct
{
    float v[2];
} _module4_Result;


WITHIN_KERNEL _module4_Result _module4_sample(_module0_State *state)
{
    _module4_Result result;
    _module5_Result r1 = _module5_sample(state);
    _module5_Result r2 = _module5_sample(state);
    float u1 = r1.v[0];
    float u2 = r2.v[0];

    float ang = 6.2831854820251465f * u2;
    float2 cos_sin = _module6_(ang);
    float coeff = sqrt(-2.0f * log(u1)) * (10.0f);

    result.v[0] = coeff * cos_sin.x + (-2.0f);
    result.v[1] = coeff * cos_sin.y + (-2.0f);
    return result;
}



        KERNEL void test(GLOBAL_MEM float *dest, int ctr_start)
        {
            VIRTUAL_SKIP_THREADS;
            const VSIZE_T idx = virtual_global_id(0);

            _module0_Key key = _module3_key_from_int(idx);
            _module0_Counter ctr = _module0_make_counter_from_int(ctr_start);
            _module0_State st = _module0_make_state(key, ctr);

            _module4_Result res;

            for(int j = 0; j < 100; j++)
            {
                res = _module4_sample(&st);

                dest[j * 20000 + 0 + idx] = res.v[0];
                dest[j * 20000 + 10000 + idx] = res.v[1];
            }

            _module0_Counter next_ctr = _module0_get_next_unused_counter(st);
        }

"""

queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, src).build(
    #options=['-cl-fast-relaxed-math']
    )
test = prg.test

local_size = (256,)
global_size = (10240,)
shape = (100, 2, 10000)

res_dev = array.to_device(queue, numpy.empty(shape, numpy.float32))

test(queue, global_size, local_size, res_dev.data, numpy.int32(0))
res = res_dev.get()
# Expected: ~-2, ~10
print(res.mean(), res.std())
import numpy
import pyopencl as cl
import pyopencl.array as array

ctx = cl.create_some_context()

src = """
    // taken from pyopencl._cluda
    #define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE)

    // 'static' helps to avoid the "no previous prototype for function" warning
    #if __OPENCL_VERSION__ >= 120
    #define WITHIN_KERNEL static
    #else
    #define WITHIN_KERNEL
    #endif

    #define KERNEL __kernel
    #define GLOBAL_MEM __global
    #define LOCAL_MEM __local
    #define LOCAL_MEM_DYNAMIC __local
    #define LOCAL_MEM_ARG __local
    #define INLINE inline
    #define SIZE_T size_t
    #define VSIZE_T size_t

    // used to align fields in structures
    #define ALIGN(bytes) __attribute__ ((aligned(bytes)))

    #if defined(cl_khr_fp64)
    #pragma OPENCL EXTENSION cl_khr_fp64: enable
    #elif defined(cl_amd_fp64)
    #pragma OPENCL EXTENSION cl_amd_fp64: enable
    #endif


    #define COMPLEX_CTR(T) (T)


WITHIN_KERNEL VSIZE_T virtual_local_id(unsigned int dim)
{
    if (dim == 1)
    {
        SIZE_T flat_id =
            get_local_id(0) * 1 +
            0;

        return (flat_id / 1);
    }
    if (dim == 0)
    {
        SIZE_T flat_id =
            get_local_id(1) * 1 +
            0;

        return (flat_id / 1);
    }

    return 0;
}

WITHIN_KERNEL VSIZE_T virtual_local_size(unsigned int dim)
{
    if (dim == 1)
    {
        return 4;
    }
    if (dim == 0)
    {
        return 1;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_group_id(unsigned int dim)
{
    if (dim == 1)
    {
        SIZE_T flat_id =
            get_group_id(0) * 1 +
            0;

        return (flat_id / 1) % 1;
    }
    if (dim == 0)
    {
        SIZE_T flat_id =
            get_group_id(0) * 1 +
            0;

        return (flat_id / 1);
    }

    return 0;
}

WITHIN_KERNEL VSIZE_T virtual_num_groups(unsigned int dim)
{
    if (dim == 1)
    {
        return 1;
    }
    if (dim == 0)
    {
        return 1;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_global_id(unsigned int dim)
{
    return virtual_local_id(dim) + virtual_group_id(dim) * virtual_local_size(dim);
}

WITHIN_KERNEL VSIZE_T virtual_global_size(unsigned int dim)
{
    if(dim == 1)
    {
        return 4;
    }
    if(dim == 0)
    {
        return 1;
    }

    return 1;
}

WITHIN_KERNEL VSIZE_T virtual_global_flat_id()
{
    return
        virtual_global_id(1) * 1 +
        virtual_global_id(0) * 4 +
        0;
}

WITHIN_KERNEL VSIZE_T virtual_global_flat_size()
{
    return
        virtual_global_size(1) *
        virtual_global_size(0) *
        1;
}


WITHIN_KERNEL bool virtual_skip_local_threads()
{

    return false;
}

WITHIN_KERNEL bool virtual_skip_groups()
{

    return false;
}

WITHIN_KERNEL bool virtual_skip_global_threads()
{

    return false;
}

#define VIRTUAL_SKIP_THREADS if(virtual_skip_local_threads() || virtual_skip_groups() || virtual_skip_global_threads()) return

typedef struct _module1__ {
    unsigned long  v;
} ALIGN(8) _module1_;



typedef struct _module0__ {
    unsigned int  i1;
    _module1_  nested;
    unsigned int  i2;
} ALIGN(8) _module0_;




    // leaf input macro for "input"
    #define _module3_(_idx0) (_leaf_input[_idx0 * 1])






    INLINE WITHIN_KERNEL _module0_ _module2_func(
        GLOBAL_MEM _module0_ *_leaf_input, VSIZE_T _c_idx0, VSIZE_T _c_idx1)
    {





        VSIZE_T _idx0 = _c_idx1 / 1;



        return
        _module3_(_idx0);
    }

    #define _module2_(_c_idx0, _c_idx1) _module2_func(        _leaf_input, _c_idx0, _c_idx1)





    // leaf output macro for "output"
    #define _module5_(_val) _leaf_output[0] = (_val)






    INLINE WITHIN_KERNEL void _module4_func(
        GLOBAL_MEM _module0_ *_leaf_output, VSIZE_T _c_idx0, VSIZE_T _c_idx1, _module0_ _val)
    {







        _module5_(_val);
    }

    #define _module4_(_c_idx0, _c_idx1, _val) _module4_func(        _leaf_output, _c_idx0, _c_idx1, _val)







INLINE WITHIN_KERNEL _module0_ reduction_op(_module0_ input1, _module0_ input2)
{


            _module0_ result = input1;
            result.i1 += input2.i1;
            result.nested.v += input2.nested.v;
            result.i2 += input2.i2;
            return result;


}


KERNEL void _kernel_func(GLOBAL_MEM _module0_ *_leaf_output, GLOBAL_MEM _module0_ *_leaf_input)

{
    VIRTUAL_SKIP_THREADS;

    LOCAL_MEM unsigned int local_mem_i1[4];
    LOCAL_MEM unsigned long local_mem_nested_v[4];
    LOCAL_MEM unsigned int local_mem_i2[4];

    const VSIZE_T tid = virtual_local_id(1);
    const VSIZE_T bid = virtual_group_id(1);
    const VSIZE_T part_num = virtual_global_id(0);

    const VSIZE_T index_in_part = 4 * bid + tid;
    _module0_ empty = {0, {0L}, 0};
    //_module0_ empty;
    //empty.i1 = 0;
    //empty.nested.v = 0;
    //empty.i2 = 0;

    _module0_ v;

        if(tid < 3)
        {
            const _module0_ t =
                _module2_(
                    part_num, index_in_part + 0);
            v = t;
        }
        else
        {
            v = empty;
        }


    local_mem_i1[tid] = v.i1;
    local_mem_nested_v[tid] = v.nested.v;
    local_mem_i2[tid] = v.i2;
    LOCAL_BARRIER;

    // We could use the volatile trick here and execute the last several iterations
    // (that fit in a single warp) without LOCAL_BARRIERs, but it gives only
    // a minor performance boost, and works only for some platforms (and only for simple types).
        if(tid < 2)
        {
            _module0_ val1, val2;
            val1.i1 = local_mem_i1[tid];
            val2.i1 = local_mem_i1[tid + 2];
            val1.nested.v = local_mem_nested_v[tid];
            val2.nested.v = local_mem_nested_v[tid + 2];
            val1.i2 = local_mem_i2[tid];
            val2.i2 = local_mem_i2[tid + 2];
            const _module0_ val = reduction_op(val1, val2);

            local_mem_i1[tid] = val.i1;
            local_mem_nested_v[tid] = val.nested.v;
            local_mem_i2[tid] = val.i2;
        }
        LOCAL_BARRIER;
        if(tid < 1)
        {
            _module0_ val1, val2;
            val1.i1 = local_mem_i1[tid];
            val2.i1 = local_mem_i1[tid + 1];
            val1.nested.v = local_mem_nested_v[tid];
            val2.nested.v = local_mem_nested_v[tid + 1];
            val1.i2 = local_mem_i2[tid];
            val2.i2 = local_mem_i2[tid + 1];
            const _module0_ val = reduction_op(val1, val2);

            local_mem_i1[tid] = val.i1;
            local_mem_nested_v[tid] = val.nested.v;
            local_mem_i2[tid] = val.i2;
        }
        LOCAL_BARRIER;

    if (tid == 0)
    {
        v.i1 = local_mem_i1[0];
        v.nested.v = local_mem_nested_v[0];
        v.i2 = local_mem_i2[0];

        _module4_(part_num, bid, v);
    }
}

"""

shape = (3,)

dtype_nested = numpy.dtype(dict(
    names=['v'],
    formats=[numpy.uint64],
    aligned=True))
dtype = numpy.dtype(dict(
    names=['i1', 'nested', 'i2'],
    formats=[numpy.uint32, dtype_nested, numpy.uint32],
    offsets=[0, 8, 16],
    itemsize=24,
    aligned=True))

queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, src).build()
test = prg._kernel_func

n = numpy.empty(3, dtype)
n['i1'] = 1
n['nested']['v'] = 2
n['i2'] = 3
n_dev = array.to_device(queue, n)
res_dev = array.to_device(queue, numpy.zeros(1, dtype))

test(queue, (4,), (4,), res_dev.data, n_dev.data)
# Expected: (3, (6,), 9)
print(res_dev.get())
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Reply via email to