Hello guys,
If it is not too much trouble, could someone with Mavericks run these
two scripts (with a GPU device) and tell me the output? I'm getting a
strange results on 10.8, and the expected behavior on Linux+Tesla, and
I've read that OpenCL in 10.9 got updated.
In case you are interested what they do:
- t_reduce.py runs a reduction of an array of 3 elements of a nested
struct dtype. For some reason, if I initialize the zero element as {0,
{0}, 0}, I get wrong results, and if I write 0 separately to each of
its fields, the results are correct.
- t_cbrng.py uses a counter-based RNG to generate 2M normally
distributed floats. If I compile it with '-cl-fast-relaxed-math'
option, the mean&std are correct (-2 and 10), and if I compile it with
default options, both mean and std are off.
Thank you in advance.
Best regards,
Bogdan
import numpy
import pyopencl as cl
import pyopencl.array as array
ctx = cl.create_some_context()
src = """
// taken from pyopencl._cluda
#define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// 'static' helps to avoid the "no previous prototype for function" warning
#if __OPENCL_VERSION__ >= 120
#define WITHIN_KERNEL static
#else
#define WITHIN_KERNEL
#endif
#define KERNEL __kernel
#define GLOBAL_MEM __global
#define LOCAL_MEM __local
#define LOCAL_MEM_DYNAMIC __local
#define LOCAL_MEM_ARG __local
#define INLINE inline
#define SIZE_T size_t
#define VSIZE_T size_t
// used to align fields in structures
#define ALIGN(bytes) __attribute__ ((aligned(bytes)))
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
#define COMPLEX_CTR(T) (T)
WITHIN_KERNEL VSIZE_T virtual_local_id(unsigned int dim)
{
if (dim == 0)
{
SIZE_T flat_id =
get_local_id(0) * 1 +
0;
return (flat_id / 1);
}
return 0;
}
WITHIN_KERNEL VSIZE_T virtual_local_size(unsigned int dim)
{
if (dim == 0)
{
return 256;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_group_id(unsigned int dim)
{
if (dim == 0)
{
SIZE_T flat_id =
get_group_id(0) * 1 +
0;
return (flat_id / 1);
}
return 0;
}
WITHIN_KERNEL VSIZE_T virtual_num_groups(unsigned int dim)
{
if (dim == 0)
{
return 40;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_global_id(unsigned int dim)
{
return virtual_local_id(dim) + virtual_group_id(dim) * virtual_local_size(dim);
}
WITHIN_KERNEL VSIZE_T virtual_global_size(unsigned int dim)
{
if(dim == 0)
{
return 10000;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_global_flat_id()
{
return
virtual_global_id(0) * 1 +
0;
}
WITHIN_KERNEL VSIZE_T virtual_global_flat_size()
{
return
virtual_global_size(0) *
1;
}
WITHIN_KERNEL bool virtual_skip_local_threads()
{
return false;
}
WITHIN_KERNEL bool virtual_skip_groups()
{
return false;
}
WITHIN_KERNEL bool virtual_skip_global_threads()
{
if (virtual_global_id(0) >= 10000)
return true;
return false;
}
#define VIRTUAL_SKIP_THREADS if(virtual_skip_local_threads() || virtual_skip_groups() || virtual_skip_global_threads()) return
typedef struct _module1__ {
unsigned long v[2];
} ALIGN(8) _module1_;
typedef struct _module2__ {
unsigned long v[4];
} ALIGN(8) _module2_;
#define _module0_KEY_WORDS 2
#define _module0_COUNTER_WORDS 4
#define _module0_Word unsigned long
#define _module0_Key _module1_
#define _module0_Counter _module2_
WITHIN_KERNEL _module2_ _module0_make_counter_from_int(int x)
{
_module2_ result;
result.v[0] = 0;
result.v[1] = 0;
result.v[2] = 0;
result.v[3] = x;
return result;
}
WITHIN_KERNEL INLINE unsigned long _module0_mulhilo(
unsigned long *hip, unsigned long a, unsigned long b)
{
#ifdef CUDA
*hip = __umul64hi(a, b);
#else
*hip = mul_hi(a, b);
#endif
return a * b;
}
WITHIN_KERNEL _module2_ _module0_bijection(
const _module1_ key, const _module2_ counter)
{
_module2_ X = counter;
unsigned long key0 = key.v[0];
unsigned long key1 = key.v[1];
unsigned long hi0, lo0, hi1, lo1;
// round 0
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 1
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 2
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 3
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 4
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 5
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 6
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 7
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 8
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
// bump key
key0 += 11400714819323198485;
key1 += 13503953896175478587;
// round 9
lo0 = _module0_mulhilo(&hi0, 15197193596820024467, X.v[0]);
lo1 = _module0_mulhilo(&hi1, 14581110107779764567, X.v[2]);
X.v[0] = hi1 ^ X.v[1] ^ key0;
X.v[1] = lo1;
X.v[2] = hi0 ^ X.v[3] ^ key1;
X.v[3] = lo0;
return X;
}
typedef unsigned int _module0_uint32;
typedef unsigned long _module0_uint64;
typedef struct _module0_
{
_module1_ key;
_module2_ counter;
union {
_module2_ buffer;
unsigned int buffer_uint32[8];
};
int buffer_uint32_cursor;
} _module0_State;
WITHIN_KERNEL void _module0_bump_counter(_module0_State *state)
{
state->counter.v[3] += 1;
if (state->counter.v[3] == 0)
{
state->counter.v[2] += 1;
if (state->counter.v[2] == 0)
{
state->counter.v[1] += 1;
if (state->counter.v[1] == 0)
{
state->counter.v[0] += 1;
}
}
}
}
WITHIN_KERNEL _module2_ _module0_get_next_unused_counter(_module0_State state)
{
if (state.buffer_uint32_cursor > 0)
{
_module0_bump_counter(&state);
}
return state.counter;
}
WITHIN_KERNEL void _module0_refill_buffer(_module0_State *state)
{
state->buffer = _module0_bijection(state->key, state->counter);
}
WITHIN_KERNEL _module0_State _module0_make_state(_module1_ key, _module2_ counter)
{
_module0_State state;
state.key = key;
state.counter = counter;
state.buffer_uint32_cursor = 0;
_module0_refill_buffer(&state);
return state;
}
WITHIN_KERNEL unsigned int _module0_get_raw_uint32(_module0_State *state)
{
if (state->buffer_uint32_cursor == 8)
{
_module0_bump_counter(state);
state->buffer_uint32_cursor = 0;
_module0_refill_buffer(state);
}
int cur = state->buffer_uint32_cursor;
state->buffer_uint32_cursor += 1;
return state->buffer_uint32[cur];
}
WITHIN_KERNEL unsigned long _module0_get_raw_uint64(_module0_State *state)
{
if (state->buffer_uint32_cursor >= 8 - 1)
{
_module0_bump_counter(state);
state->buffer_uint32_cursor = 0;
_module0_refill_buffer(state);
}
int cur = state->buffer_uint32_cursor;
state->buffer_uint32_cursor += 2;
return state->buffer.v[cur / 2];
}
WITHIN_KERNEL _module0_Key _module3_key_from_int(int idx)
{
_module0_Key result;
result.v[0] = 8618484947917433856
;
result.v[1] = 0
+ idx
;
return result;
}
#define _module5_Value float;
#define _module5_RANDOMS_PER_CALL 1
typedef struct
{
float v[1];
} _module5_Result;
WITHIN_KERNEL _module5_Result _module5_sample(_module0_State *state)
{
_module5_Result result;
float normalized = (float)_module0_get_raw_uint32(state) / 4294967296.0f;
result.v[0] = normalized * (1.0f) + (0.0f);
return result;
}
WITHIN_KERNEL float2 _module6_(float theta)
{
float2 res;
#ifdef CUDA
sincosf(theta, &(res.y), &(res.x));
#else
#ifdef COMPILE_FAST_MATH
res.x = native_cos(theta);
res.y = native_sin(theta);
#else
float tmp;
res.y = sincos(theta, &tmp);
res.x = tmp;
#endif
#endif
return res;
}
#define _module4_Value float;
#define _module4_RANDOMS_PER_CALL 2
typedef struct
{
float v[2];
} _module4_Result;
WITHIN_KERNEL _module4_Result _module4_sample(_module0_State *state)
{
_module4_Result result;
_module5_Result r1 = _module5_sample(state);
_module5_Result r2 = _module5_sample(state);
float u1 = r1.v[0];
float u2 = r2.v[0];
float ang = 6.2831854820251465f * u2;
float2 cos_sin = _module6_(ang);
float coeff = sqrt(-2.0f * log(u1)) * (10.0f);
result.v[0] = coeff * cos_sin.x + (-2.0f);
result.v[1] = coeff * cos_sin.y + (-2.0f);
return result;
}
KERNEL void test(GLOBAL_MEM float *dest, int ctr_start)
{
VIRTUAL_SKIP_THREADS;
const VSIZE_T idx = virtual_global_id(0);
_module0_Key key = _module3_key_from_int(idx);
_module0_Counter ctr = _module0_make_counter_from_int(ctr_start);
_module0_State st = _module0_make_state(key, ctr);
_module4_Result res;
for(int j = 0; j < 100; j++)
{
res = _module4_sample(&st);
dest[j * 20000 + 0 + idx] = res.v[0];
dest[j * 20000 + 10000 + idx] = res.v[1];
}
_module0_Counter next_ctr = _module0_get_next_unused_counter(st);
}
"""
queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, src).build(
#options=['-cl-fast-relaxed-math']
)
test = prg.test
local_size = (256,)
global_size = (10240,)
shape = (100, 2, 10000)
res_dev = array.to_device(queue, numpy.empty(shape, numpy.float32))
test(queue, global_size, local_size, res_dev.data, numpy.int32(0))
res = res_dev.get()
# Expected: ~-2, ~10
print(res.mean(), res.std())
import numpy
import pyopencl as cl
import pyopencl.array as array
ctx = cl.create_some_context()
src = """
// taken from pyopencl._cluda
#define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// 'static' helps to avoid the "no previous prototype for function" warning
#if __OPENCL_VERSION__ >= 120
#define WITHIN_KERNEL static
#else
#define WITHIN_KERNEL
#endif
#define KERNEL __kernel
#define GLOBAL_MEM __global
#define LOCAL_MEM __local
#define LOCAL_MEM_DYNAMIC __local
#define LOCAL_MEM_ARG __local
#define INLINE inline
#define SIZE_T size_t
#define VSIZE_T size_t
// used to align fields in structures
#define ALIGN(bytes) __attribute__ ((aligned(bytes)))
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
#define COMPLEX_CTR(T) (T)
WITHIN_KERNEL VSIZE_T virtual_local_id(unsigned int dim)
{
if (dim == 1)
{
SIZE_T flat_id =
get_local_id(0) * 1 +
0;
return (flat_id / 1);
}
if (dim == 0)
{
SIZE_T flat_id =
get_local_id(1) * 1 +
0;
return (flat_id / 1);
}
return 0;
}
WITHIN_KERNEL VSIZE_T virtual_local_size(unsigned int dim)
{
if (dim == 1)
{
return 4;
}
if (dim == 0)
{
return 1;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_group_id(unsigned int dim)
{
if (dim == 1)
{
SIZE_T flat_id =
get_group_id(0) * 1 +
0;
return (flat_id / 1) % 1;
}
if (dim == 0)
{
SIZE_T flat_id =
get_group_id(0) * 1 +
0;
return (flat_id / 1);
}
return 0;
}
WITHIN_KERNEL VSIZE_T virtual_num_groups(unsigned int dim)
{
if (dim == 1)
{
return 1;
}
if (dim == 0)
{
return 1;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_global_id(unsigned int dim)
{
return virtual_local_id(dim) + virtual_group_id(dim) * virtual_local_size(dim);
}
WITHIN_KERNEL VSIZE_T virtual_global_size(unsigned int dim)
{
if(dim == 1)
{
return 4;
}
if(dim == 0)
{
return 1;
}
return 1;
}
WITHIN_KERNEL VSIZE_T virtual_global_flat_id()
{
return
virtual_global_id(1) * 1 +
virtual_global_id(0) * 4 +
0;
}
WITHIN_KERNEL VSIZE_T virtual_global_flat_size()
{
return
virtual_global_size(1) *
virtual_global_size(0) *
1;
}
WITHIN_KERNEL bool virtual_skip_local_threads()
{
return false;
}
WITHIN_KERNEL bool virtual_skip_groups()
{
return false;
}
WITHIN_KERNEL bool virtual_skip_global_threads()
{
return false;
}
#define VIRTUAL_SKIP_THREADS if(virtual_skip_local_threads() || virtual_skip_groups() || virtual_skip_global_threads()) return
typedef struct _module1__ {
unsigned long v;
} ALIGN(8) _module1_;
typedef struct _module0__ {
unsigned int i1;
_module1_ nested;
unsigned int i2;
} ALIGN(8) _module0_;
// leaf input macro for "input"
#define _module3_(_idx0) (_leaf_input[_idx0 * 1])
INLINE WITHIN_KERNEL _module0_ _module2_func(
GLOBAL_MEM _module0_ *_leaf_input, VSIZE_T _c_idx0, VSIZE_T _c_idx1)
{
VSIZE_T _idx0 = _c_idx1 / 1;
return
_module3_(_idx0);
}
#define _module2_(_c_idx0, _c_idx1) _module2_func( _leaf_input, _c_idx0, _c_idx1)
// leaf output macro for "output"
#define _module5_(_val) _leaf_output[0] = (_val)
INLINE WITHIN_KERNEL void _module4_func(
GLOBAL_MEM _module0_ *_leaf_output, VSIZE_T _c_idx0, VSIZE_T _c_idx1, _module0_ _val)
{
_module5_(_val);
}
#define _module4_(_c_idx0, _c_idx1, _val) _module4_func( _leaf_output, _c_idx0, _c_idx1, _val)
INLINE WITHIN_KERNEL _module0_ reduction_op(_module0_ input1, _module0_ input2)
{
_module0_ result = input1;
result.i1 += input2.i1;
result.nested.v += input2.nested.v;
result.i2 += input2.i2;
return result;
}
KERNEL void _kernel_func(GLOBAL_MEM _module0_ *_leaf_output, GLOBAL_MEM _module0_ *_leaf_input)
{
VIRTUAL_SKIP_THREADS;
LOCAL_MEM unsigned int local_mem_i1[4];
LOCAL_MEM unsigned long local_mem_nested_v[4];
LOCAL_MEM unsigned int local_mem_i2[4];
const VSIZE_T tid = virtual_local_id(1);
const VSIZE_T bid = virtual_group_id(1);
const VSIZE_T part_num = virtual_global_id(0);
const VSIZE_T index_in_part = 4 * bid + tid;
_module0_ empty = {0, {0L}, 0};
//_module0_ empty;
//empty.i1 = 0;
//empty.nested.v = 0;
//empty.i2 = 0;
_module0_ v;
if(tid < 3)
{
const _module0_ t =
_module2_(
part_num, index_in_part + 0);
v = t;
}
else
{
v = empty;
}
local_mem_i1[tid] = v.i1;
local_mem_nested_v[tid] = v.nested.v;
local_mem_i2[tid] = v.i2;
LOCAL_BARRIER;
// We could use the volatile trick here and execute the last several iterations
// (that fit in a single warp) without LOCAL_BARRIERs, but it gives only
// a minor performance boost, and works only for some platforms (and only for simple types).
if(tid < 2)
{
_module0_ val1, val2;
val1.i1 = local_mem_i1[tid];
val2.i1 = local_mem_i1[tid + 2];
val1.nested.v = local_mem_nested_v[tid];
val2.nested.v = local_mem_nested_v[tid + 2];
val1.i2 = local_mem_i2[tid];
val2.i2 = local_mem_i2[tid + 2];
const _module0_ val = reduction_op(val1, val2);
local_mem_i1[tid] = val.i1;
local_mem_nested_v[tid] = val.nested.v;
local_mem_i2[tid] = val.i2;
}
LOCAL_BARRIER;
if(tid < 1)
{
_module0_ val1, val2;
val1.i1 = local_mem_i1[tid];
val2.i1 = local_mem_i1[tid + 1];
val1.nested.v = local_mem_nested_v[tid];
val2.nested.v = local_mem_nested_v[tid + 1];
val1.i2 = local_mem_i2[tid];
val2.i2 = local_mem_i2[tid + 1];
const _module0_ val = reduction_op(val1, val2);
local_mem_i1[tid] = val.i1;
local_mem_nested_v[tid] = val.nested.v;
local_mem_i2[tid] = val.i2;
}
LOCAL_BARRIER;
if (tid == 0)
{
v.i1 = local_mem_i1[0];
v.nested.v = local_mem_nested_v[0];
v.i2 = local_mem_i2[0];
_module4_(part_num, bid, v);
}
}
"""
shape = (3,)
dtype_nested = numpy.dtype(dict(
names=['v'],
formats=[numpy.uint64],
aligned=True))
dtype = numpy.dtype(dict(
names=['i1', 'nested', 'i2'],
formats=[numpy.uint32, dtype_nested, numpy.uint32],
offsets=[0, 8, 16],
itemsize=24,
aligned=True))
queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, src).build()
test = prg._kernel_func
n = numpy.empty(3, dtype)
n['i1'] = 1
n['nested']['v'] = 2
n['i2'] = 3
n_dev = array.to_device(queue, n)
res_dev = array.to_device(queue, numpy.zeros(1, dtype))
test(queue, (4,), (4,), res_dev.data, n_dev.data)
# Expected: (3, (6,), 9)
print(res_dev.get())
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl