Hehe, I thought as much... :-)
Luckily I managed to avoid learning OpenCL one more time: I found multi_put()
and used it to implement zero-padding (my old problem from another thread).
If someone is interested, I attached a crude benchmark script. Gives me
some speedup over numpy when precomputing the indices.
# -*- coding: utf-8 -*-

import numpy as np, pyopencl as cl, pyopencl.array as cla
from time import time

# Pick the first GPU on the first platform and set up a context/queue on it.
device = cl.get_platforms()[0].get_devices(cl.device_type.GPU)[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)
# Memory pool so that repeated device allocations in the benchmark are cheap.
mem_pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(queue))

def zeropad(arr, newshape, indices):
    """Scatter ``arr`` into a fresh zero-filled device array of ``newshape``.

    ``indices`` holds, for each element of the flattened ``arr``, its flat
    destination index inside the output array; everything not written to
    stays zero (this is the zero-padding).
    """
    padded = cla.zeros(queue=arr.queue, shape=newshape,
                       dtype=arr.dtype, allocator=arr.allocator)
    cla.multi_put([arr], indices, out=[padded])
    return padded

# Benchmark: zero-pad an n x n array into the centre of an m x m array,
# on the device (zeropad/multi_put) vs. plain NumPy slice assignment on host.
seq = 2 ** np.arange(8, 12)  # m = 256, 512, 1024, 2048
res1 = []  # device timings, one entry per size
res2 = []  # NumPy timings, one entry per size
k = 100    # repetitions per size

# original array: n x n
# padded array: m x m
for m in seq:
    # Floor division keeps n and the pad offset integers; a bare '/' would
    # yield floats under Python 3 true division and break rand()/slicing.
    n = m // 2
    pad = (m - n) // 2  # width of the zero border on each side
    a = np.random.rand(n, n).astype(np.float32)
    ad = cla.to_device(queue, a, allocator=mem_pool)
    # Flat destination index of every source element in the m x m output:
    # column offset within a row + start of each row + top-left corner shift.
    indices = (np.tile(np.arange(pad, m - pad), n)
               + np.repeat(np.arange(n) * m, n)
               + pad * m)
    indices = cla.to_device(queue, indices.astype(np.int32), allocator=mem_pool)

    t0 = time()
    for i in range(k):
        bd = zeropad(ad, (m, m), indices)
    queue.finish()  # drain the queue so the timing covers the actual work
    res1 = np.append(res1, time() - t0)

    t0 = time()
    for i in range(k):
        b = np.zeros((m, m), a.dtype)
        b[pad:m - pad, pad:m - pad] = a
    res2 = np.append(res2, time() - t0)

    # Single %-formatted argument: prints identically as the Python 2 print
    # statement and the Python 3 print function.
    print('m: %4i n: %4i diff: %.1f speedup: %.1fx'
          % (m, n, np.sum(np.abs(b - bd.get())), res2[-1] / res1[-1]))
    mem_pool.free_held()
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Reply via email to