Hehe, I thought as much... :-) Luckily I managed to avoid learning OpenCL yet again: I found multi_put() and implemented the zero-padding myself (my old problem from another thread). If anyone is interested, I have attached a crude benchmark script. It gives me some speedup over NumPy when the indices are precomputed.
# -*- coding: utf-8 -*-
import numpy as np, pyopencl as cl, pyopencl.array as cla from time import time device = cl.get_platforms()[0].get_devices(cl.device_type.GPU)[0] queue = cl.CommandQueue(cl.Context([device])) mem_pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(queue)) def zeropad(arr,newshape,indices): out = cla.zeros(queue=arr.queue,shape=newshape,dtype=arr.dtype,allocator=arr.allocator) cla.multi_put([arr],indices,out=[out]) return out seq = 2**np.arange(8,12) res1 = []; res2 = [] k = 100 # original array: n x n # padded array: m x m for m in seq: n = m/2 a = np.random.rand(n,n).astype(np.float32) ad = cla.to_device(queue, a, allocator=mem_pool) indices = np.tile( np.arange((m-n)/2,m-(m-n)/2), n ) + np.repeat( np.arange(n)*m, n ) + (m-n)/2*m indices = cla.to_device(queue, indices.astype(np.int32), allocator=mem_pool) t0 = time() for i in range(k): bd = zeropad(ad,(m,m),indices) queue.finish() res1 = np.append(res1,time()-t0) t0 = time() for i in range(k): b = np.zeros((m,m),a.dtype) b[(m-n)/2:m-(m-n)/2,(m-n)/2:m-(m-n)/2] = a res2 = np.append(res2,time()-t0) print 'm: %4i'%m,'n: %4i'%n,'diff: %.1f'%np.sum(np.abs(b-bd.get())),'speedup: %.1fx'%(res2[-1]/res1[-1]) mem_pool.free_held()
_______________________________________________ PyOpenCL mailing list [email protected] http://lists.tiker.net/listinfo/pyopencl
