Hello. I have written the attached code, which calculates a parallel prefix sum. There are two variants: the first one (exclusive) is based on the article by Mark Harris from NVIDIA; the second one (inclusive) is based on a diagram from Wikipedia. The inclusive scan could be optimised with regard to shared-memory access conflicts, similarly to the exclusive version. The inclusive scan also seems to be less numerically stable — its results differ from the CPU version when calculating scans of large arrays.
I would like to put this code into PyCUDA, to serve similar purpose as reduction kernels - that's why I tried to keep similar API. Please give feedback. Regards. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
#! /usr/bin/python
import sys
import array
import math
import numpy
import pycuda
import pycuda.compiler
import pycuda.curandom
import pycuda.driver
import pycuda.gpuarray
import pycuda.tools
import pycuda.autoinit
# Exclusive version, after Mark Harris' article
def get_exclusive_prefix_module(data_type, sum_expression, neutral,
        name='prefix_kernel', optimise=False, keep=False, options=None,
        preamble=''):
    """
    Build a CUDA module containing an exclusive prefix-scan kernel.

    Based on the whitepaper ``Parallel Prefix Sum (Scan) with CUDA''
    by Mark Harris (NVIDIA).  Each block scans 2*blockDim.x elements in
    dynamically sized shared memory; the ``n`` passed to the kernel is
    expected to be a power of two (use the *_tail variant otherwise).

    :param data_type: C type name of the scanned elements, e.g. ``float``
    :param sum_expression: C expression combining operands ``a`` and ``b``
    :param neutral: C expression for the neutral element of the operation
    :param name: name given to the generated ``__global__`` function
    :param optimise: when True, pad shared-memory indices (one extra slot
        per 16 entries) to reduce bank conflicts
    :param keep: forwarded to SourceModule (keep compiler output files)
    :param options: extra nvcc options (list); ``None`` means no extras
    :param preamble: extra C code prepended to the kernel source
    :returns: compiled ``pycuda.compiler.SourceModule``
    """
    from pycuda.compiler import SourceModule
    # 'options=None' instead of 'options=[]' avoids the mutable default
    # argument pitfall (one list shared between all calls).
    if options is None:
        options = []
    if optimise:
        # One padding slot every 16 entries spreads accesses of a
        # half-warp over distinct shared-memory banks.
        preamble = preamble + "\n#define SHARED_BANK_PADDING(n) ((n) >> 4)\n"
    else:
        preamble = preamble + "\n#define SHARED_BANK_PADDING(n) 0\n"
    src = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n) {
    extern __shared__ %(data_type)s sharedArray[];
    const int blockOffset = 2*blockIdx.x*blockDim.x;
    int treeNodeDistance = 1;
    int sourceNodeIndex = threadIdx.x;
    int targetNodeIndex = threadIdx.x + (n/2);
    sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] =
        material[sourceNodeIndex + blockOffset];
    sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] =
        material[targetNodeIndex + blockOffset];
    // Travel upwards, from leaves to the root of the tree
    // During each loop time distance between nodes increases two times,
    // and number of nodes decreases two times
    for (int numberOfNodes = n>>1; numberOfNodes > 0; numberOfNodes >>= 1) {
        __syncthreads();
        if (threadIdx.x < numberOfNodes) {
            int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
            int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
            sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
            targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);
            sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                sharedArray[sourceNodeIndex]);
        }
        treeNodeDistance <<= 1;
    }
    if (threadIdx.x == 0) {
        sharedArray[n-1 + SHARED_BANK_PADDING(n-1)] = %(neutral)s;
    }
    // Travel downwards, from root to the leaves
    // During each loop number of nodes increases two times and
    // distance between nodes decreases two times
    for (int numberOfNodes = 1; numberOfNodes < n; numberOfNodes <<= 1) {
        __syncthreads();
        treeNodeDistance >>= 1;
        if (threadIdx.x < numberOfNodes) {
            int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
            int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
            sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
            targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);
            %(data_type)s temp = sharedArray[sourceNodeIndex];
            sharedArray[sourceNodeIndex] = sharedArray[targetNodeIndex];
            sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                temp);
        }
    }
    __syncthreads();
    result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex +
        SHARED_BANK_PADDING(sourceNodeIndex)];
    result[targetNodeIndex + blockOffset] = sharedArray[targetNodeIndex +
        SHARED_BANK_PADDING(targetNodeIndex)];
}
""" % {
        'data_type': data_type,
        'name': name,
        'neutral': neutral,
        'sum_expression': sum_expression,
        'preamble': preamble,
    }
    return SourceModule(src, options=options, keep=keep)
def get_exclusive_prefix_module_tail(data_type, sum_expression, neutral,
        name='prefix_kernel', optimise=False, keep=False, options=None,
        preamble=''):
    """
    Build a CUDA module with an exclusive prefix-scan kernel for tails.

    Variant of :func:`get_exclusive_prefix_module` (after Mark Harris'
    whitepaper ``Parallel Prefix Sum (Scan) with CUDA'') that can be used
    for arrays whose size is not a power of two: the kernel rounds the
    working size up to the next power of two and pads the shared array
    with the neutral element.  Expects shared memory to be at least as
    large as the padded input.

    :param data_type: C type name of the scanned elements
    :param sum_expression: C expression combining operands ``a`` and ``b``
    :param neutral: C expression for the neutral element of the operation
    :param name: name given to the generated ``__global__`` function
    :param optimise: when True, pad shared-memory indices to reduce bank
        conflicts
    :param keep: forwarded to SourceModule (keep compiler output files)
    :param options: extra nvcc options (list); ``None`` means no extras
    :param preamble: extra C code prepended to the kernel source
    :returns: compiled ``pycuda.compiler.SourceModule``
    """
    from pycuda.compiler import SourceModule
    # None default avoids sharing one mutable list between calls.
    if options is None:
        options = []
    if optimise:
        preamble = preamble + "\n#define SHARED_BANK_PADDING(n) ((n) >> 4)\n"
    else:
        preamble = preamble + "\n#define SHARED_BANK_PADDING(n) 0\n"
    src = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n, const int blockOffset) {
    extern __shared__ %(data_type)s sharedArray[];
    int treeNodeDistance = 1;
    // Size of shared array that can contain input data
    // The smallest array of size power-of-2 that can contain original array
    const int fullArraySize = 1<<((int)(floor(log2((float)n))+1));
    int sourceNodeIndex = threadIdx.x;
    int targetNodeIndex = threadIdx.x + (n/2);
    if (sourceNodeIndex < n) {
        sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] = material[sourceNodeIndex + blockOffset];
    } else {
        sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] = %(neutral)s;
    }
    if (targetNodeIndex < n) {
        sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] = material[targetNodeIndex + blockOffset];
    } else {
        sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] = %(neutral)s;
    }
    // Travel upwards, from leaves to the root of the tree
    // Each time distance between nodes increases two times,
    // and number of nodes decreases two times
    for (int numberOfNodes = fullArraySize>>1; numberOfNodes > 0;
            numberOfNodes >>= 1) {
        __syncthreads();
        if (threadIdx.x < numberOfNodes) {
            int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
            int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
            sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
            targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);
            sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                sharedArray[sourceNodeIndex]);
        }
        treeNodeDistance <<= 1;
    }
    if (threadIdx.x == 0) { sharedArray[fullArraySize-1 + SHARED_BANK_PADDING(fullArraySize-1)] = %(neutral)s; }
    // Travel downwards, from root to the leaves
    // During each loop number of nodes increases two times and
    // distance between nodes decreases two times
    for (int numberOfNodes = 1; numberOfNodes < fullArraySize; numberOfNodes <<= 1) {
        treeNodeDistance >>= 1;
        __syncthreads();
        if (threadIdx.x < numberOfNodes) {
            int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
            int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
            sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
            targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);
            %(data_type)s temp = sharedArray[sourceNodeIndex];
            sharedArray[sourceNodeIndex] = sharedArray[targetNodeIndex];
            sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                temp);
        }
    }
    __syncthreads();
    if (sourceNodeIndex < n) { result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)]; }
    if (targetNodeIndex < n) { result[targetNodeIndex + blockOffset] = sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)]; }
}
""" % {
        'data_type': data_type,
        'name': name,
        'neutral': neutral,
        'sum_expression': sum_expression,
        'preamble': preamble,
    }
    return SourceModule(src, options=options, keep=keep)
class ExclusivePrefixKernel:
    """
    Exclusive prefix scan over a GPUArray, Blelloch-style.

    The constructor compiles three kernels:

    * a main kernel that scans full power-of-two sized chunks
      (2 * block_size elements per block),
    * a tail kernel that handles a final chunk whose size is not a
      power of two (padding with the neutral element),
    * a finish kernel that folds per-block partial results into the
      final answer.

    Instances are called as ``scan(result_gpu, material_gpu, size)``.
    """

    def __init__(self, data_type, sum_expression, neutral,
            name='prefix_kernel', keep=False, options=None, preamble=''):
        """
        :param data_type: C type name of the scanned elements
        :param sum_expression: C expression combining ``a`` and ``b``
        :param neutral: C expression for the neutral element
        :param name: name of the generated kernels
        :param keep: forwarded to SourceModule
        :param options: extra nvcc options; ``None`` means no extras
        :param preamble: extra C code prepended to the kernels
        """
        # None default avoids the mutable-default-argument pitfall.
        if options is None:
            options = []
        # Determine the numpy dtype and size of the used data type
        self.numpy_type = pycuda.tools.parse_c_arg("const %s * in" %
            data_type).dtype
        item_size = self.numpy_type.itemsize
        # Determine the number of threads per block
        dev = pycuda.driver.Context.get_device()
        block_size = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
        block_dimension = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
        self.block_size = min(block_size, block_dimension)
        # Shared memory size: two items per thread
        self.shared_size = self.block_size * item_size * 2
        # Extra room for bank-conflict padding: one slot per
        # 2**log_num_banks == 16 entries
        log_num_banks = 4
        self.shared_size += ((self.block_size >> log_num_banks) *
            item_size * 2)
        # Check whether we do not exceed available shared memory size
        max_shared_size = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
        if self.shared_size > max_shared_size:
            print("Changed shared size")
            self.shared_size = max_shared_size
            # Integer division: '/' would yield a float under Python 3
            # and break the block=(...) tuples below.
            self.block_size = self.shared_size // (2 * item_size)
        main_function = get_exclusive_prefix_module(data_type,
            sum_expression, neutral, name, True, keep, options, preamble)
        self.main_function = main_function.get_function(name)
        self.main_function.prepare("PPi", block=(self.block_size, 1, 1),
            shared=self.shared_size)
        tail_function = get_exclusive_prefix_module_tail(data_type,
            sum_expression, neutral, name, True, keep, options, preamble)
        self.tail_function = tail_function.get_function(name)
        self.tail_function.prepare("PPii", block=(self.block_size, 1, 1),
            shared=self.shared_size)
        from pycuda.compiler import SourceModule
        finish_source = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void finish(%(data_type)s *result,
        const %(data_type)s *material, const %(data_type)s *partial,
        const int n) {
    const int x = 2*blockIdx.x*blockDim.x+threadIdx.x;
    const int y = 2*blockIdx.x*blockDim.x+threadIdx.x+blockDim.x;
    if (x < n) {
        for (int i = 1; i <= blockIdx.x; i++) {
            // Program produces exclusive scan
            // We need to add last element from previous block to get correct sum
            // It calculates ``inclusive'' sum for internal purposes
            result[x] = SUM(result[x], material[2*i*blockDim.x-1]);
            result[x] = SUM(result[x], partial[i-1]);
            if (y < n) {
                result[y] = SUM(result[y], material[2*i*blockDim.x-1]);
                result[y] = SUM(result[y], partial[i-1]);
            }
        }
    }
}
""" % {
            'data_type': data_type,
            'sum_expression': sum_expression,
            'preamble': preamble,
        }
        self.finish_function = SourceModule(finish_source, options=options,
            keep=keep).get_function('finish')
        self.finish_function.prepare("PPPi", block=(self.block_size, 1, 1))
        # Use maximum available shared memory in 2.x devices
        # TODO: is it needed as we are more limited by number of threads?
        # Might be needed for large data types (?)
        if dev.compute_capability() >= (2, 0):
            cache_config = pycuda.driver.func_cache.PREFER_SHARED
            self.main_function.set_cache_config(cache_config)
            self.tail_function.set_cache_config(cache_config)

    def __call__(self, *args, **kwargs):
        """
        Scan ``material`` into ``result``.

        :param args[0]: result GPUArray (written)
        :param args[1]: material GPUArray (read)
        :param args[2]: number of elements to scan
        """
        result = args[0]
        material = args[1]
        inputSize = args[2]
        # Each block processes 2 * block_size elements
        block_count = ((inputSize + 2 * self.block_size - 1) //
            (2 * self.block_size))
        if inputSize != block_count * self.block_size * 2:
            # Last chunk is not full: scan the full blocks (if any) with
            # the main kernel; the tail kernel pads the remainder with
            # the neutral element.
            if block_count > 1:
                self.main_function.prepared_call((block_count - 1, 1),
                    result.gpudata, material.gpudata,
                    2 * self.block_size)
            self.tail_function.prepared_call((1, 1), result.gpudata,
                material.gpudata,
                inputSize - (block_count - 1) * self.block_size * 2,
                (block_count - 1) * self.block_size * 2)
        else:
            self.main_function.prepared_call((block_count, 1),
                result.gpudata, material.gpudata,
                2 * self.block_size)
        blockSumArrayGPU = pycuda.gpuarray.zeros([block_count],
            self.numpy_type)
        # Allocate with the right dtype directly instead of
        # zeros(...).astype(...), which builds a float64 array first.
        blockSumArray = numpy.zeros(block_count, dtype=self.numpy_type)
        # Get final elements from each of blocks into table
        # Sum of the last block is not needed
        # NOTE(review): this does one small device->host copy per block;
        # for many blocks a single strided copy would be cheaper.
        for i in range(block_count - 1):
            index = 2 * (i + 1) * self.block_size - 1
            blockSumArray[i] = result[index:index + 1].get()[0]
        blockSumArrayGPU.set(blockSumArray)
        # TODO: is it possible use GPUArray per-element kernel here?
        # Rather not (we are adding different number of values for each
        # slice of the array, but maybe I am not seeing something)
        self.finish_function.prepared_call((block_count, 1),
            result.gpudata, material.gpudata,
            blockSumArrayGPU.gpudata, inputSize)
# Inclusive version, after http://en.wikipedia.org/wiki/Prefix_sum
# This function does not use "neutral" but I have left it to maintain the same
# list of arguments as other functions
def get_inclusive_prefix_module(data_type, sum_expression, neutral,
        name='prefix_kernel', keep=False, options=None, preamble=''):
    """
    Build a CUDA module with an inclusive prefix-scan kernel.

    Based on the Wikipedia ``Prefix sum'' article (Hillis-Steele style
    scan: the node distance doubles each iteration).  One element per
    thread is kept in shared memory.  ``neutral`` is not used by this
    kernel; the parameter is kept so all builders share one signature.

    :param data_type: C type name of the scanned elements
    :param sum_expression: C expression combining ``a`` and ``b``
    :param neutral: unused, kept for signature compatibility
    :param name: name given to the generated ``__global__`` function
    :param keep: forwarded to SourceModule
    :param options: extra nvcc options; ``None`` means no extras
    :param preamble: extra C code prepended to the kernel source
    :returns: compiled ``pycuda.compiler.SourceModule``
    """
    from pycuda.compiler import SourceModule
    # None default avoids sharing one mutable list between calls.
    if options is None:
        options = []
    src = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n) {
    extern __shared__ %(data_type)s sharedArray[];
    const int blockOffset = blockIdx.x*blockDim.x;
    int sourceNodeIndex = threadIdx.x;
    sharedArray[sourceNodeIndex] = material[sourceNodeIndex + blockOffset];
    // During each loop time distance between nodes increases two times,
    // and number of nodes decreases two times
    for (int treeNodeDistance = 1; treeNodeDistance < n; treeNodeDistance <<= 1) {
        // Ensure that all values are in an array
        __syncthreads();
        int targetNodeIndex = sourceNodeIndex + treeNodeDistance;
        %(data_type)s partial;
        if (targetNodeIndex < n) {
            partial = SUM(sharedArray[targetNodeIndex],
                sharedArray[sourceNodeIndex]);
        }
        // Ensure that all sums are in registers
        __syncthreads();
        // Write to an array
        if (targetNodeIndex < n) {
            sharedArray[targetNodeIndex] = partial;
        }
    }
    __syncthreads();
    result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex];
}
""" % {
        'data_type': data_type,
        'name': name,
        'sum_expression': sum_expression,
        'preamble': preamble,
    }
    return SourceModule(src, options=options, keep=keep)
def get_inclusive_prefix_module_tail(data_type, sum_expression, neutral,
        name='prefix_kernel', keep=False, options=None, preamble=''):
    """
    Build a CUDA module with an inclusive prefix-scan kernel for tails.

    Variant of :func:`get_inclusive_prefix_module` (after the Wikipedia
    ``Prefix sum'' article) that guards loads and stores with ``n`` and
    pads out-of-range shared-memory slots with the neutral element, so it
    can scan a final chunk smaller than the block size.

    :param data_type: C type name of the scanned elements
    :param sum_expression: C expression combining ``a`` and ``b``
    :param neutral: C expression for the neutral element (used as padding)
    :param name: name given to the generated ``__global__`` function
    :param keep: forwarded to SourceModule
    :param options: extra nvcc options; ``None`` means no extras
    :param preamble: extra C code prepended to the kernel source
    :returns: compiled ``pycuda.compiler.SourceModule``
    """
    from pycuda.compiler import SourceModule
    # None default avoids sharing one mutable list between calls.
    if options is None:
        options = []
    src = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n, const int blockOffset) {
    extern __shared__ %(data_type)s sharedArray[];
    int sourceNodeIndex = threadIdx.x;
    if (sourceNodeIndex < n) {
        sharedArray[sourceNodeIndex] = material[sourceNodeIndex + blockOffset];
    } else {
        sharedArray[sourceNodeIndex] = %(neutral)s;
    }
    // During each loop time distance between nodes increases two times,
    // and number of nodes decreases two times
    for (int treeNodeDistance = 1; treeNodeDistance < n; treeNodeDistance <<= 1) {
        // Ensure that all values are in an array
        __syncthreads();
        int targetNodeIndex = sourceNodeIndex + treeNodeDistance;
        %(data_type)s partial;
        if (targetNodeIndex < n) {
            partial = SUM(sharedArray[targetNodeIndex],
                sharedArray[sourceNodeIndex]);
        }
        // Ensure that all sums are in registers
        __syncthreads();
        // Write to an array
        if (targetNodeIndex < n) {
            sharedArray[targetNodeIndex] = partial;
        }
    }
    __syncthreads();
    if (sourceNodeIndex < n) {
        result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex];
    }
}
""" % {
        'data_type': data_type,
        'name': name,
        'sum_expression': sum_expression,
        'neutral': neutral,
        'preamble': preamble,
    }
    return SourceModule(src, options=options, keep=keep)
class InclusivePrefixKernel:
    """
    Inclusive prefix scan over a GPUArray (Hillis-Steele style).

    The constructor compiles three kernels:

    * a main kernel scanning full blocks (one element per thread),
    * a tail kernel for a final chunk smaller than the block size,
    * a finish kernel folding per-block partial results into the answer.

    Instances are called as ``scan(result_gpu, material_gpu, size)``.
    """

    def __init__(self, data_type, sum_expression, neutral,
            name='prefix_kernel', keep=False, options=None, preamble=''):
        """
        :param data_type: C type name of the scanned elements
        :param sum_expression: C expression combining ``a`` and ``b``
        :param neutral: C expression for the neutral element
        :param name: name of the generated kernels
        :param keep: forwarded to SourceModule
        :param options: extra nvcc options; ``None`` means no extras
        :param preamble: extra C code prepended to the kernels
        """
        # None default avoids the mutable-default-argument pitfall.
        if options is None:
            options = []
        # Determine the numpy dtype and size of the used data type
        self.numpy_type = pycuda.tools.parse_c_arg("const %s * in" %
            data_type).dtype
        item_size = self.numpy_type.itemsize
        # Determine the number of threads per block
        dev = pycuda.driver.Context.get_device()
        block_size = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
        block_dimension = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
        self.block_size = min(block_size, block_dimension)
        # Shared memory size: the inclusive kernel keeps ONE item per
        # thread (unlike the exclusive variant which keeps two).
        self.shared_size = self.block_size * item_size
        # Check whether we do not exceed available shared memory size
        max_shared_size = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
        if self.shared_size > max_shared_size:
            print("Changed shared size")
            self.shared_size = max_shared_size
            # Integer division ('//'): '/' would yield a float under
            # Python 3 and break the block=(...) tuples below.  One item
            # per thread, so divide by item_size (the original divided
            # by 2*item_size, copied from the exclusive variant).
            self.block_size = self.shared_size // item_size
        main_function = get_inclusive_prefix_module(data_type,
            sum_expression, neutral, name, keep, options, preamble)
        self.main_function = main_function.get_function(name)
        self.main_function.prepare("PPi", block=(self.block_size, 1, 1),
            shared=self.shared_size)
        tail_function = get_inclusive_prefix_module_tail(data_type,
            sum_expression, neutral, name, keep, options, preamble)
        self.tail_function = tail_function.get_function(name)
        self.tail_function.prepare("PPii", block=(self.block_size, 1, 1),
            shared=self.shared_size)
        from pycuda.compiler import SourceModule
        finish_source = """
#define SUM(a, b) (%(sum_expression)s)
%(preamble)s
__global__ void finish(%(data_type)s *result,
        const %(data_type)s *partial, const int n) {
    const int x = blockIdx.x*blockDim.x+threadIdx.x;
    if (x < n) {
        for (int i = 1; i <= blockIdx.x; i++) {
            result[x] = SUM(result[x], partial[i-1]);
        }
    }
}
""" % {
            'data_type': data_type,
            'sum_expression': sum_expression,
            'preamble': preamble,
        }
        self.finish_function = SourceModule(finish_source, options=options,
            keep=keep).get_function('finish')
        self.finish_function.prepare("PPi", block=(self.block_size, 1, 1))
        # Use maximum available shared memory in 2.x devices
        # TODO: is it needed as we are more limited by number of threads?
        # Might be needed for large data types (?)
        if dev.compute_capability() >= (2, 0):
            cache_config = pycuda.driver.func_cache.PREFER_SHARED
            self.main_function.set_cache_config(cache_config)
            self.tail_function.set_cache_config(cache_config)

    def __call__(self, *args, **kwargs):
        """
        Scan ``material`` into ``result``.

        :param args[0]: result GPUArray (written)
        :param args[1]: material GPUArray (read)
        :param args[2]: number of elements to scan
        """
        result = args[0]
        material = args[1]
        inputSize = args[2]
        # Each block processes block_size elements
        block_count = ((inputSize + self.block_size - 1) //
            self.block_size)
        if inputSize != block_count * self.block_size:
            # Last chunk is not full: scan the full blocks (if any) with
            # the main kernel; the tail kernel handles the remainder.
            if block_count > 1:
                self.main_function.prepared_call((block_count - 1, 1),
                    result.gpudata, material.gpudata, self.block_size)
            self.tail_function.prepared_call((1, 1), result.gpudata,
                material.gpudata,
                inputSize - (block_count - 1) * self.block_size,
                (block_count - 1) * self.block_size)
        else:
            self.main_function.prepared_call((block_count, 1),
                result.gpudata, material.gpudata, self.block_size)
        blockSumArrayGPU = pycuda.gpuarray.zeros([block_count],
            self.numpy_type)
        # Allocate with the right dtype directly instead of
        # zeros(...).astype(...).
        blockSumArray = numpy.zeros(block_count, dtype=self.numpy_type)
        # Get final elements from each of blocks into table
        # Sum of the last block is not needed
        # NOTE(review): one small device->host copy per block; a single
        # strided copy would be cheaper for many blocks.
        for i in range(block_count - 1):
            index = (i + 1) * self.block_size - 1
            blockSumArray[i] = result[index:index + 1].get()[0]
        blockSumArrayGPU.set(blockSumArray)
        # TODO: is it possible use GPUArray per-element kernel here?
        # Rather not (we are adding different number of values for each
        # slice of the array, but maybe I am not seeing something)
        self.finish_function.prepared_call((block_count, 1),
            result.gpudata, blockSumArrayGPU.gpudata, inputSize)
def fun():
    """Ad-hoc demo: fill a GPU array with ones, exclusive-scan the
    first 5000 elements, and print a few slices of the result."""
    src = """
__global__ void compute(float *data, int N) {
    const int x = blockIdx.x*blockDim.x+threadIdx.x;
    if (x < N) {
        data[x] = 1.0f;
    }
}
"""
    module = pycuda.compiler.SourceModule(src)
    fill = module.get_function("compute")
    data = pycuda.gpuarray.zeros([10000], numpy.float32)
    fill.prepare("Pi", block=(512, 1, 1))
    fill.prepared_call((20, 1), data.gpudata, 10000)
    # Scan only the first half of the filled array.
    scan = ExclusivePrefixKernel('float', 'a+b', '0', 'something')
    datao = pycuda.gpuarray.zeros([10000], numpy.float32)
    scan(datao, data, 5000)
    print(datao[2000:2600])
    print(datao[4095:4200])
    print(datao[4900:5000])
def hasDoubleSupport():
    """Return True when the current CUDA device supports double
    precision (compute capability 1.3 or higher)."""
    from pycuda.driver import Context
    # Bug fix: compute_capability is a method -- the original compared
    # the bound method itself to a tuple (always True on Python 2,
    # TypeError on Python 3).  It must be called.
    return Context.get_device().compute_capability() >= (1, 3)
class TestPrefixScan:
    """
    Tests comparing GPU prefix scans against CPU reference results.

    'small' tests use exactly one block, 'large' tests 16 blocks, and
    the '_non2' variants use sizes that are not powers of two so the
    tail kernels are exercised.
    """

    def compute_exclusive(self, what):
        """CPU reference: exclusive scan -- result[0] == 0,
        result[i] == sum(what[:i])."""
        result = numpy.zeros((what.size, ), dtype=what.dtype)
        # cumsum of all but the last element, shifted right by one slot
        result[1:] = numpy.cumsum(what[:-1], dtype=what.dtype)
        return result

    def compute_inclusive(self, what):
        """CPU reference: inclusive scan -- result[i] == sum(what[:i+1])."""
        return numpy.cumsum(what, dtype=what.dtype)

    # All assertions below use abs(): the original one-sided check
    # (b - gpu < tol) passed whenever the GPU result was too LARGE,
    # hiding half of the possible errors.

    @pycuda.tools.mark_cuda_test
    def test_exclusive_small(self):
        p = ExclusivePrefixKernel('float', 'a+b', '0')
        size = p.block_size
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_exclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-4).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_large(self):
        p = ExclusivePrefixKernel('float', 'a+b', '0')
        size = 16 * p.block_size
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_exclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_small_non2(self):
        p = ExclusivePrefixKernel('float', 'a+b', '0')
        size = p.block_size - 3
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_exclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-4).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_large_non2(self):
        p = ExclusivePrefixKernel('float', 'a+b', '0')
        size = 16 * p.block_size - 3
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_exclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_small(self):
        p = InclusivePrefixKernel('float', 'a+b', '0')
        size = p.block_size
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_inclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_large(self):
        p = InclusivePrefixKernel('float', 'a+b', '0')
        size = 16 * p.block_size
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_inclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-2).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_small_non2(self):
        p = InclusivePrefixKernel('float', 'a+b', '0')
        size = p.block_size - 3
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_inclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_large_non2(self):
        p = InclusivePrefixKernel('float', 'a+b', '0')
        size = 16 * p.block_size - 3
        a_gpu = pycuda.curandom.rand((size,), dtype=numpy.float32)
        b_gpu = pycuda.gpuarray.zeros([size], numpy.float32)
        a = a_gpu.get()
        p(b_gpu, a_gpu, size)
        b = self.compute_inclusive(a)
        assert (numpy.abs(b - b_gpu.get()) < 1e-2).all()
if __name__ == '__main__':
    # pycuda.autoinit is already imported at module level, so the GPU
    # context exists by the time the tests run.
    # 'from py.test.cmdline import main' was removed from modern pytest;
    # pytest.main is the supported programmatic entry point.
    import pytest
    pytest.main([__file__])
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
