Hello.
I have written the attached code, which calculates a parallel prefix sum.
There are two variants - the first one (exclusive) is based on article
by Mark Harris from NVIDIA, second one (inclusive) is based on diagram
from Wikipedia.
Inclusive scan could be optimised with regard to shared memory access
conflicts, similarly to the exclusive version.
Also, the inclusive scan seems to be less numerically stable - results
differ from the CPU version when computing scans of large arrays.

I would like to put this code into PyCUDA, to serve similar
purpose as reduction kernels - that's why I tried to keep similar API.

Please give feedback.
Regards.

-- 
Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860
Fingerprint A481 824E 7DD3 9C0E C40A  488E C654 FB33 2AD5 9860
http://member.acm.org/~tomaszrybak
#! /usr/bin/python

import sys
import array
import math
import numpy

import pycuda
import pycuda.compiler
import pycuda.curandom
import pycuda.driver
import pycuda.gpuarray
import pycuda.tools
import pycuda.autoinit


# Exclusive version, after Mark Harris' article
def get_exclusive_prefix_module(data_type, sum_expression, neutral,
    name = 'prefix_kernel', optimise = False, keep = False, options = None,
    preamble = ''):
    """
    Build a CUDA module containing an exclusive prefix-sum (scan) kernel.

    Implementation based on the whitepaper
    ``Parallel Prefix Sum (Scan) with CUDA'' by Mark Harris from NVIDIA.

    :param data_type: C type of the array elements (e.g. ``float``).
    :param sum_expression: C expression combining operands ``a`` and ``b``.
    :param neutral: C expression for the neutral element of the operation.
    :param name: name given to the generated ``__global__`` kernel.
    :param optimise: if True, pad shared-memory indices (one extra slot per
        16 entries) to reduce shared-memory bank conflicts.
    :param keep: forwarded to SourceModule (keep compiler output).
    :param options: optional list of extra nvcc options (default: none).
    :param preamble: extra C source prepended to the kernel.
    :returns: a compiled ``pycuda.compiler.SourceModule``.

    The kernel assumes each block scans exactly 2*blockDim.x elements;
    see the ``_tail`` variant for sizes that are not a power of two.
    """
    from pycuda.compiler import SourceModule
    # None instead of a mutable [] default: a shared default list would be
    # visible (and mutable) across all calls of this function.
    if options is None:
        options = []
    if optimise:
        preamble = preamble+"\n#define SHARED_BANK_PADDING(n) ((n) >> 4)\n"
    else:
        preamble = preamble+"\n#define SHARED_BANK_PADDING(n) 0\n"
    src = """
    #define SUM(a, b) (%(sum_expression)s)

    %(preamble)s

    __global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n) {
        extern __shared__ %(data_type)s sharedArray[];

        const int blockOffset = 2*blockIdx.x*blockDim.x;
        int treeNodeDistance = 1;

        int sourceNodeIndex = threadIdx.x;
        int targetNodeIndex = threadIdx.x + (n/2);
        sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] =
            material[sourceNodeIndex + blockOffset];
        sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] =
            material[targetNodeIndex + blockOffset];

// Travel upwards, from leaves to the root of the tree
// During each loop time distance between nodes increases two times,
// and number of nodes decreases two times
        for (int numberOfNodes = n>>1; numberOfNodes > 0; numberOfNodes >>= 1) {
            __syncthreads();

            if (threadIdx.x < numberOfNodes) {
                int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
                int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
                sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
                targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);

                sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                    sharedArray[sourceNodeIndex]);
            }
            treeNodeDistance <<= 1;
        }

        if (threadIdx.x == 0) {
            sharedArray[n-1 + SHARED_BANK_PADDING(n-1)] = %(neutral)s;
        }

// Travel downwards, from root to the leaves
// During each loop number of nodes increases two times and
// distance between nodes decreases two times
        for (int numberOfNodes = 1; numberOfNodes < n; numberOfNodes <<= 1) {
            __syncthreads();

            treeNodeDistance >>= 1;
            if (threadIdx.x < numberOfNodes) {
                int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
                int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
                sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
                targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);

                %(data_type)s temp   = sharedArray[sourceNodeIndex];
                sharedArray[sourceNodeIndex] = sharedArray[targetNodeIndex];
                sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                    temp);
            }
        }

        __syncthreads();

        result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex +
            SHARED_BANK_PADDING(sourceNodeIndex)];
        result[targetNodeIndex + blockOffset] = sharedArray[targetNodeIndex +
            SHARED_BANK_PADDING(targetNodeIndex)];
    }
    """ % {
        'data_type': data_type,
        'name': name,
        'neutral': neutral,
        'sum_expression': sum_expression,
        'preamble': preamble,
        }
    return SourceModule(src, options=options, keep=keep)

def get_exclusive_prefix_module_tail(data_type, sum_expression, neutral,
    name = 'prefix_kernel', optimise = False, keep = False, options = None,
    preamble = ''):
    """
    Build a CUDA module with an exclusive prefix-sum kernel for the tail.

    Implementation after the whitepaper
    ``Parallel Prefix Sum (Scan) with CUDA'' by Mark Harris from NVIDIA.

    Can be used for arrays whose size is not a power of two: elements past
    ``n`` are filled with the neutral value in shared memory.
    Expects shared memory to be larger than the size of the input data
    (the next power of two, computed as ``fullArraySize`` in the kernel).

    :param data_type: C type of the array elements (e.g. ``float``).
    :param sum_expression: C expression combining operands ``a`` and ``b``.
    :param neutral: C expression for the neutral element of the operation.
    :param name: name given to the generated ``__global__`` kernel.
    :param optimise: if True, pad shared-memory indices (one extra slot per
        16 entries) to reduce shared-memory bank conflicts.
    :param keep: forwarded to SourceModule (keep compiler output).
    :param options: optional list of extra nvcc options (default: none).
    :param preamble: extra C source prepended to the kernel.
    :returns: a compiled ``pycuda.compiler.SourceModule``.
    """
    from pycuda.compiler import SourceModule
    # None instead of a mutable [] default: a shared default list would be
    # visible (and mutable) across all calls of this function.
    if options is None:
        options = []
    if optimise:
        preamble = preamble+"\n#define SHARED_BANK_PADDING(n) ((n) >> 4)\n"
    else:
        preamble = preamble+"\n#define SHARED_BANK_PADDING(n) 0\n"
    src = """
    #define SUM(a, b) (%(sum_expression)s)

    %(preamble)s

    __global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n, const int blockOffset) {
        extern __shared__ %(data_type)s sharedArray[];

        int treeNodeDistance = 1;
// Size of shared array that can contain input data
// The smallest array of size power-of-2 that can contain original array
        const int fullArraySize = 1<<((int)(floor(log2((float)n))+1));

        int sourceNodeIndex = threadIdx.x;
        int targetNodeIndex = threadIdx.x + (n/2);
        if (sourceNodeIndex < n) {
            sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] = material[sourceNodeIndex + blockOffset];
        } else {
            sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)] = %(neutral)s;
        }
        if (targetNodeIndex < n) {
            sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] = material[targetNodeIndex + blockOffset];
        } else {
            sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)] = %(neutral)s;
        }

// Travel upwards, from leaves to the root of the tree
// Each time distance between nodes increases two times,
// and number of nodes decreases two times
        for (int numberOfNodes = fullArraySize>>1; numberOfNodes > 0;
            numberOfNodes >>= 1) {
            __syncthreads();

            if (threadIdx.x < numberOfNodes) {
                int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
                int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
                sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
                targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);

                sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                    sharedArray[sourceNodeIndex]);
            }
            treeNodeDistance <<= 1;
        }

        if (threadIdx.x == 0) { sharedArray[fullArraySize-1 + SHARED_BANK_PADDING(fullArraySize-1)] = %(neutral)s; }

// Travel downwards, from root to the leaves
// During each loop number of nodes increases two times and
// distance between nodes decreases two times
        for (int numberOfNodes = 1; numberOfNodes < fullArraySize; numberOfNodes <<= 1) {
            treeNodeDistance >>= 1;
            __syncthreads();

            if (threadIdx.x < numberOfNodes) {
                int sourceNodeIndex = treeNodeDistance*(2*threadIdx.x+1)-1;
                int targetNodeIndex = treeNodeDistance*(2*threadIdx.x+2)-1;
                sourceNodeIndex += SHARED_BANK_PADDING(sourceNodeIndex);
                targetNodeIndex += SHARED_BANK_PADDING(targetNodeIndex);

                %(data_type)s temp   = sharedArray[sourceNodeIndex];
                sharedArray[sourceNodeIndex] = sharedArray[targetNodeIndex];
                sharedArray[targetNodeIndex] = SUM(sharedArray[targetNodeIndex],
                    temp);
            }
        }

        __syncthreads();

        if (sourceNodeIndex < n) { result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex + SHARED_BANK_PADDING(sourceNodeIndex)]; }
        if (targetNodeIndex < n) { result[targetNodeIndex + blockOffset] = sharedArray[targetNodeIndex + SHARED_BANK_PADDING(targetNodeIndex)]; }
    }
    """ % {
        'data_type': data_type,
        'name': name,
        'neutral': neutral,
        'sum_expression': sum_expression,
        'preamble': preamble,
        }
    return SourceModule(src, options=options, keep=keep)

class ExclusivePrefixKernel:
    """
    Exclusive prefix-sum (scan) of a GPUArray.

    Three kernels are compiled:
      * main  -- scans full blocks of 2*block_size elements each,
      * tail  -- scans one trailing block of arbitrary size,
      * finish -- folds the per-block partial sums into the final result.
    """
    def __init__(self, data_type, sum_expression, neutral,
        name = 'prefix_kernel', keep = False, options = None, preamble = ''):
        """
        :param data_type: C type of the array elements (e.g. ``float``).
        :param sum_expression: C expression combining ``a`` and ``b``.
        :param neutral: C expression for the neutral element of SUM.
        :param name: name of the generated main/tail kernels.
        :param keep: forwarded to SourceModule (keep compiler output).
        :param options: optional list of extra nvcc options (default: none).
        :param preamble: extra C source prepended to the kernels.
        """
        # None instead of a mutable [] default: a shared default list would
        # be visible across all instances.
        if options is None:
            options = []
# Determine the size of used data type
        self.numpy_type = pycuda.tools.parse_c_arg("const %s * in" %
            data_type).dtype
        item_size = self.numpy_type.itemsize

# Determine the number of threads
        dev = pycuda.driver.Context.get_device()
        block_size = dev.get_attribute(pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
        block_dimension = dev.get_attribute(pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
        self.block_size = min(block_size, block_dimension)

# Shared memory size: two items per thread, number of threads, size of one data item
        self.shared_size = self.block_size*item_size*2
# Padding to avoid bank conflicts; matches SHARED_BANK_PADDING (one extra
# slot per 16 entries) used by the kernels
        log_num_banks = 4
        self.shared_size += ((self.block_size >> log_num_banks) *
            item_size * 2)

# Check whether we do not exceed available shared memory size
        max_shared_size = dev.get_attribute(pycuda.driver.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
        if self.shared_size > max_shared_size:
            print ("Changed shared size")
            self.shared_size = max_shared_size
            # Integer division: block dimensions must be ints (a plain '/'
            # yields a float under Python 3 and breaks the block tuple).
            self.block_size = self.shared_size // (2*item_size)

        main_function = get_exclusive_prefix_module(data_type, sum_expression,
            neutral, name, True, keep, options, preamble)
        self.main_function = main_function.get_function(name)
        self.main_function.prepare("PPi", block=(self.block_size, 1, 1),
            shared=self.shared_size)

        tail_function = get_exclusive_prefix_module_tail(data_type, sum_expression,
            neutral, name, True, keep, options, preamble)
        self.tail_function = tail_function.get_function(name)
        self.tail_function.prepare("PPii", block=(self.block_size, 1, 1), shared=self.shared_size)

        from pycuda.compiler import SourceModule
        finish_source = """
        #define SUM(a, b) (%(sum_expression)s)

        %(preamble)s

        __global__ void finish(%(data_type)s *result,
            const %(data_type)s *material, const %(data_type)s *partial,
            const int n) {
            const int x = 2*blockIdx.x*blockDim.x+threadIdx.x;
            const int y = 2*blockIdx.x*blockDim.x+threadIdx.x+blockDim.x;
            if (x < n) {
                for (int i = 1; i <= blockIdx.x; i++) {
// Program produces exclusive scan
// We need to add last element from previous block to get correct sum
// It calculates ``inclusive'' sum for internal purposes
                    result[x] = SUM(result[x], material[2*i*blockDim.x-1]);
                    result[x] = SUM(result[x], partial[i-1]);
                    if (y < n) {
                        result[y] = SUM(result[y], material[2*i*blockDim.x-1]);
                        result[y] = SUM(result[y], partial[i-1]);
                    }
                }
            }
        }
        """ % {
            'data_type': data_type,
            'sum_expression': sum_expression,
            'preamble': preamble,
            }
        self.finish_function = SourceModule(finish_source, options=options, keep=keep).get_function('finish')
        self.finish_function.prepare("PPPi", block=(self.block_size, 1, 1))
# Use maximum available shared memory in 2.x devices
# TODO: is it needed as we are more limited by number of threads?
# Might be needed for large data types (?)
        if dev.compute_capability() >= (2, 0):
            cacheSize = pycuda.driver.func_cache.PREFER_SHARED
            self.main_function.set_cache_config(cacheSize)
            self.tail_function.set_cache_config(cacheSize)

    def __call__(self, *args, **kwargs):
        """
        Run the scan.

        Expected positional arguments: ``(result, material, inputSize)``
        where ``result`` and ``material`` are GPUArrays and ``inputSize``
        is the number of elements to scan.
        """
        result = args[0]
        material = args[1]
        inputSize = args[2]
        # Each main-kernel block processes 2*block_size elements
        block_count = ((inputSize + 2 * self.block_size - 1) //
            (2 * self.block_size))
        if inputSize != block_count * self.block_size * 2:
            # Last block is not completely full: handle it with tail kernel
            if block_count > 1:
                self.main_function.prepared_call((block_count-1, 1),
                    result.gpudata, material.gpudata,
                    2 * self.block_size)
            self.tail_function.prepared_call((1, 1), result.gpudata,
                material.gpudata,
                inputSize - (block_count - 1) * self.block_size * 2,
                (block_count - 1) * self.block_size * 2)
        else:
            self.main_function.prepared_call((block_count, 1),
                result.gpudata, material.gpudata,
                2 * self.block_size)
        blockSumArrayGPU = pycuda.gpuarray.zeros([block_count],
            self.numpy_type)
        blockSumArray = numpy.zeros(block_count).astype(self.numpy_type)
# Get final elements from each of blocks into table
# Sum of the last block is not needed
# NOTE(review): one device->host transfer per block; a single strided
# copy would reduce transfer overhead for many blocks.
        for i in range(block_count-1):
            index = 2 * (i + 1) * self.block_size - 1
            blockSumArray[i] = result[index:index+1].get()[0]
        blockSumArrayGPU.set(blockSumArray)
# TODO: is it possible use GPUArray per-element kernel here?
# Rather not (we are adding different number of values for each slice
# of the array, but maybe I am not seeing something)
        self.finish_function.prepared_call((block_count, 1),
            result.gpudata, material.gpudata,
            blockSumArrayGPU.gpudata, inputSize)


# Inclusive version, after http://en.wikipedia.org/wiki/Prefix_sum

# This function does not use "neutral" but I have left it to maintain the same
# list of arguments as other functions
# This function does not use "neutral" but it is kept to maintain the same
# list of arguments as the other module-building functions
def get_inclusive_prefix_module(data_type, sum_expression, neutral,
    name = 'prefix_kernel', keep = False, options = None, preamble = ''):
    """
    Build a CUDA module containing an inclusive prefix-sum (scan) kernel.

    Implementation based on the Wikipedia ``Prefix sum'' article.

    :param data_type: C type of the array elements (e.g. ``float``).
    :param sum_expression: C expression combining operands ``a`` and ``b``.
    :param neutral: unused; kept for signature compatibility with the
        other module builders.
    :param name: name given to the generated ``__global__`` kernel.
    :param keep: forwarded to SourceModule (keep compiler output).
    :param options: optional list of extra nvcc options (default: none).
    :param preamble: extra C source prepended to the kernel.
    :returns: a compiled ``pycuda.compiler.SourceModule``.
    """
    from pycuda.compiler import SourceModule
    # None instead of a mutable [] default: a shared default list would be
    # visible (and mutable) across all calls of this function.
    if options is None:
        options = []
    src = """
    #define SUM(a, b) (%(sum_expression)s)

    %(preamble)s

    __global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n) {
        extern __shared__ %(data_type)s sharedArray[];

        const int blockOffset = blockIdx.x*blockDim.x;

        int sourceNodeIndex = threadIdx.x;
        sharedArray[sourceNodeIndex] = material[sourceNodeIndex + blockOffset];

// During each loop time distance between nodes increases two times,
// and number of nodes decreases two times
        for (int treeNodeDistance = 1; treeNodeDistance < n; treeNodeDistance <<= 1) {
// Ensure that all values are in an array
            __syncthreads();

            int targetNodeIndex = sourceNodeIndex + treeNodeDistance;
            %(data_type)s partial;
            if (targetNodeIndex < n) {
                partial = SUM(sharedArray[targetNodeIndex],
                    sharedArray[sourceNodeIndex]);
            }

// Ensure that all sums are in registers
            __syncthreads();

// Write to an array
            if (targetNodeIndex < n) {
                sharedArray[targetNodeIndex] = partial;
            }
        }

        __syncthreads();
        result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex];
    }
    """ % {
        'data_type': data_type,
        'name': name,
        'sum_expression': sum_expression,
        'preamble': preamble,
        }
    return SourceModule(src, options=options, keep=keep)

def get_inclusive_prefix_module_tail(data_type, sum_expression, neutral,
    name = 'prefix_kernel', keep = False, options = None, preamble = ''):
    """
    Build a CUDA module with an inclusive prefix-sum kernel for the tail.

    Implementation based on the Wikipedia ``Prefix sum'' article.
    Unlike :func:`get_inclusive_prefix_module`, out-of-range threads fill
    shared memory with the neutral value, so the input size does not have
    to match the block size.

    :param data_type: C type of the array elements (e.g. ``float``).
    :param sum_expression: C expression combining operands ``a`` and ``b``.
    :param neutral: C expression for the neutral element of the operation.
    :param name: name given to the generated ``__global__`` kernel.
    :param keep: forwarded to SourceModule (keep compiler output).
    :param options: optional list of extra nvcc options (default: none).
    :param preamble: extra C source prepended to the kernel.
    :returns: a compiled ``pycuda.compiler.SourceModule``.
    """
    from pycuda.compiler import SourceModule
    # None instead of a mutable [] default: a shared default list would be
    # visible (and mutable) across all calls of this function.
    if options is None:
        options = []
    src = """
    #define SUM(a, b) (%(sum_expression)s)

    %(preamble)s

    __global__ void %(name)s(%(data_type)s *result,
        const %(data_type)s *material, const int n, const int blockOffset) {
        extern __shared__ %(data_type)s sharedArray[];

        int sourceNodeIndex = threadIdx.x;
        if (sourceNodeIndex < n) {
            sharedArray[sourceNodeIndex] = material[sourceNodeIndex + blockOffset];
        } else {
            sharedArray[sourceNodeIndex] = %(neutral)s;
        }

// During each loop time distance between nodes increases two times,
// and number of nodes decreases two times
        for (int treeNodeDistance = 1; treeNodeDistance < n; treeNodeDistance <<= 1) {
// Ensure that all values are in an array
            __syncthreads();

            int targetNodeIndex = sourceNodeIndex + treeNodeDistance;
            %(data_type)s partial;
            if (targetNodeIndex < n) {
                partial = SUM(sharedArray[targetNodeIndex],
                    sharedArray[sourceNodeIndex]);
            }

// Ensure that all sums are in registers
            __syncthreads();

// Write to an array
            if (targetNodeIndex < n) {
                sharedArray[targetNodeIndex] = partial;
            }
        }

        __syncthreads();
        if (sourceNodeIndex < n) {
            result[sourceNodeIndex + blockOffset] = sharedArray[sourceNodeIndex];
        }
    }
    """ % {
        'data_type': data_type,
        'name': name,
        'sum_expression': sum_expression,
        'neutral': neutral,
        'preamble': preamble,
        }
    return SourceModule(src, options=options, keep=keep)

class InclusivePrefixKernel:
    """
    Inclusive prefix-sum (scan) of a GPUArray.

    Three kernels are compiled:
      * main  -- scans full blocks of block_size elements each,
      * tail  -- scans one trailing block of arbitrary size,
      * finish -- folds the per-block partial sums into the final result.
    """
    def __init__(self, data_type, sum_expression, neutral,
        name = 'prefix_kernel', keep = False, options = None, preamble = ''):
        """
        :param data_type: C type of the array elements (e.g. ``float``).
        :param sum_expression: C expression combining ``a`` and ``b``.
        :param neutral: C expression for the neutral element of SUM.
        :param name: name of the generated main/tail kernels.
        :param keep: forwarded to SourceModule (keep compiler output).
        :param options: optional list of extra nvcc options (default: none).
        :param preamble: extra C source prepended to the kernels.
        """
        # None instead of a mutable [] default: a shared default list would
        # be visible across all instances.
        if options is None:
            options = []
# Determine the size of used data type
        self.numpy_type = pycuda.tools.parse_c_arg("const %s * in" %
            data_type).dtype
        item_size = self.numpy_type.itemsize

# Determine the number of threads
        dev = pycuda.driver.Context.get_device()
        block_size = dev.get_attribute(pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
        block_dimension = dev.get_attribute(pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
        self.block_size = min(block_size, block_dimension)

# Shared memory size: one item per thread (unlike the exclusive kernel,
# which stores two items per thread)
        self.shared_size = self.block_size*item_size

# Check whether we do not exceed available shared memory size
        max_shared_size = dev.get_attribute(pycuda.driver.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
        if self.shared_size > max_shared_size:
            print ("Changed shared size")
            self.shared_size = max_shared_size
            # Integer division: block dimensions must be ints (a plain '/'
            # yields a float under Python 3 and breaks the block tuple).
            # NOTE(review): the 2*item_size divisor mirrors the exclusive
            # version, but this kernel stores only one item per thread --
            # item_size alone would allow a larger block; confirm intent.
            self.block_size = self.shared_size // (2*item_size)

        main_function = get_inclusive_prefix_module(data_type, sum_expression,
            neutral, name, keep, options, preamble)
        self.main_function = main_function.get_function(name)
        self.main_function.prepare("PPi", block=(self.block_size, 1, 1),
            shared=self.shared_size)

        tail_function = get_inclusive_prefix_module_tail(data_type, sum_expression,
            neutral, name, keep, options, preamble)
        self.tail_function = tail_function.get_function(name)
        self.tail_function.prepare("PPii", block=(self.block_size, 1, 1), shared=self.shared_size)

        from pycuda.compiler import SourceModule
        finish_source = """
        #define SUM(a, b) (%(sum_expression)s)

        %(preamble)s

        __global__ void finish(%(data_type)s *result,
            const %(data_type)s *partial, const int n) {
            const int x = blockIdx.x*blockDim.x+threadIdx.x;
            if (x < n) {
                for (int i = 1; i <= blockIdx.x; i++) {
                    result[x] = SUM(result[x], partial[i-1]);
                }
            }
        }
        """ % {
            'data_type': data_type,
            'sum_expression': sum_expression,
            'preamble': preamble,
            }
        self.finish_function = SourceModule(finish_source, options=options, keep=keep).get_function('finish')
        self.finish_function.prepare("PPi", block=(self.block_size, 1, 1))
# Use maximum available shared memory in 2.x devices
# TODO: is it needed as we are more limited by number of threads?
# Might be needed for large data types (?)
        if dev.compute_capability() >= (2, 0):
            cacheSize = pycuda.driver.func_cache.PREFER_SHARED
            self.main_function.set_cache_config(cacheSize)
            self.tail_function.set_cache_config(cacheSize)

    def __call__(self, *args, **kwargs):
        """
        Run the scan.

        Expected positional arguments: ``(result, material, inputSize)``
        where ``result`` and ``material`` are GPUArrays and ``inputSize``
        is the number of elements to scan.
        """
        result = args[0]
        material = args[1]
        inputSize = args[2]
        # Each main-kernel block processes block_size elements
        block_count = ((inputSize + self.block_size - 1) //
            self.block_size)
        if inputSize != block_count * self.block_size:
            # Last block is not completely full: handle it with tail kernel
            if block_count > 1:
                self.main_function.prepared_call((block_count-1, 1),
                    result.gpudata, material.gpudata, self.block_size)
            self.tail_function.prepared_call((1, 1), result.gpudata,
                material.gpudata,
                inputSize - (block_count - 1) * self.block_size,
                (block_count - 1) * self.block_size)
        else:
            self.main_function.prepared_call((block_count, 1),
                result.gpudata, material.gpudata, self.block_size)
        blockSumArrayGPU = pycuda.gpuarray.zeros([block_count],
            self.numpy_type)
        blockSumArray = numpy.zeros(block_count).astype(self.numpy_type)
# Get final elements from each of blocks into table
# Sum of the last block is not needed
# NOTE(review): one device->host transfer per block; a single strided
# copy would reduce transfer overhead for many blocks.
        for i in range(block_count-1):
            index = (i + 1) * self.block_size - 1
            blockSumArray[i] = result[index:index+1].get()[0]
        blockSumArrayGPU.set(blockSumArray)
# TODO: is it possible use GPUArray per-element kernel here?
# Rather not (we are adding different number of values for each slice
# of the array, but maybe I am not seeing something)
        self.finish_function.prepared_call((block_count, 1),
            result.gpudata, blockSumArrayGPU.gpudata, inputSize)

def fun():
    # Demo: fill a GPU array with ones, scan the first half of it with the
    # exclusive prefix-sum kernel and print a few slices of the result.
    fill_source = """
    __global__ void compute(float *data, int N) {

        const int x = blockIdx.x*blockDim.x+threadIdx.x;
        if (x < N) {
            data[x] = 1.0f;
        }
    }
    """
    module = pycuda.compiler.SourceModule(fill_source)
    fill = module.get_function("compute")
    data = pycuda.gpuarray.zeros([10000], numpy.float32)
    fill.prepare("Pi", block = (512, 1, 1))
    fill.prepared_call((20, 1), data.gpudata, 10000)


    scan = ExclusivePrefixKernel('float', 'a+b', '0', 'something')
    scanned = pycuda.gpuarray.zeros([10000], numpy.float32)
    scan(scanned, data, 5000)

    # Slices around the block boundaries of the scan
    print (scanned[2000:2600])
    print (scanned[4095:4200])
    print (scanned[4900:5000])

def hasDoubleSupport():
    """Return True if the current CUDA device supports double precision.

    Double precision requires compute capability 1.3 or newer.
    """
    from pycuda.driver import Context
    # Bug fix: compute_capability is a method; the original compared the
    # bound method object itself to the tuple, so the device capability
    # was never actually tested.
    return Context.get_device().compute_capability() >= (1, 3)

class TestPrefixScan:
    """Check the GPU scan kernels against straightforward CPU loops."""

    def compute_exclusive(self, what):
        """CPU reference: exclusive scan, out[i] = sum of what[:i]."""
        out = numpy.zeros((what.size, ), dtype = what.dtype)
        running = 0
        out[0] = running
        for i, value in enumerate(what[:-1]):
            running += value
            out[i + 1] = running
        return out

    def compute_inclusive(self, what):
        """CPU reference: inclusive scan, out[i] = sum of what[:i+1]."""
        out = numpy.zeros((what.size, ), dtype = what.dtype)
        running = what[0]
        out[0] = running
        for i in range(1, what.size):
            running += what[i]
            out[i] = running
        return out

    @pycuda.tools.mark_cuda_test
    def test_exclusive_small(self):
        scan = ExclusivePrefixKernel('float', 'a+b', '0')
        n = scan.block_size
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_exclusive(src)
        assert (expected - dst_gpu.get() < 1e-4).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_large(self):
        scan = ExclusivePrefixKernel('float', 'a+b', '0')
        n = 16*scan.block_size
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_exclusive(src)
        assert (expected - dst_gpu.get() < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_small_non2(self):
        # Size that is not a multiple of the block size (tail kernel path)
        scan = ExclusivePrefixKernel('float', 'a+b', '0')
        n = scan.block_size-3
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_exclusive(src)
        assert (expected - dst_gpu.get() < 1e-4).all()

    @pycuda.tools.mark_cuda_test
    def test_exclusive_large_non2(self):
        scan = ExclusivePrefixKernel('float', 'a+b', '0')
        n = 16*scan.block_size-3
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_exclusive(src)
        assert (expected - dst_gpu.get() < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_small(self):
        scan = InclusivePrefixKernel('float', 'a+b', '0')
        n = scan.block_size
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_inclusive(src)
        assert (expected - dst_gpu.get() < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_large(self):
        scan = InclusivePrefixKernel('float', 'a+b', '0')
        n = 16*scan.block_size
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_inclusive(src)
        assert (expected - dst_gpu.get() < 1e-2).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_small_non2(self):
        # Size that is not a multiple of the block size (tail kernel path)
        scan = InclusivePrefixKernel('float', 'a+b', '0')
        n = scan.block_size-3
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_inclusive(src)
        assert (expected - dst_gpu.get() < 1e-3).all()

    @pycuda.tools.mark_cuda_test
    def test_inclusive_large_non2(self):
        scan = InclusivePrefixKernel('float', 'a+b', '0')
        n = 16*scan.block_size-3
        src_gpu = pycuda.curandom.rand((n,), dtype = numpy.float32)
        dst_gpu = pycuda.gpuarray.zeros([n], numpy.float32)
        src = src_gpu.get()
        scan(dst_gpu, src_gpu, n)
        expected = self.compute_inclusive(src)
        assert (expected - dst_gpu.get() < 1e-2).all()

if __name__ == '__main__':
    # NOTE(review): pycuda.autoinit is already imported at module level
    # (creating the context as a side effect); this re-import is a no-op.
    import pycuda.autoinit
    # py.test.cmdline is a legacy pytest entry point; modern equivalent
    # would be pytest.main([__file__]).
    from py.test.cmdline import main
    main([__file__])

Attachment: signature.asc
Description: This is a digitally signed message part

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to