Hello.
NVIDIA introduced library CURAND in CUDA 3.2.
It allows for generating pseudo-random numbers on GPU.
The CURAND library consists of two parts: a host part and a device
part.
The latter is a set of __device__ kernels that can be called
from custom kernels.
I attach my wrapper around those kernels, and patch
for PyCUDA that would include my objects to pycuda.curandom.
I have checked those classes on both a Fermi GPU and a compute-capability-1.1 device.
On ION one cannot launch 512 threads at one time - there are
not enough resources to initialise 512 random states at once.

This is rather early version - so please give some feedback.
I am also using camelCase names while PyCUDA uses
names_with_underscores; I assume I will need to change it before
patch gets accepted.


-- 
Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860
Fingerprint A481 824E 7DD3 9C0E C40A  488E C654 FB33 2AD5 9860
http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 2105f40..297b764 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -233,6 +233,333 @@ def rand(shape, dtype=numpy.float32, stream=None):
     return result
 
 
+# Need to allocate memory for curandState - there is not enough memory
+# (neither shared nor private) on the chip for one for each thread ;-)
+# sizeof(curandState))
+curandStateSize = 32
+curandStateSobolSize = 32
+
+randomSource = """
+#include <curand_kernel.h>
+
+extern "C" {
+__global__ void prepareWithSeed(curandState *s, int n, int seed, int offset) {
+	if (threadIdx.x < n) {
+		curand_init(seed, threadIdx.x, offset, &s[threadIdx.x]);
+	}
+}
+
+__global__ void prepareWithSeeds(curandState *s, int n, int *seed, int offset) {
+	if (threadIdx.x < n) {
+		curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
+	}
+}
+
+__global__ void uniformInt(curandState *s, int *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand(&s[tidx]);
+	}
+}
+
+__global__ void uniformFloat(curandState *s, float *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_uniform(&s[tidx]);
+	}
+}
+
+__global__ void uniformDouble(curandState *s, double *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_uniform_double(&s[tidx]);
+	}
+}
+
+__global__ void normalFloat(curandState *s, float *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal(&s[tidx]);
+	}
+}
+
+__global__ void normalDouble(curandState *s, double *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal_double(&s[tidx]);
+	}
+}
+
+__global__ void normalFloat2(curandState *s, float2 *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal2(&s[tidx]);
+	}
+}
+
+__global__ void normalDouble2(curandState *s, double2 *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal2_double(&s[tidx]);
+	}
+}
+
+__global__ void skipAhead(curandState *s, int n, int skip) {
+	const int idx = threadIdx.x;
+	if (idx < n) {
+		skipahead(skip, &s[idx]);
+	}
+}
+
+__global__ void skipAheadArray(curandState *s, int n, int *skip) {
+	const int idx = threadIdx.x;
+	if (idx < n) {
+		skipahead(skip[idx], &s[idx]);
+	}
+}
+}
+"""
+
+class Randomizer:
+	def __init__(self, generatorCount, offset, seed = None):
+		import pycuda.compiler
+		import pycuda.driver
+		import pycuda._driver
+		import pycuda.gpuarray
+		if pycuda.driver.get_version() < (3, 2, 0):
+			raise EnvironmentError("Need at least CUDA 3.2")
+		self.generatorCount = generatorCount
+		self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSize)
+		m = pycuda.compiler.SourceModule(randomSource, no_extern_c=True, keep=True)
+		if seed is  None:
+			import random
+			import sys
+			seed = random.randint(0, (1<<31)-1)
+# TODO: should we use async call here, or leave as is - it is just a constructor?
+# TODO: cudaThreadSetLimit(cudaLimitStackSize, 16k) on Fermi
+		if type(seed) == type(1):
+			p = m.get_function("prepareWithSeed")
+			p.prepare("Piii", block = (generatorCount, 1, 1))
+			try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+				p.prepared_call((1, 1), self.state, generatorCount, seed, offset)
+			except pycuda._driver.LaunchError:
+				raise ValueError("Initialisation failed. Decrease number of threads.")
+		else:
+			if type(seed) in [type([]), type(())]:
+				seed = numpy.array(seed, dtype = numpy.int32)
+			if type(seed) == numpy.ndarray:
+				seed = pycuda.gpuarray.to_gpu(seed).astype(numpy.int32)
+			if type(seed) == pycuda.gpuarray.GPUArray and seed.dtype == numpy.int32:
+				p = m.get_function("prepareWithSeeds")
+				p.prepare("PiPi", block = (generatorCount, 1, 1))
+				try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+					p.prepared_call((1, 1), self.state, generatorCount, seed.gpudata, offset)
+				except pycuda._driver.LaunchError:
+					raise ValueError("Initialisation failed. Decrease number of threads.")
+			else:
+				raise ValueError("Need GPUArray of integers")
+		self.uniformInt = m.get_function("uniformInt")
+		self.uniformInt.prepare("PPi", block = (generatorCount, 1, 1))
+		self.uniformFloat = m.get_function("uniformFloat")
+		self.uniformFloat.prepare("PPi", block = (generatorCount, 1, 1))
+		self.uniformDouble = m.get_function("uniformDouble")
+		self.uniformDouble.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalFloat = m.get_function("normalFloat")
+		self.normalFloat.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalDouble = m.get_function("normalDouble")
+		self.normalDouble.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalFloat2 = m.get_function("normalFloat2")
+		self.normalFloat2.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalDouble2 = m.get_function("normalDouble2")
+		self.normalDouble2.prepare("PPi", block = (generatorCount, 1, 1))
+		self.skipAhead = m.get_function("skipAhead")
+		self.skipAhead.prepare("Pii", block = (generatorCount, 1, 1))
+		self.skipAheadArray = m.get_function("skipAheadArray")
+		self.skipAheadArray.prepare("PiP", block = (generatorCount, 1, 1))
+	def __del__(self):
+		self.state.free()
+	def fillInUniformInt(self, data, inputSize, stream = None):
+		self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniformFloat(self, data, inputSize, stream = None):
+		self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniformDouble(self, data, inputSize, stream = None):
+		self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalFloat(self, data, inputSize, stream = None):
+		self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalDouble(self, data, inputSize, stream = None):
+		self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalFloat2(self, data, inputSize, stream = None):
+		self.normalFloat2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalDouble2(self, data, inputSize, stream = None):
+		self.normalDouble2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniform(self, data, stream = None):
+		if data.dtype == numpy.float32:
+			self.fillInUniformFloat(data.gpudata, data.size, stream)
+		elif data.dtype == numpy.float64:
+			self.fillInUniformDouble(data.gpudata, data.size, stream)
+		elif data.dtype in [numpy.int, numpy.int32, numpy.uint32]:
+			self.fillInUniformInt(data.gpudata, data.size, stream)
+		else:
+			raise NotImplementedError
+	def fillInNormal(self, data, stream = None):
+		if data.dtype == numpy.float32:
+			self.fillInNormalFloat(data.gpudata, data.size, stream)
+		elif data.dtype == numpy.float64:
+			self.fillInNormalDouble(data.gpudata, data.size, stream)
+		else:
+			raise NotImplementedError
+	def callSkipAhead(self, i, stream = None):
+		self.skipAhead.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i)
+	def callSkipAheadArray(self, i, stream = None):
+		self.skipAheadArray.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i.gpudata)
+	def __call__(self, shape, dtype=numpy.float32, stream=None):
+		import pycuda.gpuarray
+		result = pycuda.gpuarray.GPUArray(shape, dtype)
+# Do not use GPUArray._blocks like curand.rand does;
+# we are limited by number of states, set in constructor
+		self.fillInUniform(result, stream)
+		return result
+
+quasirandomSource = """
+// Uses C++ features (templates); do not surround with extern C
+#include <curand_kernel.h>
+
+extern "C" {
+__global__ void prepare(curandStateSobol32 *s, int n, unsigned int *v, unsigned int o) {
+	if (threadIdx.x < n) {
+		curand_init(v, o, &s[threadIdx.x]);
+	}
+}
+
+__global__ void uniformInt(curandStateSobol32 *s, int *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand(&s[tidx]);
+	}
+}
+
+__global__ void uniformFloat(curandStateSobol32 *s, float *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_uniform(&s[tidx]);
+	}
+}
+
+__global__ void uniformDouble(curandStateSobol32 *s, double *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_uniform_double(&s[tidx]);
+	}
+}
+
+__global__ void normalFloat(curandStateSobol32 *s, float *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal(&s[tidx]);
+	}
+}
+
+__global__ void normalDouble(curandStateSobol32 *s, double *d, int n) {
+	const int tidx = threadIdx.x;
+	for (int idx = tidx; idx < n; idx += blockDim.x) {
+		d[idx] = curand_normal_double(&s[tidx]);
+	}
+}
+
+__global__ void skipAhead(curandStateSobol32 *s, int n, int skip) {
+	const int idx = threadIdx.x;
+	if (idx < n) {
+		skipahead(skip, &s[idx]);
+	}
+}
+
+__global__ void skipAheadArray(curandStateSobol32 *s, int n, int *skip) {
+	const int idx = threadIdx.x;
+	if (idx < n) {
+		skipahead(skip[idx], &s[idx]);
+	}
+}
+}
+"""
+
+class QuasiRandomizer:
+	def __init__(self, generatorCount, vector, offset):
+		import pycuda.compiler
+		import pycuda.driver
+		if pycuda.driver.get_version() < (3, 2, 0):
+			raise EnvironmentError("Need at least CUDA 3.2")
+		self.generatorCount = generatorCount
+		self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSobolSize)
+		m = pycuda.compiler.SourceModule(quasirandomSource, no_extern_c=True, keep=True)
+# TODO: should we use async call here, or leave as is - it is just a constructor?
+		p = m.get_function("prepare")
+		p.prepare("PiPi", block = (generatorCount, 1, 1))
+		try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+			p.prepared_call((1, 1), self.state, generatorCount, vector, offset)
+		except pycuda._driver.LaunchError:
+			raise ValueError("Initialisation failed. Decrease number of threads.")
+		self.uniformInt = m.get_function("uniformInt")
+		self.uniformInt.prepare("PPi", block = (generatorCount, 1, 1))
+		self.uniformFloat = m.get_function("uniformFloat")
+		self.uniformFloat.prepare("PPi", block = (generatorCount, 1, 1))
+		self.uniformDouble = m.get_function("uniformDouble")
+		self.uniformDouble.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalFloat = m.get_function("normalFloat")
+		self.normalFloat.prepare("PPi", block = (generatorCount, 1, 1))
+		self.normalDouble = m.get_function("normalDouble")
+		self.normalDouble.prepare("PPi", block = (generatorCount, 1, 1))
+		self.skipAhead = m.get_function("skipAhead")
+		self.skipAhead.prepare("Pii", block = (generatorCount, 1, 1))
+		self.skipAheadArray = m.get_function("skipAheadArray")
+		self.skipAheadArray.prepare("PiP", block = (generatorCount, 1, 1))
+	def __del__(self):
+		self.state.free()
+	def fillInUniformInt(self, data, inputSize, stream = None):
+		self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniformFloat(self, data, inputSize, stream = None):
+		self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniformDouble(self, data, inputSize, stream = None):
+		self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalFloat(self, data, inputSize, stream = None):
+		self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInNormalDouble(self, data, inputSize, stream = None):
+		self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+	def fillInUniform(self, data, stream = None):
+		if data.dtype == numpy.float32:
+			self.fillInUniformFloat(data.gpudata, data.size, stream)
+		elif data.dtype == numpy.float64:
+			self.fillInUniformDouble(data.gpudata, data.size, stream)
+		elif data.dtype in [numpy.int, numpy.int32, numpy.uint32]:
+			self.fillInUniformInt(data.gpudata, data.size, stream)
+		else:
+			raise NotImplementedError
+	def fillInNormal(self, data, stream = None):
+		if data.dtype == numpy.float32:
+			self.fillInNormalFloat(data.gpudata, data.size, stream)
+		elif data.dtype == numpy.float64:
+			self.fillInNormalDouble(data.gpudata, data.size, stream)
+		else:
+			raise NotImplementedError
+	def callSkipAhead(self, i):
+		self.skipAhead.prepared_call((1, 1), self.state, self.generatorCount, i)
+	def callSkipAheadArray(self, i):
+		self.skipAheadArray.prepared_call((1, 1), self.state, self.generatorCount, i.gpudata)
+	def __call__(self, shape, dtype=numpy.float32, stream=None):
+		import pycuda.gpuarray
+		result = pycuda.gpuarray.GPUArray(shape, dtype)
+		self.fillInUniform(result, stream)
+		return result
+
+
 if __name__ == "__main__":
     import sys, pycuda.autoinit
 
#! /usr/bin/python

import numpy

# Need to allocate device memory for the curandState structures - there is
# not enough on-chip memory (neither shared nor private) for one per thread ;-)
#
# NOTE(review): these values must be >= sizeof(curandState) and
# sizeof(curandStateSobol32) as compiled by the installed CUDA toolkit.
# In CUDA 3.2, curandState (XORWOW) holds six unsigned ints plus Box-Muller
# spill fields (~48 bytes) and curandStateSobol32 holds 32 direction vectors
# plus counters (~136 bytes), so the previous value of 32 bytes overran both
# allocations.  Rounded up for safety - TODO confirm against the toolkit's
# curand_kernel.h.
curandStateSize = 64
curandStateSobolSize = 144

# CUDA C source for the pseudo-random (XORWOW) generator kernels.
# Every kernel is launched as a single block of generatorCount threads on a
# (1, 1) grid; the data-filling kernels stride by blockDim.x so n may exceed
# the thread count.  Wrapped in extern "C" so SourceModule.get_function()
# can look the kernels up by their unmangled names (the module is compiled
# with no_extern_c=True because curand_kernel.h needs C++).
randomSource = """
#include <curand_kernel.h>

extern "C" {
__global__ void prepareWithSeed(curandState *s, int n, int seed, int offset) {
	if (threadIdx.x < n) {
		curand_init(seed, threadIdx.x, offset, &s[threadIdx.x]);
	}
}

__global__ void prepareWithSeeds(curandState *s, int n, int *seed, int offset) {
	if (threadIdx.x < n) {
		curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
	}
}

__global__ void uniformInt(curandState *s, int *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand(&s[tidx]);
	}
}

__global__ void uniformFloat(curandState *s, float *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_uniform(&s[tidx]);
	}
}

__global__ void uniformDouble(curandState *s, double *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_uniform_double(&s[tidx]);
	}
}

__global__ void normalFloat(curandState *s, float *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal(&s[tidx]);
	}
}

__global__ void normalDouble(curandState *s, double *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal_double(&s[tidx]);
	}
}

__global__ void normalFloat2(curandState *s, float2 *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal2(&s[tidx]);
	}
}

__global__ void normalDouble2(curandState *s, double2 *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal2_double(&s[tidx]);
	}
}

__global__ void skipAhead(curandState *s, int n, int skip) {
	const int idx = threadIdx.x;
	if (idx < n) {
		skipahead(skip, &s[idx]);
	}
}

__global__ void skipAheadArray(curandState *s, int n, int *skip) {
	const int idx = threadIdx.x;
	if (idx < n) {
		skipahead(skip[idx], &s[idx]);
	}
}
}
"""

class Randomizer:
    """Pseudo-random number generator built on CURAND's XORWOW device API.

    Keeps generatorCount curandState structures in device global memory and
    launches every kernel as a single (generatorCount, 1, 1) block on a
    (1, 1) grid, so generatorCount is bounded by device resources (e.g. at
    most 256 threads during initialisation on ION).
    """

    def __init__(self, generatorCount, offset, seed=None):
        """Initialise generatorCount generator states on the device.

        generatorCount -- number of independent generators (= threads used)
        offset -- sequence offset forwarded to curand_init
        seed -- None (a random host-side seed is drawn), a single int shared
            by all generators, or per-generator seeds as a list/tuple,
            numpy array, or int32 GPUArray of length >= generatorCount

        Raises EnvironmentError for CUDA < 3.2 and ValueError when the
        initialisation kernel fails to launch or seed has a wrong type.
        """
        import pycuda.compiler
        import pycuda.driver
        import pycuda._driver
        import pycuda.gpuarray
        if pycuda.driver.get_version() < (3, 2, 0):
            raise EnvironmentError("Need at least CUDA 3.2")
        self.generatorCount = generatorCount
        self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSize)
        m = pycuda.compiler.SourceModule(randomSource, no_extern_c=True, keep=True)
        if seed is None:
            import random
            seed = random.randint(0, (1 << 31) - 1)
        # TODO: should we use an async call here, or leave as is - it is just
        # a constructor?
        # TODO: cudaThreadSetLimit(cudaLimitStackSize, 16k) on Fermi
        if isinstance(seed, int):
            p = m.get_function("prepareWithSeed")
            p.prepare("Piii", block=(generatorCount, 1, 1))
            try:
                # Max 256 threads on ION during preparing; with too many
                # threads the launch fails:
                # pycuda._driver.LaunchError: cuLaunchGrid failed:
                # launch out of resources
                p.prepared_call((1, 1), self.state, generatorCount, seed, offset)
            except pycuda._driver.LaunchError:
                raise ValueError("Initialisation failed. Decrease number of threads.")
        else:
            if isinstance(seed, (list, tuple)):
                seed = numpy.array(seed, dtype=numpy.int32)
            if isinstance(seed, numpy.ndarray):
                # Cast on the host before uploading; the previous device-side
                # astype() created a needless extra GPU array.
                seed = pycuda.gpuarray.to_gpu(seed.astype(numpy.int32))
            if isinstance(seed, pycuda.gpuarray.GPUArray) and seed.dtype == numpy.int32:
                p = m.get_function("prepareWithSeeds")
                p.prepare("PiPi", block=(generatorCount, 1, 1))
                try:
                    # Same ION resource limit as above.
                    p.prepared_call((1, 1), self.state, generatorCount,
                            seed.gpudata, offset)
                except pycuda._driver.LaunchError:
                    raise ValueError("Initialisation failed. Decrease number of threads.")
            else:
                raise ValueError("Need GPUArray of integers")
        # All fill kernels share the "state, data, count" signature.
        for name in ("uniformInt", "uniformFloat", "uniformDouble",
                     "normalFloat", "normalDouble", "normalFloat2",
                     "normalDouble2"):
            f = m.get_function(name)
            f.prepare("PPi", block=(generatorCount, 1, 1))
            setattr(self, name, f)
        self.skipAhead = m.get_function("skipAhead")
        self.skipAhead.prepare("Pii", block=(generatorCount, 1, 1))
        self.skipAheadArray = m.get_function("skipAheadArray")
        self.skipAheadArray.prepare("PiP", block=(generatorCount, 1, 1))

    def __del__(self):
        # __init__ may have raised before self.state was assigned (e.g. the
        # CUDA version check); guard so teardown does not mask that error
        # with an AttributeError.
        state = getattr(self, "state", None)
        if state is not None:
            state.free()

    def fillInUniformInt(self, data, inputSize, stream=None):
        """Fill inputSize ints at device pointer data with raw curand() output."""
        self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniformFloat(self, data, inputSize, stream=None):
        """Fill inputSize floats at device pointer data with uniform samples."""
        self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniformDouble(self, data, inputSize, stream=None):
        """Fill inputSize doubles at device pointer data with uniform samples."""
        self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalFloat(self, data, inputSize, stream=None):
        """Fill inputSize floats at device pointer data with normal samples."""
        self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalDouble(self, data, inputSize, stream=None):
        """Fill inputSize doubles at device pointer data with normal samples."""
        self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalFloat2(self, data, inputSize, stream=None):
        """Fill inputSize float2 pairs at device pointer data with normal samples."""
        self.normalFloat2.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalDouble2(self, data, inputSize, stream=None):
        """Fill inputSize double2 pairs at device pointer data with normal samples."""
        self.normalDouble2.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniform(self, data, stream=None):
        """Fill GPUArray data with uniform samples, dispatching on its dtype.

        Raises NotImplementedError for unsupported dtypes.
        """
        if data.dtype == numpy.float32:
            self.fillInUniformFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInUniformDouble(data.gpudata, data.size, stream)
        elif data.dtype in (int, numpy.int32, numpy.uint32):
            # `int` replaces the removed numpy.int alias (same semantics).
            # NOTE(review): a platform-int (64-bit) array matches here but
            # the kernel writes 32-bit values - TODO confirm intent.
            self.fillInUniformInt(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError

    def fillInNormal(self, data, stream=None):
        """Fill GPUArray data with normal samples, dispatching on its dtype.

        Raises NotImplementedError for unsupported dtypes.
        """
        if data.dtype == numpy.float32:
            self.fillInNormalFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInNormalDouble(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError

    def callSkipAhead(self, i, stream=None):
        """Advance every generator's sequence by i samples."""
        self.skipAhead.prepared_async_call((1, 1), stream, self.state,
                self.generatorCount, i)

    def callSkipAheadArray(self, i, stream=None):
        """Advance each generator by its own count from int GPUArray i."""
        self.skipAheadArray.prepared_async_call((1, 1), stream, self.state,
                self.generatorCount, i.gpudata)

    def __call__(self, shape, dtype=numpy.float32, stream=None):
        """Return a new GPUArray of the given shape/dtype filled with
        uniform samples.

        Do not use GPUArray._blocks like curandom.rand does; we are limited
        by the number of states set in the constructor.
        """
        import pycuda.gpuarray
        result = pycuda.gpuarray.GPUArray(shape, dtype)
        self.fillInUniform(result, stream)
        return result

# CUDA C source for the quasi-random (32-bit Sobol) generator kernels.
# Same launch convention as randomSource: one (generatorCount, 1, 1) block
# on a (1, 1) grid, fill kernels striding by blockDim.x.
# NOTE(review): prepare() passes the same direction-vector pointer v and
# offset o to every thread's curand_init, so all generators appear to be
# initialised identically - verify against the CURAND Sobol documentation
# whether each generator should get its own 32-element vector set.
quasirandomSource = """
// Uses C++ features (templates); do not surround with extern C
#include <curand_kernel.h>

extern "C" {
__global__ void prepare(curandStateSobol32 *s, int n, unsigned int *v, unsigned int o) {
	if (threadIdx.x < n) {
		curand_init(v, o, &s[threadIdx.x]);
	}
}

__global__ void uniformInt(curandStateSobol32 *s, int *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand(&s[tidx]);
	}
}

__global__ void uniformFloat(curandStateSobol32 *s, float *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_uniform(&s[tidx]);
	}
}

__global__ void uniformDouble(curandStateSobol32 *s, double *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_uniform_double(&s[tidx]);
	}
}

__global__ void normalFloat(curandStateSobol32 *s, float *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal(&s[tidx]);
	}
}

__global__ void normalDouble(curandStateSobol32 *s, double *d, int n) {
	const int tidx = threadIdx.x;
	for (int idx = tidx; idx < n; idx += blockDim.x) {
		d[idx] = curand_normal_double(&s[tidx]);
	}
}

__global__ void skipAhead(curandStateSobol32 *s, int n, int skip) {
	const int idx = threadIdx.x;
	if (idx < n) {
		skipahead(skip, &s[idx]);
	}
}

__global__ void skipAheadArray(curandStateSobol32 *s, int n, int *skip) {
	const int idx = threadIdx.x;
	if (idx < n) {
		skipahead(skip[idx], &s[idx]);
	}
}
}
"""

class QuasiRandomizer:
    """Quasi-random number generator built on CURAND's 32-bit Sobol device API.

    Same launch model as Randomizer: generatorCount curandStateSobol32
    structures in device global memory, every kernel launched as one
    (generatorCount, 1, 1) block on a (1, 1) grid.
    """

    def __init__(self, generatorCount, vector, offset):
        """Initialise generatorCount Sobol generator states.

        generatorCount -- number of independent generators (= threads used)
        vector -- device pointer to 32-bit Sobol direction vectors
        offset -- sequence offset forwarded to curand_init

        Raises EnvironmentError for CUDA < 3.2 and ValueError when the
        initialisation kernel fails to launch.
        """
        import pycuda.compiler
        import pycuda.driver
        # Explicitly import pycuda._driver: the except clause below relied
        # on it being bound implicitly by "import pycuda.driver".
        import pycuda._driver
        if pycuda.driver.get_version() < (3, 2, 0):
            raise EnvironmentError("Need at least CUDA 3.2")
        self.generatorCount = generatorCount
        self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSobolSize)
        m = pycuda.compiler.SourceModule(quasirandomSource, no_extern_c=True, keep=True)
        # TODO: should we use an async call here, or leave as is - it is just
        # a constructor?
        p = m.get_function("prepare")
        p.prepare("PiPi", block=(generatorCount, 1, 1))
        try:
            # Max 256 threads on ION during preparing; with too many threads
            # the launch fails:
            # pycuda._driver.LaunchError: cuLaunchGrid failed:
            # launch out of resources
            p.prepared_call((1, 1), self.state, generatorCount, vector, offset)
        except pycuda._driver.LaunchError:
            raise ValueError("Initialisation failed. Decrease number of threads.")
        # All fill kernels share the "state, data, count" signature.
        for name in ("uniformInt", "uniformFloat", "uniformDouble",
                     "normalFloat", "normalDouble"):
            f = m.get_function(name)
            f.prepare("PPi", block=(generatorCount, 1, 1))
            setattr(self, name, f)
        self.skipAhead = m.get_function("skipAhead")
        self.skipAhead.prepare("Pii", block=(generatorCount, 1, 1))
        self.skipAheadArray = m.get_function("skipAheadArray")
        self.skipAheadArray.prepare("PiP", block=(generatorCount, 1, 1))

    def __del__(self):
        # __init__ may have raised before self.state was assigned; guard so
        # teardown does not mask the original error with an AttributeError.
        state = getattr(self, "state", None)
        if state is not None:
            state.free()

    def fillInUniformInt(self, data, inputSize, stream=None):
        """Fill inputSize ints at device pointer data with raw curand() output."""
        self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniformFloat(self, data, inputSize, stream=None):
        """Fill inputSize floats at device pointer data with uniform samples."""
        self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniformDouble(self, data, inputSize, stream=None):
        """Fill inputSize doubles at device pointer data with uniform samples."""
        self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalFloat(self, data, inputSize, stream=None):
        """Fill inputSize floats at device pointer data with normal samples."""
        self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInNormalDouble(self, data, inputSize, stream=None):
        """Fill inputSize doubles at device pointer data with normal samples."""
        self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)

    def fillInUniform(self, data, stream=None):
        """Fill GPUArray data with uniform samples, dispatching on its dtype.

        Raises NotImplementedError for unsupported dtypes.
        """
        if data.dtype == numpy.float32:
            self.fillInUniformFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInUniformDouble(data.gpudata, data.size, stream)
        elif data.dtype in (int, numpy.int32, numpy.uint32):
            # `int` replaces the removed numpy.int alias (same semantics).
            self.fillInUniformInt(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError

    def fillInNormal(self, data, stream=None):
        """Fill GPUArray data with normal samples, dispatching on its dtype.

        Raises NotImplementedError for unsupported dtypes.
        """
        if data.dtype == numpy.float32:
            self.fillInNormalFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInNormalDouble(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError

    def callSkipAhead(self, i):
        """Advance every generator's sequence by i samples (synchronous)."""
        self.skipAhead.prepared_call((1, 1), self.state, self.generatorCount, i)

    def callSkipAheadArray(self, i):
        """Advance each generator by its own count from int GPUArray i."""
        self.skipAheadArray.prepared_call((1, 1), self.state,
                self.generatorCount, i.gpudata)

    def __call__(self, shape, dtype=numpy.float32, stream=None):
        """Return a new GPUArray of the given shape/dtype filled with
        uniform samples."""
        import pycuda.gpuarray
        result = pycuda.gpuarray.GPUArray(shape, dtype)
        self.fillInUniform(result, stream)
        return result

if __name__ == '__main__':
	import pycuda.gpuarray
	import pycuda.autoinit
	rr = Randomizer(256, 0, numpy.random.random(256).astype(numpy.int32))
	data = pycuda.gpuarray.zeros([10000], numpy.float32)
	print data[0:200]
	rr.fillInUniformFloat(data.gpudata, 512)
	print data[0:200]
	data = rr((100, ), numpy.uint32)
	del rr
	print data[0:200]

	data = pycuda.gpuarray.zeros([256], numpy.int)
	rr = QuasiRandomizer(256, data.gpudata, 1)

Attachment: signature.asc
Description: This is a digitally signed message part

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to