Hello. NVIDIA introduced the CURAND library in CUDA 3.2. It allows for generating pseudo-random numbers on the GPU. The CURAND library consists of two parts: a host part and a device part. The latter is a set of __device__ kernels that can be called from custom kernels. I attach my wrapper around those kernels, and a patch for PyCUDA that would include my objects in pycuda.curandom. I have checked those classes on both a Fermi and a 1.1 device. On ION one cannot call 512 threads at one time - there are not enough resources to initialise 512 random states at once.
This is a rather early version - so please give some feedback. I am also using camelCase names while PyCUDA uses names_with_underscores; I assume I will need to change that before the patch gets accepted. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 2105f40..297b764 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -233,6 +233,333 @@ def rand(shape, dtype=numpy.float32, stream=None):
return result
+# Need to allocate memory for curandState - there is not enough memory
+# (neither shared nor private) on the chip for one for each thread ;-)
+# sizeof(curandState))
+curandStateSize = 32
+curandStateSobolSize = 32
+
+randomSource = """
+#include <curand_kernel.h>
+
+extern "C" {
+__global__ void prepareWithSeed(curandState *s, int n, int seed, int offset) {
+ if (threadIdx.x < n) {
+ curand_init(seed, threadIdx.x, offset, &s[threadIdx.x]);
+ }
+}
+
+__global__ void prepareWithSeeds(curandState *s, int n, int *seed, int offset) {
+ if (threadIdx.x < n) {
+ curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
+ }
+}
+
+__global__ void uniformInt(curandState *s, int *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand(&s[tidx]);
+ }
+}
+
+__global__ void uniformFloat(curandState *s, float *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_uniform(&s[tidx]);
+ }
+}
+
+__global__ void uniformDouble(curandState *s, double *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_uniform_double(&s[tidx]);
+ }
+}
+
+__global__ void normalFloat(curandState *s, float *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal(&s[tidx]);
+ }
+}
+
+__global__ void normalDouble(curandState *s, double *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal_double(&s[tidx]);
+ }
+}
+
+__global__ void normalFloat2(curandState *s, float2 *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal2(&s[tidx]);
+ }
+}
+
+__global__ void normalDouble2(curandState *s, double2 *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal2_double(&s[tidx]);
+ }
+}
+
+__global__ void skipAhead(curandState *s, int n, int skip) {
+ const int idx = threadIdx.x;
+ if (idx < n) {
+ skipahead(skip, &s[idx]);
+ }
+}
+
+__global__ void skipAheadArray(curandState *s, int n, int *skip) {
+ const int idx = threadIdx.x;
+ if (idx < n) {
+ skipahead(skip[idx], &s[idx]);
+ }
+}
+}
+"""
+
+class Randomizer:
+ def __init__(self, generatorCount, offset, seed = None):
+ import pycuda.compiler
+ import pycuda.driver
+ import pycuda._driver
+ import pycuda.gpuarray
+ if pycuda.driver.get_version() < (3, 2, 0):
+ raise EnvironmentError("Need at least CUDA 3.2")
+ self.generatorCount = generatorCount
+ self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSize)
+ m = pycuda.compiler.SourceModule(randomSource, no_extern_c=True, keep=True)
+ if seed is None:
+ import random
+ import sys
+ seed = random.randint(0, (1<<31)-1)
+# TODO: should we use async call here, or leave as is - it is just a constructor?
+# TODO: cudaThreadSetLimit(cudaLimitStackSize, 16k) on Fermi
+ if type(seed) == type(1):
+ p = m.get_function("prepareWithSeed")
+ p.prepare("Piii", block = (generatorCount, 1, 1))
+ try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+ p.prepared_call((1, 1), self.state, generatorCount, seed, offset)
+ except pycuda._driver.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+ else:
+ if type(seed) in [type([]), type(())]:
+ seed = numpy.array(seed, dtype = numpy.int32)
+ if type(seed) == numpy.ndarray:
+ seed = pycuda.gpuarray.to_gpu(seed).astype(numpy.int32)
+ if type(seed) == pycuda.gpuarray.GPUArray and seed.dtype == numpy.int32:
+ p = m.get_function("prepareWithSeeds")
+ p.prepare("PiPi", block = (generatorCount, 1, 1))
+ try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+ p.prepared_call((1, 1), self.state, generatorCount, seed.gpudata, offset)
+ except pycuda._driver.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+ else:
+ raise ValueError("Need GPUArray of integers")
+ self.uniformInt = m.get_function("uniformInt")
+ self.uniformInt.prepare("PPi", block = (generatorCount, 1, 1))
+ self.uniformFloat = m.get_function("uniformFloat")
+ self.uniformFloat.prepare("PPi", block = (generatorCount, 1, 1))
+ self.uniformDouble = m.get_function("uniformDouble")
+ self.uniformDouble.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalFloat = m.get_function("normalFloat")
+ self.normalFloat.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalDouble = m.get_function("normalDouble")
+ self.normalDouble.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalFloat2 = m.get_function("normalFloat2")
+ self.normalFloat2.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalDouble2 = m.get_function("normalDouble2")
+ self.normalDouble2.prepare("PPi", block = (generatorCount, 1, 1))
+ self.skipAhead = m.get_function("skipAhead")
+ self.skipAhead.prepare("Pii", block = (generatorCount, 1, 1))
+ self.skipAheadArray = m.get_function("skipAheadArray")
+ self.skipAheadArray.prepare("PiP", block = (generatorCount, 1, 1))
+ def __del__(self):
+ self.state.free()
+ def fillInUniformInt(self, data, inputSize, stream = None):
+ self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniformFloat(self, data, inputSize, stream = None):
+ self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniformDouble(self, data, inputSize, stream = None):
+ self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalFloat(self, data, inputSize, stream = None):
+ self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalDouble(self, data, inputSize, stream = None):
+ self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalFloat2(self, data, inputSize, stream = None):
+ self.normalFloat2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalDouble2(self, data, inputSize, stream = None):
+ self.normalDouble2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniform(self, data, stream = None):
+ if data.dtype == numpy.float32:
+ self.fillInUniformFloat(data.gpudata, data.size, stream)
+ elif data.dtype == numpy.float64:
+ self.fillInUniformDouble(data.gpudata, data.size, stream)
+ elif data.dtype in [numpy.int, numpy.int32, numpy.uint32]:
+ self.fillInUniformInt(data.gpudata, data.size, stream)
+ else:
+ raise NotImplementedError
+ def fillInNormal(self, data, stream = None):
+ if data.dtype == numpy.float32:
+ self.fillInNormalFloat(data.gpudata, data.size, stream)
+ elif data.dtype == numpy.float64:
+ self.fillInNormalDouble(data.gpudata, data.size, stream)
+ else:
+ raise NotImplementedError
+ def callSkipAhead(self, i, stream = None):
+ self.skipAhead.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i)
+ def callSkipAheadArray(self, i, stream = None):
+ self.skipAheadArray.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i.gpudata)
+ def __call__(self, shape, dtype=numpy.float32, stream=None):
+ import pycuda.gpuarray
+ result = pycuda.gpuarray.GPUArray(shape, dtype)
+# Do not use GPUArray._blocks like curand.rand does;
+# we are limited by number of states, set in constructor
+ self.fillInUniform(result, stream)
+ return result
+
+quasirandomSource = """
+// Uses C++ features (templates); do not surround with extern C
+#include <curand_kernel.h>
+
+extern "C" {
+__global__ void prepare(curandStateSobol32 *s, int n, unsigned int *v, unsigned int o) {
+ if (threadIdx.x < n) {
+ curand_init(v, o, &s[threadIdx.x]);
+ }
+}
+
+__global__ void uniformInt(curandStateSobol32 *s, int *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand(&s[tidx]);
+ }
+}
+
+__global__ void uniformFloat(curandStateSobol32 *s, float *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_uniform(&s[tidx]);
+ }
+}
+
+__global__ void uniformDouble(curandStateSobol32 *s, double *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_uniform_double(&s[tidx]);
+ }
+}
+
+__global__ void normalFloat(curandStateSobol32 *s, float *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal(&s[tidx]);
+ }
+}
+
+__global__ void normalDouble(curandStateSobol32 *s, double *d, int n) {
+ const int tidx = threadIdx.x;
+ for (int idx = tidx; idx < n; idx += blockDim.x) {
+ d[idx] = curand_normal_double(&s[tidx]);
+ }
+}
+
+__global__ void skipAhead(curandStateSobol32 *s, int n, int skip) {
+ const int idx = threadIdx.x;
+ if (idx < n) {
+ skipahead(skip, &s[idx]);
+ }
+}
+
+__global__ void skipAheadArray(curandStateSobol32 *s, int n, int *skip) {
+ const int idx = threadIdx.x;
+ if (idx < n) {
+ skipahead(skip[idx], &s[idx]);
+ }
+}
+}
+"""
+
+class QuasiRandomizer:
+ def __init__(self, generatorCount, vector, offset):
+ import pycuda.compiler
+ import pycuda.driver
+ if pycuda.driver.get_version() < (3, 2, 0):
+ raise EnvironmentError("Need at least CUDA 3.2")
+ self.generatorCount = generatorCount
+ self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSobolSize)
+ m = pycuda.compiler.SourceModule(quasirandomSource, no_extern_c=True, keep=True)
+# TODO: should we use async call here, or leave as is - it is just a constructor?
+ p = m.get_function("prepare")
+ p.prepare("PiPi", block = (generatorCount, 1, 1))
+ try:
+# Max 256 threads on ION during preparing
+# Need to limit number of threads. If not there is failed execution:
+# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
+ p.prepared_call((1, 1), self.state, generatorCount, vector, offset)
+ except pycuda._driver.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+ self.uniformInt = m.get_function("uniformInt")
+ self.uniformInt.prepare("PPi", block = (generatorCount, 1, 1))
+ self.uniformFloat = m.get_function("uniformFloat")
+ self.uniformFloat.prepare("PPi", block = (generatorCount, 1, 1))
+ self.uniformDouble = m.get_function("uniformDouble")
+ self.uniformDouble.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalFloat = m.get_function("normalFloat")
+ self.normalFloat.prepare("PPi", block = (generatorCount, 1, 1))
+ self.normalDouble = m.get_function("normalDouble")
+ self.normalDouble.prepare("PPi", block = (generatorCount, 1, 1))
+ self.skipAhead = m.get_function("skipAhead")
+ self.skipAhead.prepare("Pii", block = (generatorCount, 1, 1))
+ self.skipAheadArray = m.get_function("skipAheadArray")
+ self.skipAheadArray.prepare("PiP", block = (generatorCount, 1, 1))
+ def __del__(self):
+ self.state.free()
+ def fillInUniformInt(self, data, inputSize, stream = None):
+ self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniformFloat(self, data, inputSize, stream = None):
+ self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniformDouble(self, data, inputSize, stream = None):
+ self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalFloat(self, data, inputSize, stream = None):
+ self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInNormalDouble(self, data, inputSize, stream = None):
+ self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
+ def fillInUniform(self, data, stream = None):
+ if data.dtype == numpy.float32:
+ self.fillInUniformFloat(data.gpudata, data.size, stream)
+ elif data.dtype == numpy.float64:
+ self.fillInUniformDouble(data.gpudata, data.size, stream)
+ elif data.dtype in [numpy.int, numpy.int32, numpy.uint32]:
+ self.fillInUniformInt(data.gpudata, data.size, stream)
+ else:
+ raise NotImplementedError
+ def fillInNormal(self, data, stream = None):
+ if data.dtype == numpy.float32:
+ self.fillInNormalFloat(data.gpudata, data.size, stream)
+ elif data.dtype == numpy.float64:
+ self.fillInNormalDouble(data.gpudata, data.size, stream)
+ else:
+ raise NotImplementedError
+ def callSkipAhead(self, i):
+ self.skipAhead.prepared_call((1, 1), self.state, self.generatorCount, i)
+ def callSkipAheadArray(self, i):
+ self.skipAheadArray.prepared_call((1, 1), self.state, self.generatorCount, i.gpudata)
+ def __call__(self, shape, dtype=numpy.float32, stream=None):
+ import pycuda.gpuarray
+ result = pycuda.gpuarray.GPUArray(shape, dtype)
+ self.fillInUniform(result, stream)
+ return result
+
+
if __name__ == "__main__":
import sys, pycuda.autoinit
#! /usr/bin/python
import numpy
# curandState objects are kept in GPU global memory - there is not enough
# memory (neither shared nor private) on the chip for one per thread ;-)
# The constants below stand in for sizeof(curandState) and
# sizeof(curandStateSobol32) when sizing the device allocation.
# NOTE(review): hard-coded 32 may under-allocate for some CUDA versions -
# confirm against the actual struct sizes in curand_kernel.h.
curandStateSize = 32
curandStateSobolSize = 32
randomSource = """
#include <curand_kernel.h>
extern "C" {
__global__ void prepareWithSeed(curandState *s, int n, int seed, int offset) {
if (threadIdx.x < n) {
curand_init(seed, threadIdx.x, offset, &s[threadIdx.x]);
}
}
__global__ void prepareWithSeeds(curandState *s, int n, int *seed, int offset) {
if (threadIdx.x < n) {
curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
}
}
__global__ void uniformInt(curandState *s, int *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand(&s[tidx]);
}
}
__global__ void uniformFloat(curandState *s, float *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_uniform(&s[tidx]);
}
}
__global__ void uniformDouble(curandState *s, double *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_uniform_double(&s[tidx]);
}
}
__global__ void normalFloat(curandState *s, float *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal(&s[tidx]);
}
}
__global__ void normalDouble(curandState *s, double *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal_double(&s[tidx]);
}
}
__global__ void normalFloat2(curandState *s, float2 *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal2(&s[tidx]);
}
}
__global__ void normalDouble2(curandState *s, double2 *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal2_double(&s[tidx]);
}
}
__global__ void skipAhead(curandState *s, int n, int skip) {
const int idx = threadIdx.x;
if (idx < n) {
skipahead(skip, &s[idx]);
}
}
__global__ void skipAheadArray(curandState *s, int n, int *skip) {
const int idx = threadIdx.x;
if (idx < n) {
skipahead(skip[idx], &s[idx]);
}
}
}
"""
class Randomizer:
    """Pool of CURAND XORWOW pseudo-random generators living on the GPU.

    Holds ``generatorCount`` curandState structures in global device
    memory; every fill method launches a single block of
    ``generatorCount`` threads, each thread driving its own generator
    while stepping through the output array with stride blockDim.x.
    ``generatorCount`` is therefore bounded by per-block device limits
    (e.g. at most 256 on ION during initialisation).
    """
    def __init__(self, generatorCount, offset, seed = None):
        """Allocate and initialise ``generatorCount`` generator states.

        generatorCount -- number of states (and threads per launch)
        offset         -- sequence offset passed to curand_init
        seed           -- None (a random 31-bit seed is drawn), a single
                          int shared by all states, or a list/tuple/
                          ndarray/GPUArray of per-state int32 seeds

        Raises EnvironmentError when CUDA < 3.2 and ValueError when the
        initialisation kernel fails to launch (too many threads for the
        device) or ``seed`` has an unsupported type.
        """
        import pycuda.compiler
        import pycuda.driver
        import pycuda._driver
        import pycuda.gpuarray
        if pycuda.driver.get_version() < (3, 2, 0):
            raise EnvironmentError("Need at least CUDA 3.2")
        self.generatorCount = generatorCount
        # States must live in global memory; on-chip memory cannot hold
        # one curandState per thread.
        self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSize)
        m = pycuda.compiler.SourceModule(randomSource, no_extern_c=True, keep=True)
        if seed is None:
            import random
            seed = random.randint(0, (1<<31)-1)
        # TODO: should we use async call here, or leave as is - it is just a constructor?
        # TODO: cudaThreadSetLimit(cudaLimitStackSize, 16k) on Fermi

        def _kernel(name, signature):
            # Fetch a kernel and pre-configure it for one block of
            # generatorCount threads.
            f = m.get_function(name)
            f.prepare(signature, block = (generatorCount, 1, 1))
            return f

        if isinstance(seed, int):
            p = _kernel("prepareWithSeed", "Piii")
            try:
                # Max 256 threads on ION during preparing.  Larger counts
                # fail with: cuLaunchGrid failed: launch out of resources
                p.prepared_call((1, 1), self.state, generatorCount, seed, offset)
            except pycuda._driver.LaunchError:
                raise ValueError("Initialisation failed. Decrease number of threads.")
        else:
            # Normalise list/tuple -> ndarray -> GPUArray of int32 seeds.
            if isinstance(seed, (list, tuple)):
                seed = numpy.array(seed, dtype = numpy.int32)
            if isinstance(seed, numpy.ndarray):
                seed = pycuda.gpuarray.to_gpu(seed).astype(numpy.int32)
            if isinstance(seed, pycuda.gpuarray.GPUArray) and seed.dtype == numpy.int32:
                p = _kernel("prepareWithSeeds", "PiPi")
                try:
                    # Same per-block launch-resource limit as above.
                    p.prepared_call((1, 1), self.state, generatorCount, seed.gpudata, offset)
                except pycuda._driver.LaunchError:
                    raise ValueError("Initialisation failed. Decrease number of threads.")
            else:
                raise ValueError("Need GPUArray of integers")
        self.uniformInt = _kernel("uniformInt", "PPi")
        self.uniformFloat = _kernel("uniformFloat", "PPi")
        self.uniformDouble = _kernel("uniformDouble", "PPi")
        self.normalFloat = _kernel("normalFloat", "PPi")
        self.normalDouble = _kernel("normalDouble", "PPi")
        self.normalFloat2 = _kernel("normalFloat2", "PPi")
        self.normalDouble2 = _kernel("normalDouble2", "PPi")
        self.skipAhead = _kernel("skipAhead", "Pii")
        self.skipAheadArray = _kernel("skipAheadArray", "PiP")
    def __del__(self):
        # Guard: __init__ may have raised before self.state was assigned.
        state = getattr(self, "state", None)
        if state is not None:
            state.free()
    def fillInUniformInt(self, data, inputSize, stream = None):
        """Fill inputSize ints at device pointer data with raw curand() values."""
        self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniformFloat(self, data, inputSize, stream = None):
        """Fill inputSize floats at device pointer data with uniform variates."""
        self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniformDouble(self, data, inputSize, stream = None):
        """Fill inputSize doubles at device pointer data with uniform variates."""
        self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalFloat(self, data, inputSize, stream = None):
        """Fill inputSize floats at device pointer data with normal variates."""
        self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalDouble(self, data, inputSize, stream = None):
        """Fill inputSize doubles at device pointer data with normal variates."""
        self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalFloat2(self, data, inputSize, stream = None):
        """Fill inputSize float2 pairs at device pointer data with normal variates."""
        self.normalFloat2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalDouble2(self, data, inputSize, stream = None):
        """Fill inputSize double2 pairs at device pointer data with normal variates."""
        self.normalDouble2.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniform(self, data, stream = None):
        """Fill GPUArray data with uniform variates, dispatching on dtype."""
        if data.dtype == numpy.float32:
            self.fillInUniformFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInUniformDouble(data.gpudata, data.size, stream)
        elif data.dtype in [int, numpy.int32, numpy.uint32]:
            # `int` replaces the deprecated numpy.int alias (removed in
            # NumPy >= 1.24); numpy.int was the builtin int all along.
            self.fillInUniformInt(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def fillInNormal(self, data, stream = None):
        """Fill GPUArray data with normal variates, dispatching on dtype."""
        if data.dtype == numpy.float32:
            self.fillInNormalFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInNormalDouble(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def callSkipAhead(self, i, stream = None):
        """Advance every generator's sequence by i steps."""
        self.skipAhead.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i)
    def callSkipAheadArray(self, i, stream = None):
        """Advance each generator by its own step count from int32 GPUArray i."""
        self.skipAheadArray.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i.gpudata)
    def __call__(self, shape, dtype=numpy.float32, stream=None):
        """Return a new GPUArray of shape/dtype filled with uniform values.

        Does not split the work into multiple launches the way
        curandom.rand does - we are limited by the number of states
        chosen in the constructor.
        """
        import pycuda.gpuarray
        result = pycuda.gpuarray.GPUArray(shape, dtype)
        self.fillInUniform(result, stream)
        return result
quasirandomSource = """
// Uses C++ features (templates); do not surround with extern C
#include <curand_kernel.h>
extern "C" {
__global__ void prepare(curandStateSobol32 *s, int n, unsigned int *v, unsigned int o) {
if (threadIdx.x < n) {
curand_init(v, o, &s[threadIdx.x]);
}
}
__global__ void uniformInt(curandStateSobol32 *s, int *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand(&s[tidx]);
}
}
__global__ void uniformFloat(curandStateSobol32 *s, float *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_uniform(&s[tidx]);
}
}
__global__ void uniformDouble(curandStateSobol32 *s, double *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_uniform_double(&s[tidx]);
}
}
__global__ void normalFloat(curandStateSobol32 *s, float *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal(&s[tidx]);
}
}
__global__ void normalDouble(curandStateSobol32 *s, double *d, int n) {
const int tidx = threadIdx.x;
for (int idx = tidx; idx < n; idx += blockDim.x) {
d[idx] = curand_normal_double(&s[tidx]);
}
}
__global__ void skipAhead(curandStateSobol32 *s, int n, int skip) {
const int idx = threadIdx.x;
if (idx < n) {
skipahead(skip, &s[idx]);
}
}
__global__ void skipAheadArray(curandStateSobol32 *s, int n, int *skip) {
const int idx = threadIdx.x;
if (idx < n) {
skipahead(skip[idx], &s[idx]);
}
}
}
"""
class QuasiRandomizer:
    """Pool of CURAND Sobol32 quasi-random generators living on the GPU.

    Holds ``generatorCount`` curandStateSobol32 structures in global
    device memory; every fill method launches a single block of
    ``generatorCount`` threads, each thread driving its own generator
    while stepping through the output array with stride blockDim.x.
    """
    def __init__(self, generatorCount, vector, offset):
        """Allocate and initialise ``generatorCount`` Sobol32 states.

        generatorCount -- number of states (and threads per launch)
        vector         -- device pointer to the Sobol direction vectors
                          handed to curand_init
        offset         -- sequence offset passed to curand_init

        Raises EnvironmentError when CUDA < 3.2 and ValueError when the
        initialisation kernel fails to launch (too many threads).
        """
        import pycuda.compiler
        import pycuda.driver
        # Needed explicitly for LaunchError below (Randomizer does the same).
        import pycuda._driver
        if pycuda.driver.get_version() < (3, 2, 0):
            raise EnvironmentError("Need at least CUDA 3.2")
        self.generatorCount = generatorCount
        # States must live in global memory; on-chip memory cannot hold
        # one state per thread.
        self.state = pycuda.driver.mem_alloc(generatorCount*curandStateSobolSize)
        m = pycuda.compiler.SourceModule(quasirandomSource, no_extern_c=True, keep=True)
        # TODO: should we use async call here, or leave as is - it is just a constructor?

        def _kernel(name, signature):
            # Fetch a kernel and pre-configure it for one block of
            # generatorCount threads.
            f = m.get_function(name)
            f.prepare(signature, block = (generatorCount, 1, 1))
            return f

        p = _kernel("prepare", "PiPi")
        try:
            # Max 256 threads on ION during preparing.  Larger counts
            # fail with: cuLaunchGrid failed: launch out of resources
            p.prepared_call((1, 1), self.state, generatorCount, vector, offset)
        except pycuda._driver.LaunchError:
            raise ValueError("Initialisation failed. Decrease number of threads.")
        self.uniformInt = _kernel("uniformInt", "PPi")
        self.uniformFloat = _kernel("uniformFloat", "PPi")
        self.uniformDouble = _kernel("uniformDouble", "PPi")
        self.normalFloat = _kernel("normalFloat", "PPi")
        self.normalDouble = _kernel("normalDouble", "PPi")
        self.skipAhead = _kernel("skipAhead", "Pii")
        self.skipAheadArray = _kernel("skipAheadArray", "PiP")
    def __del__(self):
        # Guard: __init__ may have raised before self.state was assigned.
        state = getattr(self, "state", None)
        if state is not None:
            state.free()
    def fillInUniformInt(self, data, inputSize, stream = None):
        """Fill inputSize ints at device pointer data with raw curand() values."""
        self.uniformInt.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniformFloat(self, data, inputSize, stream = None):
        """Fill inputSize floats at device pointer data with uniform variates."""
        self.uniformFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniformDouble(self, data, inputSize, stream = None):
        """Fill inputSize doubles at device pointer data with uniform variates."""
        self.uniformDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalFloat(self, data, inputSize, stream = None):
        """Fill inputSize floats at device pointer data with normal variates."""
        self.normalFloat.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInNormalDouble(self, data, inputSize, stream = None):
        """Fill inputSize doubles at device pointer data with normal variates."""
        self.normalDouble.prepared_async_call((1, 1), stream, self.state, data, inputSize)
    def fillInUniform(self, data, stream = None):
        """Fill GPUArray data with uniform variates, dispatching on dtype."""
        if data.dtype == numpy.float32:
            self.fillInUniformFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInUniformDouble(data.gpudata, data.size, stream)
        elif data.dtype in [int, numpy.int32, numpy.uint32]:
            # `int` replaces the deprecated numpy.int alias (removed in
            # NumPy >= 1.24); numpy.int was the builtin int all along.
            self.fillInUniformInt(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def fillInNormal(self, data, stream = None):
        """Fill GPUArray data with normal variates, dispatching on dtype."""
        if data.dtype == numpy.float32:
            self.fillInNormalFloat(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fillInNormalDouble(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def callSkipAhead(self, i, stream = None):
        """Advance every generator's sequence by i steps.

        stream is a backward-compatible addition mirroring
        Randomizer.callSkipAhead; None launches on the default stream.
        """
        self.skipAhead.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i)
    def callSkipAheadArray(self, i, stream = None):
        """Advance each generator by its own step count from int32 GPUArray i.

        stream is a backward-compatible addition mirroring
        Randomizer.callSkipAheadArray; None launches on the default stream.
        """
        self.skipAheadArray.prepared_async_call((1, 1), stream, self.state, self.generatorCount, i.gpudata)
    def __call__(self, shape, dtype=numpy.float32, stream=None):
        """Return a new GPUArray of shape/dtype filled with uniform values."""
        import pycuda.gpuarray
        result = pycuda.gpuarray.GPUArray(shape, dtype)
        self.fillInUniform(result, stream)
        return result
if __name__ == '__main__':
    # Smoke test (Python 2 syntax: bare print statements).
    import pycuda.gpuarray
    import pycuda.autoinit
    # 256 per-thread seeds.  NOTE(review): numpy.random.random() yields
    # floats in [0, 1), so the int32 cast produces mostly zero seeds -
    # presumably only meant as a smoke test; confirm intent.
    rr = Randomizer(256, 0, numpy.random.random(256).astype(numpy.int32))
    data = pycuda.gpuarray.zeros([10000], numpy.float32)
    print data[0:200]
    # Fills only the first 512 of the 10000 elements.
    rr.fillInUniformFloat(data.gpudata, 512)
    print data[0:200]
    data = rr((100, ), numpy.uint32)
    del rr
    print data[0:200]
    # data.gpudata is passed as the Sobol direction-vector pointer.
    data = pycuda.gpuarray.zeros([256], numpy.int)
    rr = QuasiRandomizer(256, data.gpudata, 1)
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
