Hello. I attach a patch bringing new CURAND 4 features to PyCUDA. Because there are more Sobol generator types, I decided to introduce inheritance: _SobolRandomNumberGeneratorBase and _ScrambledSobolRandomNumberGeneratorBase both inherit from _RandomNumberGeneratorBase and are inherited by the appropriate Sobol{32,64}RandomNumberGenerator classes. This leaves some repetition in Sobol64 and ScrambledSobol64 concerning methods accepting 64-bit arguments, but I do not see how to solve that without complicating the code.
I have not yet added curand_log_normal kernels - unlike other kernels they require additional parameters. I wanted to focus on bringing new types to PyCUDA first, and only then try to extend classes by adding new kernels. Best regards. -- Tomasz Rybak <bogom...@post.pl> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py index 4a4bd7e..f236ad6 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -258,6 +258,10 @@ if get_curand_version() >= (3, 2, 0): direction_vector_set = _curand.direction_vector_set _get_direction_vectors = _curand._get_direction_vectors +if get_curand_version() >= (3, 2, 0): + _get_scramble_constants32 = _curand._get_scramble_constants32 + _get_scramble_constants64 = _curand._get_scramble_constants64 + # {{{ Base class gen_template = """ @@ -279,14 +283,14 @@ extern "C" %(generators)s -__global__ void skip_ahead(%(state_type)s *s, const int n, const int skip) +__global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned int skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead(skip, &s[idx]); } -__global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip) +__global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned int *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) @@ -529,20 +533,158 @@ if get_curand_version() >= (3, 2, 0): # }}} -# {{{ Sobol32 RNG +# {{{ Sobol RNG def generate_direction_vectors(count, direction=None): - if direction is None: - direction = direction_vector_set.VECTOR_32 - - result = np.empty((count, 32), dtype=np.int32) + if get_curand_version() >= (4, 0, 0): + if direction == direction_vector_set.VECTOR_64 or \ + direction == direction_vector_set.SCRAMBLED_VECTOR_64: + result = np.empty((count, 64), dtype=np.uint64) + else: + result = np.empty((count, 32), dtype=np.uint32) + else: + result = np.empty((count, 32), dtype=np.uint32) _get_direction_vectors(direction, result, count) return pycuda.gpuarray.to_gpu(result) +if get_curand_version() >= (4, 0, 0): + def generate_scramble_constants32(count): + result = np.empty((count, ), dtype=np.uint32) + _get_scramble_constants32(result, count) + return pycuda.gpuarray.to_gpu(result) + + def generate_scramble_constants64(count): + result 
= np.empty((count, ), dtype=np.uint64) + _get_scramble_constants64(result, count) + return pycuda.gpuarray.to_gpu(result) + +class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): + """ + Class surrounding CURAND kernels from CUDA 3.2. + It allows for generating quasi-random numbers with uniform + and normal probability function of type int, float, and double. + """ + + has_box_muller = False + + def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, + dir_vector_set, offset, state_type, sobol_random_source): + super(_SobolRandomNumberGeneratorBase, self).__init__(state_type, + sobol_random_source) + + if dir_vector is None: + dir_vector = generate_direction_vectors( + self.block_count * self.generators_per_block, dir_vector_set) + + if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + raise TypeError("seed must be GPUArray of integers of right length") + + p = self.module.get_function("prepare") + p.prepare("PiPi", block=(self.generators_per_block, 1, 1)) + + from pycuda.characterize import has_stack + has_stack = has_stack() + + if has_stack: + prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) + + try: + if has_stack: + drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + try: + + dev = drv.Context.get_device() + if dev.compute_capability() >= (2, 0): + p.prepared_call((self.block_count, 1), self.state, + self.block_count * self.generators_per_block, dir_vector.gpudata, offset) + else: + p.prepared_call((2 * self.block_count, 1), self.state, + self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset) + except drv.LaunchError: + raise ValueError("Initialisation failed. 
Decrease number of threads.") + + finally: + if has_stack: + drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) + + def _kernels(self): + return (_RandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("prepare")]) + +class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): + """ + Class surrounding CURAND kernels from CUDA 4.0. + It allows for generating quasi-random numbers with uniform + and normal probability function of type int, float, and double. + """ + + has_box_muller = False + + def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, + dir_vector_set, scramble_vector, scramble_vector_function, + offset, state_type, sobol_random_source): + super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type, + sobol_random_source) + + if dir_vector is None: + dir_vector = generate_direction_vectors( + self.block_count * self.generators_per_block, + dir_vector_set) + + if scramble_vector is None: + scramble_vector = scramble_vector_function( + self.block_count * self.generators_per_block) + + if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + raise TypeError("seed must be GPUArray of integers of right length") + + if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray) + and scramble_vector.dtype == dir_vector_dtype + and scramble_vector.shape == (self.block_count * self.generators_per_block, )): + raise TypeError("scramble must be GPUArray of integers of right length") + + p = self.module.get_function("prepare") + p.prepare("PiPPi", block=(self.generators_per_block, 1, 1)) + + from pycuda.characterize import has_stack + has_stack = has_stack() + + if has_stack: + prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) + + try: + if has_stack: + drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + try: + + dev = drv.Context.get_device() + 
if dev.compute_capability() >= (2, 0): + p.prepared_call((self.block_count, 1), self.state, + self.block_count * self.generators_per_block, + dir_vector.gpudata, scramble_vector.gpudata, offset) + else: + p.prepared_call((2 * self.block_count, 1), self.state, + self.block_count * self.generators_per_block // 2, + dir_vector.gpudata, scramble_vector.gpudata, offset) + except drv.LaunchError: + raise ValueError("Initialisation failed. Decrease number of threads.") + + finally: + if has_stack: + drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) + + def _kernels(self): + return (_RandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("prepare")]) + sobol32_random_source = """ extern "C" { -__global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVectors32_t *v, - const unsigned int o) +__global__ void prepare(curandStateSobol32 *s, const int n, + curandDirectionVectors32_t *v, const unsigned int o) { const int id = blockIdx.x*blockDim.x+threadIdx.x; if (id < n) @@ -552,60 +694,150 @@ __global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVecto """ if get_curand_version() >= (3, 2, 0): - class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase): + class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. 
""" - has_box_muller = False - def __init__(self, dir_vector=None, offset=0): - super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32', - sobol32_random_source) + super(Sobol32RandomNumberGenerator, self).__init__(dir_vector, + np.uint32, 32, direction_vector_set.VECTOR_32, offset, + 'curandStateSobol32', sobol32_random_source) - if dir_vector is None: - dir_vector = generate_direction_vectors( - self.block_count * self.generators_per_block) +scrambledsobol32_random_source = """ +extern "C" { +__global__ void prepare(curandStateScrambledSobol32 *s, const int n, + curandDirectionVectors32_t *v, unsigned int *scramble, const unsigned int o) +{ + const int id = blockIdx.x*blockDim.x+threadIdx.x; + if (id < n) + curand_init(v[id], scramble[id], o, &s[id]); +} +} +""" - if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) - and dir_vector.dtype == np.int32 - and dir_vector.shape == (self.block_count * self.generators_per_block, 32)): - raise TypeError("seed must be GPUArray of integers of right length") +if get_curand_version() >= (4, 0, 0): + class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + """ + Class surrounding CURAND kernels from CUDA 4.0. + It allows for generating quasi-random numbers with uniform + and normal probability function of type int, float, and double. 
+ """ - p = self.module.get_function("prepare") - p.prepare("PiPi", block=(self.generators_per_block, 1, 1)) + def __init__(self, dir_vector=None, scramble_vector=None, offset=0): + super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector, + np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32, + scramble_vector, generate_scramble_constants32, offset, + 'curandStateScrambledSobol32', scrambledsobol32_random_source) - from pycuda.characterize import has_stack - has_stack = has_stack() +sobol64_random_source = """ +extern "C" { +__global__ void prepare(curandStateSobol64 *s, const int n, curandDirectionVectors64_t *v, + const unsigned int o) +{ + const int id = blockIdx.x*blockDim.x+threadIdx.x; + if (id < n) + curand_init(v[id], o, &s[id]); +} +} +""" - if has_stack: - prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) +random_skip_ahead64_source = """ +extern "C" { +__global__ void skip_ahead64(%(state_type)s *s, const int n, const unsigned long long skip) +{ + const int idx = blockIdx.x*blockDim.x+threadIdx.x; + if (idx < n) + skipahead(skip, &s[idx]); +} - try: - if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k - try: +__global__ void skip_ahead_array64(%(state_type)s *s, const int n, const unsigned long long *skip) +{ + const int idx = blockIdx.x*blockDim.x+threadIdx.x; + if (idx < n) + skipahead(skip[idx], &s[idx]); +} +} +""" - dev = drv.Context.get_device() - if dev.compute_capability() >= (2, 0): - p.prepared_call((self.block_count, 1), self.state, - self.block_count * self.generators_per_block, dir_vector.gpudata, offset) - else: - p.prepared_call((2 * self.block_count, 1), self.state, - self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset) - except drv.LaunchError: - raise ValueError("Initialisation failed. 
Decrease number of threads.") +if get_curand_version() >= (4, 0, 0): + class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase): + """ + Class surrounding CURAND kernels from CUDA 4.0. + It allows for generating quasi-random numbers with uniform + and normal probability function of type int, float, and double. + """ - finally: - if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) + def __init__(self, dir_vector=None, offset=0): + super(Sobol64RandomNumberGenerator, self).__init__(dir_vector, + np.uint64, 64, direction_vector_set.VECTOR_64, offset, + 'curandStateSobol64', sobol64_random_source+random_skip_ahead64_source) + + self.skip_ahead64 = self.module.get_function("skip_ahead64") + self.skip_ahead64.prepare("Pii") + self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64") + self.skip_ahead_array64.prepare("PiP") def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")]) + return (_SobolRandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")]) + + def call_skip_ahead64(self, i, stream=None): + self.skip_ahead64.prepared_async_call( + (self.block_count, 1), (self.generators_per_block, 1, 1), stream, + self.state, self.generators_per_block, i) + def call_skip_ahead_array64(self, i, stream=None): + self.skip_ahead_array64.prepared_async_call( + (self.block_count, 1), (self.generators_per_block, 1, 1), stream, + self.state, self.generators_per_block, i.gpudata) + +scrambledsobol64_random_source = """ +extern "C" { +__global__ void prepare(curandStateScrambledSobol64 *s, const int n, + curandDirectionVectors64_t *v, unsigned long long *scramble, const unsigned int o) +{ + const int id = blockIdx.x*blockDim.x+threadIdx.x; + if (id < n) + curand_init(v[id], scramble[id], o, &s[id]); +} +} +""" + +if get_curand_version() >= (4, 0, 0): + class 
ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + """ + Class surrounding CURAND kernels from CUDA 4.0. + It allows for generating quasi-random numbers with uniform + and normal probability function of type int, float, and double. + """ + + def __init__(self, dir_vector=None, scramble_vector=None, offset=0): + super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector, + np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64, + scramble_vector, generate_scramble_constants64, offset, + 'curandStateScrambledSobol64', sobol64_random_source+random_skip_ahead64_source) + + self.skip_ahead64 = self.module.get_function("skip_ahead64") + self.skip_ahead64.prepare("Pii") + self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64") + self.skip_ahead_array64.prepare("PiP") + + def _kernels(self): + return (_ScrambledSobolRandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")]) + + def call_skip_ahead64(self, i, stream=None): + self.skip_ahead64.prepared_async_call( + (self.block_count, 1), (self.generators_per_block, 1, 1), stream, + self.state, self.generators_per_block, i) + + def call_skip_ahead_array64(self, i, stream=None): + self.skip_ahead_array64.prepared_async_call( + (self.block_count, 1), (self.generators_per_block, 1, 1), stream, + self.state, self.generators_per_block, i.gpudata) # }}} # }}} diff --git a/src/cpp/curand.hpp b/src/cpp/curand.hpp index 9783239..b3d015d 100644 --- a/src/cpp/curand.hpp +++ b/src/cpp/curand.hpp @@ -52,19 +52,94 @@ namespace pycuda { namespace curandom { if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) throw py::error_already_set(); - if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set) { + if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set +#if CUDAPP_CUDA_VERSION >= 4000 + || CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 == set +#endif + ) { curandDirectionVectors32_t *vectors; 
CURAND_CALL_GUARDED(curandGetDirectionVectors32, (&vectors, set)); while (count > 0) { int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors32_t); - memcpy((int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size); + memcpy((unsigned int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size); count -= size/sizeof(curandDirectionVectors32_t); n++; } } +#if CUDAPP_CUDA_VERSION >= 4000 + if (CURAND_DIRECTION_VECTORS_64_JOEKUO6 == set + || CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 == set) { + curandDirectionVectors64_t *vectors; + CURAND_CALL_GUARDED(curandGetDirectionVectors64, (&vectors, set)); + while (count > 0) { + int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors64_t); + memcpy((unsigned long long *)buf+n*20000*sizeof(curandDirectionVectors64_t)/sizeof(unsigned long long), vectors, size); + count -= size/sizeof(curandDirectionVectors64_t); + n++; + } + } +#endif + } +#endif + +#if CUDAPP_CUDA_VERSION >= 4000 + void py_curand_get_scramble_constants32(py::object dst, int count) + { + void *buf; + PYCUDA_BUFFER_SIZE_T len; + int n = 0; + + if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) + throw py::error_already_set(); + unsigned int *vectors; + CURAND_CALL_GUARDED(curandGetScrambleConstants32, (&vectors)); +// Documentation does not mention number of dimensions +// Assuming the same as in getDirectionVectors* + while (count > 0) { + int size = ((count > 20000) ? 
20000 : count)*sizeof(unsigned int); + memcpy((unsigned int *)buf+n*20000, vectors, size); + count -= size/sizeof(unsigned int); + n++; + } + } + + void py_curand_get_scramble_constants64(py::object dst, int count) + { + void *buf; + PYCUDA_BUFFER_SIZE_T len; + int n = 0; + + if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) + throw py::error_already_set(); + unsigned long long *vectors; + CURAND_CALL_GUARDED(curandGetScrambleConstants64, (&vectors)); +// Documentation does not mention number of dimensions +// Assuming the same as in getDirectionVectors* + while (count > 0) { + int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned long long); + memcpy((unsigned long long *)buf+n*20000, vectors, size); + count -= size/sizeof(unsigned long long); + n++; + } } #endif +// TODO: add more methods in classes (log) +// log_normal(state, mean, stddev) +// log_normal(Scrambled64) +// log_normal_double(Scrambled64) +// log_normal(Sobol64) +// log_normal_double(Sobol64) +// log_normal(Scrambled32) +// log_normal_double(Scrambled32) +// log_normal(Sobol32) +// log_normal_double(Sobol32) +// +// log_normal(XORWOW) +// log_normal_double(XORWOW) +// log_normal2(XORWOW) +// log_normal2_double(XORWOW) +// } } #endif diff --git a/src/wrapper/wrap_curand.cpp b/src/wrapper/wrap_curand.cpp index 4ca1e42..681c339 100644 --- a/src/wrapper/wrap_curand.cpp +++ b/src/wrapper/wrap_curand.cpp @@ -19,6 +19,11 @@ void pycuda_expose_curand() #if CUDAPP_CUDA_VERSION >= 3020 py::enum_<curandDirectionVectorSet_t>("direction_vector_set") .value("VECTOR_32", CURAND_DIRECTION_VECTORS_32_JOEKUO6) +#if CUDAPP_CUDA_VERSION >= 4000 + .value("SCRAMBLED_VECTOR_32", CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6) + .value("VECTOR_64", CURAND_DIRECTION_VECTORS_64_JOEKUO6) + .value("SCRAMBLED_VECTOR_64", CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6) +#endif ; #endif @@ -28,4 +33,11 @@ void pycuda_expose_curand() py::def("_get_direction_vectors", py_curand_get_direction_vectors, (arg("set"), arg("dst"),
arg("count"))); #endif + +#if CUDAPP_CUDA_VERSION >= 4000 + py::def("_get_scramble_constants32", py_curand_get_scramble_constants32, + (arg("dst"), arg("count"))); + py::def("_get_scramble_constants64", py_curand_get_scramble_constants64, + (arg("dst"), arg("count"))); +#endif }
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list PyCUDA@tiker.net http://lists.tiker.net/listinfo/pycuda