Hello,
here is the alternative implementation of curandom.rand, the
md5 crypto hash function of threadIdx, blockIdx, seed and
other variables is used to generate random numbers (thus
there should be no issues with combining results from
several instances).
It is about twice slower than present (not working) rand
function. However 4x speed up could be achieved by using all
16 bytes returned by md5.
Please CC me on the replies since I am not on pycuda list
Bye
Jozef Vesely
[email protected]
#!/usr/bin/python
import numpy
import pycuda.driver as drv
from pytools import memoize
@memoize
def _get_random_kernel():
mod = drv.SourceModule("""
/*
**********************************************************************
** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
** **
** License to copy and use this software is granted provided that **
** it is identified as the "RSA Data Security, Inc. MD5 Message **
** Digest Algorithm" in all material mentioning or referencing this **
** software or this function. **
** **
** License is also granted to make and use derivative works **
** provided that such works are identified as "derived from the RSA **
** Data Security, Inc. MD5 Message Digest Algorithm" in all **
** material mentioning or referencing the derived work. **
** **
** RSA Data Security, Inc. makes no representations concerning **
** either the merchantability of this software or the suitability **
** of this software for any particular purpose. It is provided "as **
** is" without express or implied warranty of any kind. **
** **
** These notices must be retained in any copies of any part of this **
** documentation and/or software. **
**********************************************************************
*/
/* F, G and H are basic MD5 functions: selection, majority, parity */
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))
/* ROTATE_LEFT rotates x left n bits */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
/* Rotation is separate from addition to prevent recomputation */
#define FF(a, b, c, d, x, s, ac) \
{(a) += F ((b), (c), (d)) + (x) + (ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define GG(a, b, c, d, x, s, ac) \
{(a) += G ((b), (c), (d)) + (x) + (ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define HH(a, b, c, d, x, s, ac) \
{(a) += H ((b), (c), (d)) + (x) + (ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define II(a, b, c, d, x, s, ac) \
{(a) += I ((b), (c), (d)) + (x) + (ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define X0 threadIdx.x
#define X1 threadIdx.y
#define X2 threadIdx.z
#define X3 blockIdx.x
#define X4 blockIdx.y
#define X5 blockIdx.z
#define X6 seed
#define X7 i
#define X8 n
#define X9 blockDim.x
#define X10 blockDim.y
#define X11 blockDim.z
#define X12 gridDim.x
#define X13 gridDim.y
#define X14 gridDim.z
#define X15 0
__global__ void md5(float *dest, uint seed, int n){
const int start = blockDim.x*blockIdx.x + threadIdx.x;
const int step = gridDim.x*blockDim.x;
int i;
for (i = start; i<n; i += step) {
uint a = 0x67452301;
uint b = 0xefcdab89;
uint c = 0x98badcfe;
uint d = 0x10325476;
/* Round 1 */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */
FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */
FF ( c, d, a, b, X2 , S13, 606105819); /* 3 */
FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */
FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */
FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */
FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */
FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */
FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */
FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */
FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */
FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */
FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */
FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */
FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */
FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */
/* Round 2 */
#define S21 5
#define S22 9
#define S23 14
#define S24 20
GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */
GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */
GG ( c, d, a, b, X11, S23, 643717713); /* 19 */
GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */
GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */
GG ( d, a, b, c, X10, S22, 38016083); /* 22 */
GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */
GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */
GG ( a, b, c, d, X9 , S21, 568446438); /* 25 */
GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */
GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */
GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */
GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */
GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */
GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */
GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */
/* Round 3 */
#define S31 4
#define S32 11
#define S33 16
#define S34 23
HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */
HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */
HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */
HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */
HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */
HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */
HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */
HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */
HH ( a, b, c, d, X13, S31, 681279174); /* 41 */
HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */
HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */
HH ( b, c, d, a, X6 , S34, 76029189); /* 44 */
HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */
HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */
HH ( c, d, a, b, X15, S33, 530742520); /* 47 */
HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */
/* Round 4 */
#define S41 6
#define S42 10
#define S43 15
#define S44 21
II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */
II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */
II ( c, d, a, b, X14, S43, 2878612391); /* 51 */
II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */
II ( a, b, c, d, X12, S41, 1700485571); /* 53 */
II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */
II ( c, d, a, b, X10, S43, 4293915773); /* 55 */
II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */
II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */
II ( d, a, b, c, X15, S42, 4264355552); /* 58 */
II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */
II ( b, c, d, a, X13, S44, 1309151649); /* 60 */
II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */
II ( d, a, b, c, X11, S42, 3174756917); /* 62 */
II ( c, d, a, b, X2 , S43, 718787259); /* 63 */
II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */
a += 0x67452301;
b += 0xefcdab89;
c += 0x98badcfe;
d += 0x10325476;
dest[i] = a/4294967296.0;
}
}
""")
func = mod.get_function("md5")
func.prepare("Pii", (1,1,1))
return func
def md5_rand(shape):
from pycuda.gpuarray import GPUArray
result = GPUArray(shape, numpy.float32)
import random
func = _get_random_kernel()
func.set_block_shape(*result._block)
func.prepared_async_call(result._grid, result.stream,
result.gpudata, random.random(), result.size)
return result
if __name__ == "__main__":
import pycuda.autoinit
from pylab import *
from pycuda.curandom import rand
if 0:
N = 2**27
print N*4./1024/1024, "MB"
r = md5_rand((N,))
print "generated"
r.get().tofile("diehard/random.dat")
print "written"
if 1:
N = 500
subplot(211)
r1 = md5_rand((N,))
plot( r1.get(),"x-")
subplot(212)
r2 = rand((N,))
plot( r2.get(),"x-")
show()
_______________________________________________
PyCuda mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net