hello,,, i'm trying to translate from cu code to pycuda but got unexpected result because the flops value is too far i think it because i don't really know how to translate it in pycuda...
especially in CUDA_CALL_SAFE and measure the time.. regards
cudaflops.cu
Description: application/cu-seeme
import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import time import math mod = SourceModule(""" #define NUM_SMS (24) #define NUM_THREADS_PER_SM (384) #define NUM_THREADS_PER_BLOCK (192) #define NUM_BLOCKS ((NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS) #define NUM_ITERATIONS 32 // 128 MAD instructions #define FMAD128(a, b)\ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ a = b * a + b; \ b = a * b + a; \ __shared__ float result[NUM_THREADS_PER_BLOCK]; __global__ void gflops() { float a = result[threadIdx.x]; // this ensures the mads don't get compiled out float b = 1.01f; for (int i = 0; i < NUM_ITERATIONS; i++) { FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); FMAD128(a, b); } result[threadIdx.x] = a + b; } """) func = mod.get_function("gflops") start = time.time() varblock1 = 384/192*24 func( block=(varblock1, 10, 1)) stop = time.time() seconds = stop-start print (stop-start) ,'s' flops = 128 * 2 * 16 * 32 * 384/192*24 *192 #128 * 2 * 16 * NUM_ITERATIONS * NUM_BLOCKS * NUM_THREADS_PER_BLOCK; print "gflops : ", (flops/(seconds))/math.e**9
_______________________________________________ PyCUDA mailing list PyCUDA@tiker.net http://lists.tiker.net/listinfo/pycuda