hello,,,

i'm trying to translate from cu code to pycuda but got unexpected result
because the flops value is too far
i think it because i don't really know how to translate it in pycuda...

especially in CUDA_CALL_SAFE and measure the time..

regards

Attachment: cudaflops.cu
Description: application/cu-seeme

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import time
import math

mod = SourceModule("""
    #define NUM_SMS (24)
    #define NUM_THREADS_PER_SM (384)
    #define NUM_THREADS_PER_BLOCK (192)
    #define NUM_BLOCKS ((NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS)
    #define NUM_ITERATIONS 32
     
    // 128 MAD instructions
    #define FMAD128(a, b)\
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
         a = b * a + b; \
         b = a * b + a; \
     
    __shared__ float result[NUM_THREADS_PER_BLOCK];
     
    __global__ void gflops()
    {
       float a = result[threadIdx.x];  // this ensures the mads don't get compiled out
       float b = 1.01f;
     
       for (int i = 0; i < NUM_ITERATIONS; i++)
       {
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);
           FMAD128(a, b);   
           FMAD128(a, b);
       }
       result[threadIdx.x] = a + b;
    }
    """)

func = mod.get_function("gflops")

start = time.time()
varblock1 = 384/192*24
func( block=(varblock1, 10, 1))
stop = time.time()
seconds = stop-start
print (stop-start) ,'s'

flops = 128 * 2 * 16 * 32 * 384/192*24 *192 #128 * 2 * 16 * NUM_ITERATIONS * NUM_BLOCKS * NUM_THREADS_PER_BLOCK;
print "gflops : ", (flops/(seconds))/math.e**9

_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda

Reply via email to