Finally, I managed to call custom cuda kernels from Nim. 
[Gist](https://gist.github.com/mratsim/dfbd944f64181727a97dffb30b8cbd0a).

Well, basically, these are the fights I lost:

  1. Metaprogramming Cuda from Nim. I would have had to prevent Nim from 
"optimizing" the Cuda part with CPU-oriented optimizations.
  2. Getting a pointer to the function on the GPU. My attempts returned the 
pointer to the function on the host, which is useless.
  3. Calling the GPU function directly by emitting C code from Nim. I would 
have had to emit the Cuda function into the same file, either as raw text or 
via metaprogramming.



So I ended up writing the Cuda part in plain C — but oh well, it works.

The code:

**square.cu**
    
    
    #include "square.cuh"
    
    __global__ void square(float * d_out, float * d_in){
        // One thread per element: thread i writes d_in[i]^2 to d_out[i].
        // Indexing uses only threadIdx.x (no blockIdx term), so this is
        // only correct for a single-block launch with tpb >= element count
        // — matches the <<<1, 64>>> launch in the Nim demo below.
        int idx = threadIdx.x;
        float f = d_in[idx];
        d_out[idx] = f * f;
    }
    
    
    void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
        // Plain-C entry point so Nim can bind it via importc without ever
        // seeing CUDA's <<<...>>> launch syntax.
        // bpg: blocks per grid, tpb: threads per block.
        square<<<bpg,tpb>>>(d_out, d_in);
    }
    

**square.cuh**
    
    
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    
    
    // Host-callable wrapper: launches the `square` kernel with bpg blocks
    // of tpb threads, squaring d_in into d_out (both device pointers).
    void cuda_square(int bpg, int tpb, float * d_out, float * d_in);
    

**call_cuda.nim**
    
    
    import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
    import sequtils, future
    
    type GpuArray[T: SomeReal] = object
      data: ref[ptr T]  # ref wrapper so a finalizer can cudaFree the device ptr
      len: int          # number of T elements in the device buffer
    
    {.compile: "./square.cu".}  # compile the .cu alongside the Nim sources
    proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, 
    header:"../square.cuh".}
    # ../square.cuh is a workaround: the header is not copied into nimcache,
    # so the include path is given relative to the generated C files.
    ## Compute the square of x and store it in y (both device pointers).
    ## bpg: BlocksPerGrid
    ## tpb: ThreadsPerBlock
    
    proc cudaMalloc[T](size: int): ptr T {.noSideEffect.}=
      ## Allocate device memory for `size` elements of T and return the
      ## device pointer. `check` raises on a non-success CUDA status.
      let bytes = size * sizeof(T)
      check cudaMalloc(cast[ptr pointer](result.addr), bytes)
    
    proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}=
      ## Finalizer for GpuArray storage: frees the device allocation, if any.
      if p[].isNil:
        return
      check cudaFree(p[])
    
    proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.}=
      ## Create a GpuArray of `len` elements backed by freshly allocated
      ## device memory; `deallocCuda` is attached as the finalizer.
      new(result.data, deallocCuda)
      result.len = len
      result.data[] = cudaMalloc[T](len)
    
    proc cuda[T:SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.}=
      ## Upload host sequence `s` into a newly allocated device buffer.
      result = newGpuArray[T](s.len)
      let bytes = s.len * sizeof(T)
      # Host -> device copy of the whole sequence.
      check cudaMemCpy(result.data[],
                       unsafeAddr s[0],
                       bytes,
                       cudaMemcpyHostToDevice)
    
    proc cpu[T:SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.}=
      ## Download the device buffer `g` into a new host sequence.
      result = newSeq[T](g.len)
      let bytes = g.len * sizeof(T)
      # Device -> host copy of the whole buffer.
      check cudaMemCpy(addr result[0],
                       g.data[],
                       bytes,
                       cudaMemcpyDeviceToHost)
    
    
    proc main() =
      ## Round-trip demo: upload 0..63, square on the GPU, download, print.
      let a = newSeq[float32](64)             # zero-filled; becomes the output
      let b = toSeq(0..63).mapIt(it.float32)  # 0.0, 1.0, ..., 63.0
    
      echo a
      echo b
    
      var u = a.cuda   # device buffer that will receive the squares
      let v = b.cuda   # device buffer holding the input values
    
      # One block of 64 threads: one thread per element (see square.cu).
      cuda_square(1.cint, 64.cint, u.data[], v.data[])
    
      check cudaDeviceSynchronize()
    
      let z = u.cpu
      echo z
    
    main()
    ## Output:
    
    # @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
    # @[0.0, 1.0, 2.0, 3.0, 4.0, ...]
    # @[0.0, 1.0, 4.0, 9.0, 16.0, ...]
    

Thanks in particular to andrea, jcosborn and Araq for the tooling and inspiration. 

Reply via email to