Finally, I managed to call custom cuda kernels from Nim. [Gist](https://gist.github.com/mratsim/dfbd944f64181727a97dffb30b8cbd0a).
Well, basically I lost the fights to: 1. Metaprogramming Cuda from Nim. I would have to prevent Nim from "optimizing" the Cuda part and using CPU optimization. 2. Getting a pointer to the function on the GPU. My attempts returned the pointer to the function on the host which is useless … 3. Calling the GPU function directly by emitting Nim C code. I would have to emit the Cuda function in the same file so either as text or metaprogramming. And ended up writing C but oh well. The code: **square.cu** #include "square.cuh" __global__ void square(float * d_out, float * d_in){ int idx = threadIdx.x; float f = d_in[idx]; d_out[idx] = f * f; } void cuda_square(int bpg, int tpb, float * d_out, float * d_in){ square<<<bpg,tpb>>>(d_out, d_in); } **square.cuh** #include "cuda.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" void cuda_square(int bpg, int tpb, float * d_out, float * d_in); **call_cuda.nim** import nimcuda/[cuda_runtime_api, driver_types, nimcuda] import sequtils, future type GpuArray[T: SomeReal] = object data: ref[ptr T] len: int {.compile: "./square.cu".} proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header:"../square.cuh".} #../square.cuh is a workaround because header is not copied to nimcache ## Compute the square of x and store it in y ## bpg: BlocksPerGrid ## tpb: ThreadsPerBlock proc cudaMalloc[T](size: int): ptr T {.noSideEffect.}= let s = size * sizeof(T) check cudaMalloc(cast[ptr pointer](addr result), s) proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}= if not p[].isNil: check cudaFree(p[]) proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.}= new(result.data, deallocCuda) result.len = len result.data[] = cudaMalloc[T](result.len) proc cuda[T:SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.}= result = newGpuArray[T](s.len) let size = result.len * sizeof(T) check cudaMemCpy(result.data[], unsafeAddr s[0], size, cudaMemcpyHostToDevice) proc cpu[T:SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.}= result = newSeq[T](g.len) let size = result.len * sizeof(T) check cudaMemCpy(addr result[0], g.data[], size, cudaMemcpyDeviceToHost) proc main() = let a = newSeq[float32](64) let b = toSeq(0..63).map(x => x.float32) echo a echo b var u = a.cuda let v = b.cuda cuda_square(1.cint, 64.cint, u.data[],v.data[]) check cudaDeviceSynchronize() let z = u.cpu echo z main() ## Output: # @[0.0, 0.0, 0.0, 0.0, 0.0, ...] # @[0.0, 1.0, 2.0, 3.0, 4.0, ...] # @[0.0, 1.0, 4.0, 9.0, 16.0, ...] Thanks andrea, jcosborn and Araq in particular for tooling and inspiration.