kevinsala wrote:

My understanding is that the CUDA Driver API requires the arguments in the 
buffer to be placed with the proper alignment (i.e., padding between fields). 
However, the trailing padding after the last element should not be accounted. 
Otherwise, if it's accounted, the `cuLaunchKernel` call fails.

The documentation of `cuLaunchKernel` says:

> Kernel parameters can also be packaged by the application into a single 
> buffer that is passed in via the extra parameter. This places the burden on 
> the application of knowing each kernel parameter's size and alignment/padding 
> within the buffer.

and

> CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra 
> will be a pointer to a size_t containing the size of the buffer specified 
> with CU_LAUNCH_PARAM_BUFFER_POINTER.

The following code is a simple reproducer that works directly on top of the 
CUDA Driver API:

`kernel.cu`:
```c++
extern "C" __global__ void kernel(int *arg1, short arg2, int *arg3, short arg4) 
{
  *arg1 = arg2;
  *arg3 = arg4;
}
```

`main.cu`:
```c++
#include <cstdio>
#include <cuda.h>

#define CU_CHECK(err) \
  do { \
    CUresult err__ = (err); \
    if (err__ != CUDA_SUCCESS) { \
      const char *errStr; \
      cuGetErrorString(err__, &errStr); \
      fprintf(stderr, "Error: %s\n", errStr ? errStr : "Unknown"); \
      exit(1); \
    } \
  } while (0)


int main(int argc, char **argv) {
  CU_CHECK(cuInit(0));

  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, 0));

  CUcontext context;
  CU_CHECK(cuCtxCreate(&context, 0, device));

  CUmodule module;
  CU_CHECK(cuModuleLoad(&module, "kernel.cubin"));

  CUfunction kernel;
  CU_CHECK(cuModuleGetFunction(&kernel, module, "kernel"));

  CUdeviceptr d_arg1, d_arg3;
  CU_CHECK(cuMemAlloc(&d_arg1, sizeof(int)));
  CU_CHECK(cuMemAlloc(&d_arg3, sizeof(int)));

  short arg2 = 2, arg4 = 4;

  struct Args {
    CUdeviceptr arg1;
    short arg2;
    CUdeviceptr arg3;
    short arg4;
  };

  Args args = { d_arg1, arg2, d_arg3, arg4 };
  size_t size = 8 + 8 + 8 + 2; // OK
  // size_t size = sizeof(Args); // ERROR

  void *config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
                     reinterpret_cast<void *>(&size),
                     CU_LAUNCH_PARAM_END };

  CU_CHECK(cuLaunchKernel(
    kernel, 1, 1, 1, 1, 1, 1,
    0, 0, nullptr, config
  ));

  CU_CHECK(cuCtxSynchronize());

  int h_out = 0;
  CU_CHECK(cuMemcpyDtoH(&h_out, d_arg1, sizeof(int)));
  printf("Result from kernel: %d\n", h_out);

  CU_CHECK(cuMemFree(d_arg1));
  CU_CHECK(cuMemFree(d_arg3));
  CU_CHECK(cuModuleUnload(module));
  CU_CHECK(cuCtxDestroy(context));

  return 0;
}
```

Commands to build the reproducer:
```sh
nvcc -arch=sm_90 --cubin kernel.cu -o kernel.cubin
nvcc -arch=sm_90 main.cu -o main -lcuda
./main
```

The work works at is it, passing the size skipping the trailing padding. If it 
is replaced by `sizeof(Args)`, the CUDA call fails.

https://github.com/llvm/llvm-project/pull/156229
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to