Issue 83552
Summary [nvptx] Why does clang generate an empty PTX file?
Labels clang
Assignees
Reporter hlyix
    
```
#include "__clang_cuda_builtin_vars.h"
#include "stdio.h"
// Trip count of the j/m/n loops in the kernel `test`; the kernel's local
// tables are declared as 5x5.
#define num_kpt 5
// Value the kernel stores into B_PVT entries whose A_PVT entry matches.
#define MAX_DISTANCE 10000
// Expands to a `for` header that declares `int i`. `i` starts at the calling
// thread's flattened index across a 3D grid of 3D blocks and advances by the
// total number of threads in the launch (block size times grid size) while
// `i < (n)`.
#define CUDA_1D_KERNEL_LOOP(i, n) \
  for (int i = (blockDim.x * blockDim.y * blockDim.z) * blockIdx.z * gridDim.x * gridDim.y + (blockDim.x * blockDim.y * blockDim.z) * blockIdx.y * gridDim.x + (blockDim.x * blockDim.y * blockDim.z) * blockIdx.x + threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z)

// Reproducer kernel for the reported issue. `address` and `m1` are never
// used; the only store to global memory is the write to `dst` inside the
// outer loop.
//
// NOTE(review): several accesses below look like undefined behavior:
//   * B_PVT and A_PVT are uninitialized except for one diagonal element
//     each, yet every element of A_PVT is read in the m/n loops.
//   * min_index equals num_kpt (5), so A_PVT[min_index][4] indexes row 5 of
//     a 5-row array.
//   * B_PVT[threadIdx.x][...] is out of bounds whenever threadIdx.x >= 5;
//     main launches 128 threads per block.
// This may be why clang -O2 emits no code for the kernel -- TODO confirm.
__global__ void test(int *__restrict__ address, int *__restrict__ dst, int n1, int m1, int num_roi) {
  // Per-thread 5x5 scratch tables.
  int B_PVT[5][5];
  int A_PVT[5][5];
  // Only the diagonal element selected by threadIdx.x % 5 is written in each
  // table; all other elements keep indeterminate values.
  B_PVT[threadIdx.x % 5][threadIdx.x % 5] = n1 * threadIdx.x;
  A_PVT[threadIdx.x % 5][threadIdx.x % 5] = n1 * threadIdx.x;
  // `i` is not referenced inside the loop body; it only controls how many
  // times the body runs for this thread.
  CUDA_1D_KERNEL_LOOP(i, num_roi) {
#pragma unroll
 for (int j = 0; j < num_kpt; ++j) {
      // a. find the minimum distance and index in unmatched keypoints
      // min_index is fixed at num_kpt (5), one past the last valid row.
      int min_index = num_kpt;
 int matched_heatmap_index = B_PVT[threadIdx.x][A_PVT[min_index][4]];
      // Overwrite every B_PVT entry whose A_PVT counterpart equals
      // matched_heatmap_index * 996 with MAX_DISTANCE.
#pragma unroll
      for (int m = 0; m < num_kpt; ++m) {
#pragma unroll
        for (int n = 0; n < num_kpt; ++n) {
          if (A_PVT[m][n] == matched_heatmap_index * 996) {
            B_PVT[m][n] = MAX_DISTANCE;
          }
        }
 }
    }
    // Indexed by threadIdx.x only, so threads with the same threadIdx.x in
    // different blocks store to the same dst element.
    dst[threadIdx.x] = B_PVT[threadIdx.x][threadIdx.y];
 }
}

// Prints `what` together with the CUDA error string when `err` is not
// cudaSuccess. Returns true on success so calls can be chained with `&&`.
static bool cudaOk(cudaError_t err, const char *what) {
  if (err == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Host driver for the reproducer: uploads two 5000-element int buffers,
// launches `test` with 4 blocks of 128 threads on a dedicated stream, copies
// the first buffer back and prints its element 1.
//
// Every CUDA call is checked; after the first failure the remaining steps are
// skipped and the process exits with status 1. All host and device resources
// are released on every path.
int main() {
  const int num = 5000;
  const size_t bytes = num * sizeof(int);

  int *a = (int *)malloc(bytes);
  int *b = (int *)malloc(bytes);
  int *c = 0;
  int *d = 0;
  cudaStream_t stream = 0;
  bool haveStream = false;

  bool ok = (a != 0) && (b != 0);
  if (!ok) {
    fprintf(stderr, "host allocation failed\n");
  } else {
    for (int i = 0; i < num; i++) {
      a[i] = 0;
      b[i] = i + 1;
    }
  }

  if (ok) {
    ok = cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate");
    haveStream = ok;
  }

  ok = ok && cudaOk(cudaMalloc((void **)&c, bytes), "cudaMalloc(c)");
  ok = ok && cudaOk(cudaMemcpy(c, a, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(c <- a)");
  ok = ok && cudaOk(cudaMalloc((void **)&d, bytes), "cudaMalloc(d)");
  ok = ok && cudaOk(cudaMemcpy(d, b, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(d <- b)");

  if (ok) {
    test<<<4, 128, 0, stream>>>(c, d, 0, 64, 1);
    // A launch does not return a status; configuration errors are reported
    // through cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "test kernel launch");
  }

  // Faults raised while the kernel runs surface at this synchronization.
  ok = ok && cudaOk(cudaStreamSynchronize(stream), "cudaStreamSynchronize");

  // NOTE(review): the kernel stores only through its `dst` argument (d), while
  // this copies back `c`, so a[1] reflects the value uploaded from `a`.
  // Kept as in the original report -- confirm whether `d` was intended.
  ok = ok && cudaOk(cudaMemcpy(a, c, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(a <- c)");

  if (ok) {
    printf("%d", a[1]);
  }

  // Cleanup runs on success and failure alike; cudaFree and free accept null.
  if (haveStream) {
    cudaStreamDestroy(stream);
  }
  cudaFree(d);
  cudaFree(c);
  free(b);
  free(a);

  return ok ? 0 : 1;
}

```
clang++ -O2  test.cu --cuda-device-only -S -o test.s


I get
![image](https://github.com/llvm/llvm-project/assets/31645239/0f986ae0-d263-485c-8336-31049939259a)


but nvcc gets:
![image](https://github.com/llvm/llvm-project/assets/31645239/b307e16b-22e1-425c-bc84-d2765e343501)





_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to