| Issue |
83552
|
| Summary |
[nvptx] Why does clang generate an empty PTX file?
|
| Labels |
clang
|
| Assignees |
|
| Reporter |
hlyix
|
```
#include "__clang_cuda_builtin_vars.h"
#include "stdio.h"
// Trip count of the j/m/n loops in the kernel below and the initial value of
// its `min_index` (presumably "number of keypoints" -- confirm with reporter).
#define num_kpt 5
// Value the kernel stores into B_PVT entries whose A_PVT counterpart matches.
#define MAX_DISTANCE 10000
// Loop-header macro: expands to a `for` statement that declares `int i`.
//   start : the calling thread's linearized index over the whole launch
//           (blocks flattened in z, y, x order, then threads within the block
//           flattened in z, y, x order)
//   bound : i < (n)
//   step  : total number of threads in the launch
//           (blockDim.x*y*z * gridDim.x*y*z)
// The index arithmetic is done on the builtin variables' own type and then
// stored in a plain `int`.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = (blockDim.x * blockDim.y * blockDim.z) * blockIdx.z * gridDim.x * gridDim.y + (blockDim.x * blockDim.y * blockDim.z) * blockIdx.y * gridDim.x + (blockDim.x * blockDim.y * blockDim.z) * blockIdx.x + threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z)
// Reproducer kernel. Declares two local 5x5 int arrays, assigns one diagonal
// element of each, runs a nested scan over them inside CUDA_1D_KERNEL_LOOP,
// and stores one B_PVT element into dst[threadIdx.x].
//
//   address : not read or written anywhere in this kernel
//   dst     : written at index threadIdx.x on every outer-loop iteration
//   n1      : multiplied by threadIdx.x to produce the two diagonal values
//   m1      : unused
//   num_roi : upper bound passed to CUDA_1D_KERNEL_LOOP
//
// NOTE(review): the body indexes the 5x5 arrays outside their bounds and reads
// elements that were never assigned (details inline). Both are undefined
// behavior in C++; this is presumably why an optimizing compiler is allowed to
// emit an empty body for this kernel -- confirm against the generated IR.
__global__ void test(int *__restrict__ address, int *__restrict__ dst, int n1, int m1, int num_roi) {
int B_PVT[5][5];
int A_PVT[5][5];
// Only element [t % 5][t % 5] (t = threadIdx.x) of each array is assigned;
// the remaining 24 entries of each array are left uninitialized.
B_PVT[threadIdx.x % 5][threadIdx.x % 5] = n1 * threadIdx.x;
A_PVT[threadIdx.x % 5][threadIdx.x % 5] = n1 * threadIdx.x;
// The loop variable `i` is not used inside the loop body.
CUDA_1D_KERNEL_LOOP(i, num_roi) {
#pragma unroll
for (int j = 0; j < num_kpt; ++j) {
// a. find the minimum distance and index in unmatched keypoints
int min_index = num_kpt;
// min_index is 5 here, so A_PVT[min_index][4] addresses row 5 of an array
// whose valid rows are 0..4. The value read is then used as a column index
// into B_PVT, and threadIdx.x is used as its row index without any % 5.
int matched_heatmap_index = B_PVT[threadIdx.x][A_PVT[min_index][4]];
#pragma unroll
for (int m = 0; m < num_kpt; ++m) {
#pragma unroll
for (int n = 0; n < num_kpt; ++n) {
// Compares every A_PVT entry, including the ones never assigned above.
if (A_PVT[m][n] == matched_heatmap_index * 996) {
B_PVT[m][n] = MAX_DISTANCE;
}
}
}
}
// threadIdx.x / threadIdx.y are used directly as row/column into the 5x5
// array; the indices are in range only when both are below 5.
dst[threadIdx.x] = B_PVT[threadIdx.x][threadIdx.y];
}
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Host driver for the reproducer: uploads two 5000-int buffers, launches
// `test` with 4 blocks of 128 threads on a dedicated stream, copies the first
// device buffer back and prints element 1.
//
// Returns 0 on success and 1 if any allocation, copy, launch or
// synchronization step fails. All host/device buffers and the stream are
// released on every path.
int main() {
  const int num = 5000;
  const size_t bytes = num * sizeof(int);

  // Everything that needs cleanup is declared up front and nulled so the
  // single cleanup section below is valid no matter where we bail out.
  cudaStream_t stream = NULL;
  int *a = NULL;
  int *b = NULL;
  int *c = NULL;
  int *d = NULL;
  int status = 1;

  do {
    if (!cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate")) break;

    a = (int *)malloc(bytes);
    b = (int *)malloc(bytes);
    if (a == NULL || b == NULL) {
      fprintf(stderr, "host allocation of %d ints failed\n", num);
      break;
    }
    for (int i = 0; i < num; i++) {
      a[i] = 0;
      b[i] = i + 1;
    }

    if (!cudaOk(cudaMalloc((void **)&c, bytes), "cudaMalloc(c)")) break;
    if (!cudaOk(cudaMemcpy(c, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(c <- a)")) break;
    if (!cudaOk(cudaMalloc((void **)&d, bytes), "cudaMalloc(d)")) break;
    if (!cudaOk(cudaMemcpy(d, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- b)")) break;

    test<<<4, 128, 0, stream>>>(c, d, 0, 64, 1);
    // A launch does not return a status itself; configuration errors are
    // picked up here, execution errors at the synchronize that follows.
    if (!cudaOk(cudaGetLastError(), "test kernel launch")) break;
    if (!cudaOk(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) break;

    // NOTE(review): this copies back `c`, which is passed as the kernel's
    // `address` argument; the kernel only stores through `dst` (`d`). Kept
    // as in the original reproducer -- confirm whether `d` was intended.
    if (!cudaOk(cudaMemcpy(a, c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a <- c)")) break;

    printf("%d", a[1]);
    status = 0;
  } while (false);

  // cudaFree and free both accept a null pointer, so no guards are needed.
  cudaFree(d);
  cudaFree(c);
  free(b);
  free(a);
  if (stream != NULL) {
    cudaStreamDestroy(stream);
  }
  return status;
}
```
clang++ -O2 test.cu --cuda-device-only -S -o test.s
I get

but nvcc gets:

_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs