llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-clang
Author: Rana Pratap Reddy (ranapratap55)
<details>
<summary>Changes</summary>
Fix the IR lowering for `__amdgpu_texture_t` to generate a single 256-bit load
instead of a double indirection through a flat pointer.
Previously, `__amdgpu_texture_t` was lowered to `ptr addrspace(0)` (a 64-bit flat
pointer), which caused a double load through an extra indirection. Using the same
reproducer as in #<!-- -->187697:
```c
#define TSHARP __constant uint *
// Old tsharp handling:
// #define LOAD_TSHARP(I) *(__constant uint8 *)I
#define LOAD_TSHARP(I) *(__constant __amdgpu_texture_t *)I
float4 test_image_load_1D(TSHARP i, int c) {
return __builtin_amdgcn_image_load_1d_v4f32_i32(15, c, LOAD_TSHARP(i), 0, 0);
}
```
Old output:
```llvm
define hidden <4 x float> @<!-- -->test_image_load_1D(ptr addrspace(4)
noundef readonly captures(none) %i, i32 noundef %c) local_unnamed_addr #<!--
-->0 {
entry:
%0 = load ptr, ptr addrspace(4) %i, align 32, !tbaa !9
%1 = addrspacecast ptr %0 to ptr addrspace(1)
%tex.rsrc.val = load <8 x i32>, ptr addrspace(1) %1, align 32
%2 = tail call <4 x float> @<!--
-->llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 15, i32 %c, <8 x i32>
%tex.rsrc.val, i32 0, i32 0)
ret <4 x float> %2
}
```
With this fix, the new output is a single `<8 x i32>` load, which matches the old `__constant uint8 *` behavior:
```llvm
define hidden <4 x float> @<!-- -->test_image_load_1D(ptr addrspace(4)
noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #<!--
-->0 {
%3 = load <8 x i32>, ptr addrspace(4) %0, align 32, !tbaa !10
%4 = tail call <4 x float> @<!--
-->llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 15, i32 %1, <8 x i32>
%3, i32 0, i32 0)
ret <4 x float> %4
}
```
Fixes #<!-- -->187697
---
Patch is 302.34 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/187774.diff
5 Files Affected:
- (modified) clang/lib/CodeGen/CodeGenTypes.cpp (+6-2)
- (modified) clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c (+5-5)
- (modified) clang/test/CodeGen/builtins-extended-image.c (+220-264)
- (modified) clang/test/CodeGen/builtins-image-load.c (+210-252)
- (modified) clang/test/CodeGen/builtins-image-store.c (+140-168)
``````````diff
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp
b/clang/lib/CodeGen/CodeGenTypes.cpp
index 6bd79056e599a..b79b0bceb48fd 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -599,8 +599,12 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
} break;
#include "clang/Basic/WebAssemblyReferenceTypes.def"
#define AMDGPU_OPAQUE_PTR_TYPE(Name, Id, SingletonId, Width, Align, AS)
\
- case BuiltinType::Id:
\
- return llvm::PointerType::get(getLLVMContext(), AS);
+ case BuiltinType::Id: { \
+ if (BuiltinType::Id == BuiltinType::AMDGPUTexture) {\
+ return
llvm::FixedVectorType::get(llvm::Type::getInt32Ty(getLLVMContext()), 8); \
+ } \
+ return llvm::PointerType::get(getLLVMContext(), AS); \
+ }
#define AMDGPU_NAMED_BARRIER_TYPE(Name, Id, SingletonId, Width, Align, Scope)
\
case BuiltinType::Id:
\
return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier",
\
diff --git a/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
index ef68c79bef592..aba6c718719df 100644
--- a/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
+++ b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
@@ -3,13 +3,13 @@
// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s -debug-info-kind=limited
| FileCheck %s
// CHECK-LABEL: define dso_local void @test_locals(
-// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !dbg [[DBG6:![0-9]+]] {
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[IMG:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[IMG:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[IMG_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMG]]
to ptr
-// CHECK-NEXT: #dbg_declare(ptr addrspace(5) [[IMG]], [[META11:![0-9]+]],
!DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), [[META14:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IMG_ASCAST]], align 32, !dbg
[[DBG15:![0-9]+]]
-// CHECK-NEXT: ret void, !dbg [[DBG16:![0-9]+]]
+// CHECK-NEXT: #dbg_declare(ptr addrspace(5) [[IMG]], [[META10:![0-9]+]],
!DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), [[META13:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IMG_ASCAST]], align 32,
!dbg [[DBG14:![0-9]+]]
+// CHECK-NEXT: ret void, !dbg [[DBG15:![0-9]+]]
//
void test_locals(void) {
__amdgpu_texture_t img;
diff --git a/clang/test/CodeGen/builtins-extended-image.c
b/clang/test/CodeGen/builtins-extended-image.c
index 9ac7ec42d4e50..f487710d8cbed 100644
--- a/clang/test/CodeGen/builtins-extended-image.c
+++ b/clang/test/CodeGen/builtins-extended-image.c
@@ -6,12 +6,12 @@ typedef float float4 __attribute__((ext_vector_type(4)));
typedef _Float16 half4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0:[0-9]+]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -21,14 +21,13 @@ typedef _Float16 half4 __attribute__((ext_vector_type(4)));
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32
120, i32 110)
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TMP2]], <4 x i32> [[TMP3]], i1 false, i32 120, i32
110)
// CHECK-NEXT: ret <4 x float> [[TMP4]]
//
float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32,
int i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -37,12 +36,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4
v4f32, float f32, int
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -52,14 +51,13 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4
v4f32, float f32, int
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 2, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32
120, i32 110)
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 2, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TMP2]], <4 x i32> [[TMP3]], i1 false, i32 120, i32
110)
// CHECK-NEXT: ret <4 x float> [[TMP4]]
//
float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32,
int i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -68,12 +66,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4
v4f32, float f32, int
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -83,14 +81,13 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4
v4f32, float f32, int
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 4, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32
120, i32 110)
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 4, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TMP2]], <4 x i32> [[TMP3]], i1 false, i32 120, i32
110)
// CHECK-NEXT: ret <4 x float> [[TMP4]]
//
float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32,
int i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -99,12 +96,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4
v4f32, float f32, int
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -114,14 +111,13 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4
v4f32, float f32, int
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 8, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32
120, i32 110)
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 8, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TMP2]], <4 x i32> [[TMP3]], i1 false, i32 120, i32
110)
// CHECK-NEXT: ret <4 x float> [[TMP4]]
//
float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32,
int i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -130,12 +126,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4
v4f32, float f32, int
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_sample_lz_1d_v4f32_f32(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -145,13 +141,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4
v4f32, float f32, int
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float>
@llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]],
<8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float>
@llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]],
<8 x i32> [[TMP1]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int
i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -160,12 +155,12 @@ float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4
v4f32, float f32, int i32
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_sample_l_1d_v4f32_f32(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]])
#[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], <8 x i32> [[TEX:%.*]], <4 x i32> noundef
[[VEC4I32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16,
addrspace(5)
// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16,
addrspace(5)
// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[V4F32_ADDR]] to ptr
// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[F32_ADDR]] to ptr
@@ -175,14 +170,13 @@ float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4
v4f32, float f32, int i32
// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <8 x i32> [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]],
align 16
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
-// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[TEX_ADDR_ASCAST]],
align 32
// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]],
align 16
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32
120, i32 110)
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float>
@llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]],
float [[TMP1]], <8 x i32> [[TMP2]], <4 x i32> [[TMP3]], i1 false, i32 120, i32
110)
// CHECK-NEXT: ret <4 x float> [[TMP4]]
//
float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int
i32, __amdgpu_texture_t tex, int4 vec4i32) {
@@ -191,12 +185,12 @@ float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4
v4f32, float f32, int i32,
}
// CHECK-LABEL: define dso_local <4 x float>
@test_amdgcn_image_sample_d_1d_v4f32_f32(
-// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]],
i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/187774
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits