llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Shilei Tian (shiltian)
<details>
<summary>Changes</summary>
Currently, AMDGPU functions have their `target-features` attribute populated
with all default features for the target GPU. This is redundant because the
backend can derive these defaults from the `target-cpu` attribute via
`AMDGPUTargetMachine::getFeatureString()`.
In this PR, for AMDGPU targets only:
- Functions without explicit target attributes no longer emit `target-features`.
- Functions with `__attribute__((target(...)))` or `-target-feature` emit only
the features that differ from the target's defaults (the delta).
The backend already handles missing `target-features` correctly by falling back
to the TargetMachine's defaults.
A new cc1 flag, `-famdgpu-emit-full-target-features`, is added to emit the full
feature list when needed.
Example:
Before:
```llvm
attributes #0 = { "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,..."
}
```
After (default):
```llvm
attributes #0 = { "target-cpu"="gfx90a" }
```
After (with explicit `+wavefrontsize32` override):
```llvm
attributes #0 = { "target-cpu"="gfx90a"
"target-features"="+wavefrontsize32" }
```
---
Patch is 83.39 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/176533.diff
10 Files Affected:
- (modified) clang/include/clang/Basic/CodeGenOptions.def (+4)
- (modified) clang/include/clang/Options/Options.td (+6)
- (modified) clang/lib/CodeGen/CodeGenModule.cpp (+33-5)
- (modified) clang/test/CodeGen/link-builtin-bitcode.c (+1-1)
- (modified) clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl (+2-2)
- (modified) clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl (+42-40)
- (added) clang/test/CodeGenOpenCL/amdgpu-features-default-delta.cl (+70)
- (modified) clang/test/CodeGenOpenCL/amdgpu-features.cl (+53-53)
- (modified)
clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl
(+4-6)
- (modified) clang/test/OpenMP/amdgcn-attributes.cpp (+341-12)
``````````diff
diff --git a/clang/include/clang/Basic/CodeGenOptions.def
b/clang/include/clang/Basic/CodeGenOptions.def
index baf8b093c10e6..ec3cf0b432143 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -470,6 +470,10 @@ CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
/// Expands s_waitcnt instructions to help PC-sampling profilers identify
stalls.
CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
+/// Emit full target-features attribute for AMDGPU (for testing). (AMDGPU Only)
+/// By default, only features that differ from the target CPU's defaults are
emitted.
+CODEGENOPT(AMDGPUEmitFullTargetFeatures, 1, 0, Benign)
+
// Whether to emit Swift Async function extended frame information: auto,
// never, always.
ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Options/Options.td
b/clang/include/clang/Options/Options.td
index 188739e72434a..daecec88adcf2 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5606,6 +5606,12 @@ defm amdgpu_expand_waitcnt_profiling :
BoolMOption<"amdgpu-expand-waitcnt-profil
"emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
NegFlag<SetFalse, [], [ClangOption]>>;
+def famdgpu_emit_full_target_features : Flag<["-"],
"famdgpu-emit-full-target-features">,
+ Visibility<[CC1Option]>,
+ HelpText<"Emit full target-features attribute for AMDGPU functions instead
of "
+ "only the delta from the target CPU's defaults. (AMDGPU only)">,
+ MarshallingInfoFlag<CodeGenOpts<"AMDGPUEmitFullTargetFeatures">>;
+
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">,
Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp
b/clang/lib/CodeGen/CodeGenModule.cpp
index dc8a31b7f7f0d..95dbe5b27339a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2954,10 +2954,6 @@ bool
CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
llvm::StringMap<bool> FeatureMap;
getContext().getFunctionFeatureMap(FeatureMap, GD);
- // Produce the canonical string for this set of features.
- for (const llvm::StringMap<bool>::value_type &Entry : FeatureMap)
- Features.push_back((Entry.getValue() ? "+" : "-") +
Entry.getKey().str());
-
// Now add the target-cpu and target-features to the function.
// While we populated the feature map above, we still need to
// get and parse the target attribute so we can get the cpu for
@@ -2980,10 +2976,42 @@ bool
CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
// favor this processor.
TuneCPU = SD->getCPUName(GD.getMultiVersionIndex())->getName();
}
+
+ // For AMDGPU, by default only emit delta features (features that differ
+ // from the target CPU's defaults). Use -famdgpu-emit-full-target-features
+ // to emit all features.
+ if (getTarget().getTriple().isAMDGPU() &&
+ !CodeGenOpts.AMDGPUEmitFullTargetFeatures) {
+ // Get the default feature map for the (possibly overridden) target CPU.
+ llvm::StringMap<bool> DefaultFeatureMap;
+ getTarget().initFeatureMap(DefaultFeatureMap,
+ getContext().getDiagnostics(), TargetCPU, {});
+
+ // Only emit features that differ from the defaults.
+ for (const auto &Entry : FeatureMap) {
+ auto DefaultIt = DefaultFeatureMap.find(Entry.getKey());
+ // Emit if the feature is not in defaults or has a different value.
+ if (DefaultIt == DefaultFeatureMap.end() ||
+ DefaultIt->getValue() != Entry.getValue())
+ Features.push_back((Entry.getValue() ? "+" : "-") +
+ Entry.getKey().str());
+ }
+ } else {
+ // Produce the canonical string for this set of features.
+ for (const llvm::StringMap<bool>::value_type &Entry : FeatureMap)
+ Features.push_back((Entry.getValue() ? "+" : "-") +
+ Entry.getKey().str());
+ }
} else {
// Otherwise just add the existing target cpu and target features to the
// function.
- Features = getTarget().getTargetOpts().Features;
+ // For AMDGPU, by default we don't emit target-features for functions
+ // without explicit target attributes, as the backend can derive the
+ // features from target-cpu. Use -famdgpu-emit-full-target-features to emit
+ // all features.
+ if (!getTarget().getTriple().isAMDGPU() ||
+ CodeGenOpts.AMDGPUEmitFullTargetFeatures)
+ Features = getTarget().getTargetOpts().Features;
}
if (!TargetCPU.empty()) {
diff --git a/clang/test/CodeGen/link-builtin-bitcode.c
b/clang/test/CodeGen/link-builtin-bitcode.c
index f6e45bf573705..d03fd6fc66d03 100644
--- a/clang/test/CodeGen/link-builtin-bitcode.c
+++ b/clang/test/CodeGen/link-builtin-bitcode.c
@@ -43,7 +43,7 @@ int bar() { return no_attr() + attr_in_target() +
attr_not_in_target() + attr_in
// CHECK-LABEL: @attr_incompatible
// CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] {
-// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64"
}
+// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" }
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
}
// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
}
// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts"
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
index 38b5ed8de34cc..ece84d5b75ca7 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
@@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; }
// CHECK-NEXT: ret void
//
//.
-// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250"
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+mcast-load-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
"uniform-work-group-size"="false" }
-// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250"
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+mcast-load-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
}
+// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250" "uniform-work-group-size"="false" }
+// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250" }
// CHECK: attributes #[[ATTR2]] = { convergent nounwind }
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 2cbc9787a04b0..a28bc62c11d19 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -481,7 +481,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1,
!tbaa [[CHAR_TBAA18]]
// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr
[[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]]
// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8,
!tbaa [[LONG_TBAA7]]
-// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1)
noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef
align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]]
+// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1)
noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef
align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR10:[0-9]+]]
// GFX900-NEXT: ret void
//
//
@@ -523,10 +523,10 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa
[[CHAR_TBAA18]]
// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align
8, !tbaa [[LONGPTR_TBAA9]]
// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa
[[LONG_TBAA7]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11:[0-9]+]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR11]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa
[[INT_TBAA3:![0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5)
[[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19:![0-9]+]]
// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align
4, !tbaa [[INT_TBAA3]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
@@ -586,12 +586,12 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{
i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr
[[BLOCK12_ASCAST]], i32 0, i32 5
// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8,
!tbaa [[LONG_TBAA7]]
// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8,
!tbaa [[LONG_TBAA7]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5)
[[BLOCK_SIZES]], i32 0, i32 0
// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr
addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr
addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to
ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR11]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32,
i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
// GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32,
i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -610,28 +610,28 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21]]
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]],
align 8, !tbaa [[CHAR_TBAA18]]
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr
addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr
addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to
ptr), ptr [[BLOCK21_ASCAST]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: convergent norecurse nounwind
// GFX900-LABEL: define dso_local amdgpu_kernel void
@test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]]
!kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual
[[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]]
!kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual
[[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[I_ADDR]] to ptr
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align
8, !tbaa [[INTPTR_TBAA26:![0-9]+]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr
[[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26]]
-// GFX900-NEXT: call void
@__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef
align 4 [[TMP0]]) #[[ATTR8]]
+// GFX900-NEXT: call void
@__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef
align 4 [[TMP0]]) #[[ATTR10]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
// GFX900-LABEL: define dso_local void
@__clang_ocl_kern_imp_test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]]
!kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]]
!kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]]
!kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]]
!kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
@@ -641,24 +641,24 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[I_ADDR]] to ptr
// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]]
to ptr
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align
8, !tbaa [[INTPTR_TBAA26]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR11]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa
[[INT_TBAA3]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5)
[[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]]
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align
4, !tbaa [[INT_TBAA3]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21]]
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr
addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr
addrspacecast (ptr addrspace(1)
@__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr
addrspacecast (ptr ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/176533
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits