https://github.com/shiltian created
https://github.com/llvm/llvm-project/pull/176533
Currently, AMDGPU functions have their `target-features` attribute populated
with all default features for the target GPU. This is redundant because the
backend can derive these defaults from the `target-cpu` attribute via
`AMDGPUTargetMachine::getFeatureString()`.
In this PR, for AMDGPU targets only:
- Functions without explicit target attributes no longer emit `target-features`.
- Functions with `__attribute__((target(...)))` or `-target-feature` emit only
the features that differ from the target's defaults (the delta).
The backend already handles missing `target-features` correctly by falling back
to the TargetMachine's defaults.
A new cc1 flag, `-famdgpu-emit-full-target-features`, is added to emit the full
feature list when needed (e.g. for testing).
Example:
Before:
```llvm
attributes #0 = { "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,..."
}
```
After (default):
```llvm
attributes #0 = { "target-cpu"="gfx90a" }
```
After (with explicit `+wavefrontsize32` override):
```llvm
attributes #0 = { "target-cpu"="gfx90a" "target-features"="+wavefrontsize32" }
```
>From 6a156fcc8c93ebe6328a7d62067b843be5964dba Mon Sep 17 00:00:00 2001
From: Shilei Tian <[email protected]>
Date: Fri, 16 Jan 2026 22:39:13 -0500
Subject: [PATCH] [RFC][Clang][AMDGPU] Emit only delta target-features to
reduce IR bloat
Currently, AMDGPU functions have their `target-features` attribute populated
with all default features for the target GPU. This is redundant because the
backend can derive these defaults from the `target-cpu` attribute via
`AMDGPUTargetMachine::getFeatureString()`.
In this PR, for AMDGPU targets only:
- Functions without explicit target attributes no longer emit `target-features`.
- Functions with `__attribute__((target(...)))` or `-target-feature` emit only
the features that differ from the target's defaults (the delta).
The backend already handles missing `target-features` correctly by falling back
to the TargetMachine's defaults.
A new cc1 flag, `-famdgpu-emit-full-target-features`, is added to emit the full
feature list when needed (e.g. for testing).
Example:
Before:
```llvm
attributes #0 = { "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,..."
}
```
After (default):
```llvm
attributes #0 = { "target-cpu"="gfx90a" }
```
After (with explicit `+wavefrontsize32` override):
```llvm
attributes #0 = { "target-cpu"="gfx90a" "target-features"="+wavefrontsize32" }
```
---
clang/include/clang/Basic/CodeGenOptions.def | 4 +
clang/include/clang/Options/Options.td | 6 +
clang/lib/CodeGen/CodeGenModule.cpp | 38 +-
clang/test/CodeGen/link-builtin-bitcode.c | 2 +-
.../test/CodeGenOpenCL/amdgpu-cluster-dims.cl | 4 +-
.../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 82 ++--
.../amdgpu-features-default-delta.cl | 70 ++++
clang/test/CodeGenOpenCL/amdgpu-features.cl | 106 +++---
...eadonly-features-written-with-no-target.cl | 10 +-
clang/test/OpenMP/amdgcn-attributes.cpp | 353 +++++++++++++++++-
10 files changed, 556 insertions(+), 119 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/amdgpu-features-default-delta.cl
diff --git a/clang/include/clang/Basic/CodeGenOptions.def
b/clang/include/clang/Basic/CodeGenOptions.def
index baf8b093c10e6..ec3cf0b432143 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -470,6 +470,10 @@ CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
/// Expands s_waitcnt instructions to help PC-sampling profilers identify
stalls.
CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
+/// Emit full target-features attribute for AMDGPU (for testing). (AMDGPU Only)
+/// By default, only features that differ from the target CPU's defaults are
emitted.
+CODEGENOPT(AMDGPUEmitFullTargetFeatures, 1, 0, Benign)
+
// Whether to emit Swift Async function extended frame information: auto,
// never, always.
ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Options/Options.td
b/clang/include/clang/Options/Options.td
index 188739e72434a..daecec88adcf2 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5606,6 +5606,12 @@ defm amdgpu_expand_waitcnt_profiling :
BoolMOption<"amdgpu-expand-waitcnt-profil
"emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
NegFlag<SetFalse, [], [ClangOption]>>;
+def famdgpu_emit_full_target_features : Flag<["-"],
"famdgpu-emit-full-target-features">,
+ Visibility<[CC1Option]>,
+ HelpText<"Emit full target-features attribute for AMDGPU functions instead
of "
+ "only the delta from the target CPU's defaults. (AMDGPU only)">,
+ MarshallingInfoFlag<CodeGenOpts<"AMDGPUEmitFullTargetFeatures">>;
+
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">,
Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp
b/clang/lib/CodeGen/CodeGenModule.cpp
index dc8a31b7f7f0d..95dbe5b27339a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2954,10 +2954,6 @@ bool
CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
llvm::StringMap<bool> FeatureMap;
getContext().getFunctionFeatureMap(FeatureMap, GD);
- // Produce the canonical string for this set of features.
- for (const llvm::StringMap<bool>::value_type &Entry : FeatureMap)
- Features.push_back((Entry.getValue() ? "+" : "-") +
Entry.getKey().str());
-
// Now add the target-cpu and target-features to the function.
// While we populated the feature map above, we still need to
// get and parse the target attribute so we can get the cpu for
@@ -2980,10 +2976,42 @@ bool
CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
// favor this processor.
TuneCPU = SD->getCPUName(GD.getMultiVersionIndex())->getName();
}
+
+ // For AMDGPU, by default only emit delta features (features that differ
+ // from the target CPU's defaults). Use -famdgpu-emit-full-target-features
+ // to emit all features.
+ if (getTarget().getTriple().isAMDGPU() &&
+ !CodeGenOpts.AMDGPUEmitFullTargetFeatures) {
+ // Get the default feature map for the (possibly overridden) target CPU.
+ llvm::StringMap<bool> DefaultFeatureMap;
+ getTarget().initFeatureMap(DefaultFeatureMap,
+ getContext().getDiagnostics(), TargetCPU, {});
+
+ // Only emit features that differ from the defaults.
+ for (const auto &Entry : FeatureMap) {
+ auto DefaultIt = DefaultFeatureMap.find(Entry.getKey());
+ // Emit if the feature is not in defaults or has a different value.
+ if (DefaultIt == DefaultFeatureMap.end() ||
+ DefaultIt->getValue() != Entry.getValue())
+ Features.push_back((Entry.getValue() ? "+" : "-") +
+ Entry.getKey().str());
+ }
+ } else {
+ // Produce the canonical string for this set of features.
+ for (const llvm::StringMap<bool>::value_type &Entry : FeatureMap)
+ Features.push_back((Entry.getValue() ? "+" : "-") +
+ Entry.getKey().str());
+ }
} else {
// Otherwise just add the existing target cpu and target features to the
// function.
- Features = getTarget().getTargetOpts().Features;
+ // For AMDGPU, by default we don't emit target-features for functions
+ // without explicit target attributes, as the backend can derive the
+ // features from target-cpu. Use -famdgpu-emit-full-target-features to emit
+ // all features.
+ if (!getTarget().getTriple().isAMDGPU() ||
+ CodeGenOpts.AMDGPUEmitFullTargetFeatures)
+ Features = getTarget().getTargetOpts().Features;
}
if (!TargetCPU.empty()) {
diff --git a/clang/test/CodeGen/link-builtin-bitcode.c
b/clang/test/CodeGen/link-builtin-bitcode.c
index f6e45bf573705..d03fd6fc66d03 100644
--- a/clang/test/CodeGen/link-builtin-bitcode.c
+++ b/clang/test/CodeGen/link-builtin-bitcode.c
@@ -43,7 +43,7 @@ int bar() { return no_attr() + attr_in_target() +
attr_not_in_target() + attr_in
// CHECK-LABEL: @attr_incompatible
// CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] {
-// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64"
}
+// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" }
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
}
// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
}
// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a"
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts"
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
index 38b5ed8de34cc..ece84d5b75ca7 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
@@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; }
// CHECK-NEXT: ret void
//
//.
-// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250"
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+mcast-load-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
"uniform-work-group-size"="false" }
-// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250"
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+mcast-load-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
}
+// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250" "uniform-work-group-size"="false" }
+// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind
"amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256"
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx1250" }
// CHECK: attributes #[[ATTR2]] = { convergent nounwind }
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 2cbc9787a04b0..a28bc62c11d19 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -481,7 +481,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1,
!tbaa [[CHAR_TBAA18]]
// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr
[[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]]
// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8,
!tbaa [[LONG_TBAA7]]
-// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1)
noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef
align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]]
+// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1)
noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef
align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR10:[0-9]+]]
// GFX900-NEXT: ret void
//
//
@@ -523,10 +523,10 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa
[[CHAR_TBAA18]]
// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align
8, !tbaa [[LONGPTR_TBAA9]]
// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa
[[LONG_TBAA7]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11:[0-9]+]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR11]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa
[[INT_TBAA3:![0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5)
[[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19:![0-9]+]]
// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align
4, !tbaa [[INT_TBAA3]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
@@ -586,12 +586,12 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{
i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr
[[BLOCK12_ASCAST]], i32 0, i32 5
// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8,
!tbaa [[LONG_TBAA7]]
// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8,
!tbaa [[LONG_TBAA7]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5)
[[BLOCK_SIZES]], i32 0, i32 0
// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr
addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr
addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to
ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK_SIZES]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR11]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32,
i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
// GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32,
i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -610,28 +610,28 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21]]
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]],
align 8, !tbaa [[CHAR_TBAA18]]
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr
addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr
addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to
ptr), ptr [[BLOCK21_ASCAST]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[BLOCK20]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: convergent norecurse nounwind
// GFX900-LABEL: define dso_local amdgpu_kernel void
@test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]]
!kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual
[[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]]
!kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual
[[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[I_ADDR]] to ptr
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align
8, !tbaa [[INTPTR_TBAA26:![0-9]+]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr
[[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26]]
-// GFX900-NEXT: call void
@__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef
align 4 [[TMP0]]) #[[ATTR8]]
+// GFX900-NEXT: call void
@__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef
align 4 [[TMP0]]) #[[ATTR10]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
// GFX900-LABEL: define dso_local void
@__clang_ocl_kern_imp_test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]]
!kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]]
!kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]]
!kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]]
!kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
@@ -641,24 +641,24 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[I_ADDR]] to ptr
// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]]
to ptr
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align
8, !tbaa [[INTPTR_TBAA26]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[FLAGS]]) #[[ATTR11]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa
[[INT_TBAA3]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5)
[[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]]
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align
4, !tbaa [[INT_TBAA3]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4
[[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false),
!tbaa.struct [[TBAA_STRUCT21]]
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr
addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr
addrspacecast (ptr addrspace(1)
@__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr
addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR9]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR9]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[NDRANGE]]) #[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]])
#[[ATTR11]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5)
[[DEFAULT_QUEUE]]) #[[ATTR11]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal void @__test_block_invoke(
-// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6:[0-9]+]] {
+// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR8:[0-9]+]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
@@ -674,7 +674,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal amdgpu_kernel void
@__test_block_invoke_kernel(
-// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]])
#[[ATTR6]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space
[[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type
[[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual
[[META25]] {
+// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]])
#[[ATTR8]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space
[[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type
[[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual
[[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1),
i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]],
ptr addrspace(5) [[TMP1]], align 8
@@ -685,7 +685,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal void @__test_block_invoke_2(
-// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] {
+// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR8]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
@@ -707,7 +707,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal amdgpu_kernel void
@__test_block_invoke_2_kernel(
-// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8
}> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META31:![0-9]+]]
!kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]]
!kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8
}> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META31:![0-9]+]]
!kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]]
!kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]]
!kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1),
ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1),
i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -718,7 +718,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal void @__test_block_invoke_3(
-// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3)
noundef [[LP:%.*]]) #[[ATTR6]] {
+// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3)
noundef [[LP:%.*]]) #[[ATTR8]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
// GFX900-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4,
addrspace(5)
@@ -746,7 +746,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal amdgpu_kernel void
@__test_block_invoke_3_kernel(
-// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8
}> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !associated
[[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]]
!kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]]
!kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] {
+// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8
}> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !associated
[[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]]
!kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]]
!kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1),
ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1),
i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
@@ -757,7 +757,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal void @__test_block_invoke_4(
-// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] {
+// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR8]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
@@ -766,13 +766,13 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align
8, !tbaa [[LONG_TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{
i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0,
i32 4
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr
[[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[LONGPTR_TBAA9]]
-// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1)
noundef [[TMP1]]) #[[ATTR8]]
+// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1)
noundef [[TMP1]]) #[[ATTR10]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal amdgpu_kernel void
@__test_block_invoke_4_kernel(
-// GFX900-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]])
#[[ATTR6]] !associated [[META38:![0-9]+]] !kernel_arg_addr_space [[META29]]
!kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]]
!kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]])
#[[ATTR8]] !associated [[META38:![0-9]+]] !kernel_arg_addr_space [[META29]]
!kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]]
!kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr
addrspace(1) }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]],
ptr addrspace(5) [[TMP1]], align 8
@@ -783,7 +783,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal void
@__test_target_features_kernel_block_invoke(
-// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] {
+// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR8]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
@@ -794,7 +794,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define internal amdgpu_kernel void
@__test_target_features_kernel_block_invoke_kernel(
-// GFX900-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !associated
[[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual
[[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]]
!kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !associated
[[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual
[[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]]
!kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8,
addrspace(5)
// GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5)
[[TMP1]], align 8
@@ -816,15 +816,17 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU: attributes #[[ATTR10]] = { convergent nounwind }
//.
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
-// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc"
}
-// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind
"amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc"
"uniform-work-group-size"="false" }
-// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse
nounwind "amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc"
}
-// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind
willreturn memory(argmem: readwrite) }
-// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind
willreturn memory(argmem: readwrite) }
-// GFX900: attributes #[[ATTR6]] = { convergent nounwind
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,-sram-ecc"
}
-// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind
willreturn }
-// GFX900: attributes #[[ATTR8]] = { convergent nounwind }
-// GFX900: attributes #[[ATTR9]] = { nounwind }
+// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
+// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind
"amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"uniform-work-group-size"="false" }
+// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse
nounwind "amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
+// GFX900: attributes #[[ATTR4]] = { convergent norecurse nounwind
"amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="-sram-ecc" "uniform-work-group-size"="false" }
+// GFX900: attributes #[[ATTR5]] = { alwaysinline convergent norecurse
nounwind "amdgpu-flat-work-group-size"="1,256"
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"target-features"="-sram-ecc" }
+// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind
willreturn memory(argmem: readwrite) }
+// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind
willreturn memory(argmem: readwrite) }
+// GFX900: attributes #[[ATTR8]] = { convergent nounwind
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
+// GFX900: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind
willreturn }
+// GFX900: attributes #[[ATTR10]] = { convergent nounwind }
+// GFX900: attributes #[[ATTR11]] = { nounwind }
//.
// NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
// NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features-default-delta.cl
b/clang/test/CodeGenOpenCL/amdgpu-features-default-delta.cl
new file mode 100644
index 0000000000000..d41a029c084b1
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-features-default-delta.cl
@@ -0,0 +1,70 @@
+// REQUIRES: amdgpu-registered-target
+
+// Test that by default, AMDGPU functions only emit delta target-features
+// (features that differ from the target CPU's defaults). This reduces IR bloat.
+
+// Default behavior for gfx90a: test_default has no target-features,
+// test_explicit_attr has only the delta (+gfx11-insts).
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
+
+// With -famdgpu-emit-full-target-features, all features are emitted for both functions.
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -famdgpu-emit-full-target-features -emit-llvm -o - %s | FileCheck --check-prefix=FULL %s
+
+// With -target-feature, both functions get the delta feature.
+// gfx1030 defaults to wavefrontsize32, so +wavefrontsize64 is a delta.
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=CMDLINE %s
+
+// GFX90A-LABEL: define {{.*}} @test_default()
+// GFX90A-SAME: #[[ATTR_DEFAULT:[0-9]+]]
+// GFX90A-LABEL: define {{.*}} @test_explicit_attr()
+// GFX90A-SAME: #[[ATTR_EXPLICIT:[0-9]+]]
+//
+// test_default should have target-cpu but NO target-features
+// GFX90A: attributes #[[ATTR_DEFAULT]] = {
+// GFX90A-SAME: "target-cpu"="gfx90a"
+// GFX90A-NOT: "target-features"
+// GFX90A-SAME: }
+//
+// test_explicit_attr should have target-cpu and ONLY the delta target-features
+// GFX90A: attributes #[[ATTR_EXPLICIT]] = {
+// GFX90A-SAME: "target-cpu"="gfx90a"
+// GFX90A-SAME: "target-features"="+gfx11-insts"
+// GFX90A-SAME: }
+
+// With -famdgpu-emit-full-target-features, both functions get full features.
+// FULL-LABEL: define {{.*}} @test_default()
+// FULL-SAME: #[[ATTR_DEFAULT:[0-9]+]]
+// FULL-LABEL: define {{.*}} @test_explicit_attr()
+// FULL-SAME: #[[ATTR_EXPLICIT:[0-9]+]]
+//
+// FULL: attributes #[[ATTR_DEFAULT]] = {
+// FULL-SAME: "target-cpu"="gfx90a"
+// FULL-SAME: "target-features"="{{[^"]+}}"
+// FULL-SAME: }
+//
+// FULL: attributes #[[ATTR_EXPLICIT]] = {
+// FULL-SAME: "target-cpu"="gfx90a"
+// FULL-SAME: "target-features"="{{[^"]+}}"
+// FULL-SAME: }
+
+// With -target-feature +wavefrontsize64, test_default gets just that delta,
+// test_explicit_attr gets both +gfx11-insts and +wavefrontsize64.
+// CMDLINE-LABEL: define {{.*}} @test_default()
+// CMDLINE-SAME: #[[ATTR_DEFAULT:[0-9]+]]
+// CMDLINE-LABEL: define {{.*}} @test_explicit_attr()
+// CMDLINE-SAME: #[[ATTR_EXPLICIT:[0-9]+]]
+//
+// CMDLINE: attributes #[[ATTR_DEFAULT]] = {
+// CMDLINE-SAME: "target-cpu"="gfx1030"
+// CMDLINE-SAME: "target-features"="+wavefrontsize64"
+// CMDLINE-SAME: }
+//
+// CMDLINE: attributes #[[ATTR_EXPLICIT]] = {
+// CMDLINE-SAME: "target-cpu"="gfx1030"
+// CMDLINE-SAME: "target-features"="{{[^"]*}}+gfx11-insts{{[^"]*}}+wavefrontsize64{{[^"]*}}"
+// CMDLINE-SAME: }
+
+kernel void test_default() {}
+
+__attribute__((target("gfx11-insts")))
+void test_explicit_attr() {}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index df5b56890dd5c..b3f844739e5c5 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -1,63 +1,63 @@
// REQUIRES: amdgpu-registered-target
// Check that appropriate features are defined for every supported AMDGPU
-// "-target" and "-mcpu" options.
+// "-target" and "-mcpu" options when -famdgpu-emit-full-target-features is used.
-// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s | FileCheck
--check-prefix=NOCPU %s
-// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -emit-llvm
-o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
-// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -emit-llvm
-o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck
--check-prefix=NOCPU-WAVE32 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck
--check-prefix=NOCPU-WAVE64 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX600 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX601 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx602 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX602 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX700 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx701 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX701 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx702 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX702 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx703 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX703 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx704 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX704 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx705 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX705 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX801 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx802 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX802 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx803 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX803 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx805 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX805 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx810 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX810 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX900 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx902 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX902 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx904 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX904 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX906 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX908 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX909 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s |
FileCheck --check-prefix=GFX90A %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -emit-llvm -o - %s |
FileCheck --check-prefix=GFX90C %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX942 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX950 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1010 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1011 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1012 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1013 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1013 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1030 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1031 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1031 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1032 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1032 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1033 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1033 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1034 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1034 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1035 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1035 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1036 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1036 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1100 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1100 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1101 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1101 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1102 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1102 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1103 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1150 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1151 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1152 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1152 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1153 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1153 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1200 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1201 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1250 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1250 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1251 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1251 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx600 -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx601 -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx602 -emit-llvm -o - %s | FileCheck --check-prefix=GFX602 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx700 -emit-llvm -o - %s | FileCheck --check-prefix=GFX700 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx701 -emit-llvm -o - %s | FileCheck --check-prefix=GFX701 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx702 -emit-llvm -o - %s | FileCheck --check-prefix=GFX702 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx703 -emit-llvm -o - %s | FileCheck --check-prefix=GFX703 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx704 -emit-llvm -o - %s | FileCheck --check-prefix=GFX704 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx705 -emit-llvm -o - %s | FileCheck --check-prefix=GFX705 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx801 -emit-llvm -o - %s | FileCheck --check-prefix=GFX801 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx802 -emit-llvm -o - %s | FileCheck --check-prefix=GFX802 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx803 -emit-llvm -o - %s | FileCheck --check-prefix=GFX803 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx805 -emit-llvm -o - %s | FileCheck --check-prefix=GFX805 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx810 -emit-llvm -o - %s | FileCheck --check-prefix=GFX810 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefix=GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx902 -emit-llvm -o - %s | FileCheck --check-prefix=GFX902 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx904 -emit-llvm -o - %s | FileCheck --check-prefix=GFX904 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx906 -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx908 -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx909 -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx90c -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1013 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1013 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1030 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1030 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1031 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1031 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1032 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1032 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1033 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1033 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1034 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1034 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1035 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1035 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1036 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1036 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1100 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1100 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1101 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1101 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1102 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1102 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1103 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1150 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1151 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1152 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1152 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1153 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1153 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1200 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1201 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1250 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx1251 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1251 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx9-4-generic -emit-llvm -o -
%s | FileCheck --check-prefix=GFX9_4_Generic %s
+// RUN: %clang_cc1 -triple amdgcn -famdgpu-emit-full-target-features
-target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck
--check-prefix=GFX9_4_Generic %s
// NOCPU-NOT: "target-features"
// NOCPU-WAVE32: "target-features"="+wavefrontsize32"
diff --git
a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl
b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl
index 2d50ce7cab2e0..1239df2a96d2e 100644
---
a/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl
+++
b/clang/test/CodeGenOpenCL/amdgpu-readonly-features-written-with-no-target.cl
@@ -4,13 +4,11 @@
// if there is no target specified.
// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s | FileCheck
--check-prefix=NOCPU %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX942 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1100 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1100 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s |
FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s |
FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1100 -emit-llvm -o - %s |
FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s |
FileCheck %s
__attribute__((target("gws,image-insts,vmem-to-lds-load-insts"))) void test()
{}
// NOCPU: "target-features"="+gws,+image-insts,+vmem-to-lds-load-insts"
-// GFX942:
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+lerp-inst,+mai-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64,+xf32-insts"
-// GFX1100:
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32"
-// GFX1200:
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32"
+// CHECK-NOT: "target-features"={{.*}}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp
b/clang/test/OpenMP/amdgcn-attributes.cpp
index 03f5c31e3157c..4d9e54ae13a04 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -1,10 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs
--prefix-filecheck-ir-name VAR --version 6
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown
-fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa
-fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device
-fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck
-check-prefixes=DEFAULT,ALL %s
-// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple
amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s
-fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - |
FileCheck -check-prefixes=CPU,ALL %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa
-fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device
-fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck
-check-prefix=DEFAULT %s
+// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple
amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s
-fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - |
FileCheck -check-prefix=CPU %s
-// RUN: %clang_cc1 -menable-no-nans -mno-amdgpu-ieee -fopenmp -x c++
-std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa
-emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path
%t-ppc-host.bc -o - | FileCheck -check-prefixes=NOIEEE,ALL %s
+// RUN: %clang_cc1 -menable-no-nans -mno-amdgpu-ieee -fopenmp -x c++
-std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa
-emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path
%t-ppc-host.bc -o - | FileCheck -check-prefix=NOIEEE %s
// expected-no-diagnostics
@@ -14,7 +15,6 @@ int callable(int);
// Check that the target attributes are set on the generated kernel
int func() {
- // ALL-LABEL: amdgpu_kernel void @__omp_offloading{{.*}} #0
int arr[N];
@@ -30,11 +30,340 @@ int callable(int x) {
// ALL-LABEL: @_Z8callablei(i32 noundef %x) #2
return x + 1;
}
-
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse
nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel"
"no-trapping-math"="true" "omp_target_thread_limit"="42"
"stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind
optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true"
"omp_target_thread_limit"="42" "stack-protector-buffer-size"="8"
"target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64"
"uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse
nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false"
"kernel" "no-nans-fp-math"="true" "no-trapping-math"="true"
"omp_target_thread_limit"="42" "stack-protector-buffer-size"="8"
"uniform-work-group-size"="true" }
-
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone
"no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx900"
"target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64"
}
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone
"amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" }
+//.
+// DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0
+// DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1)
constant i32 0
+// DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden
addrspace(1) constant i32 0
+// DEFAULT: @[[GLOB0:[0-9]+]] = private unnamed_addr addrspace(1) constant [23
x i8] c"
+// DEFAULT: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant
%struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr addrspacecast (ptr
addrspace(1) @[[GLOB0]] to ptr) }, align 8
+// DEFAULT: @__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment =
weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy
zeroinitializer
+// DEFAULT: @__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment =
weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy {
%struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 42, i32 0,
i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr),
ptr addrspacecast (ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment to ptr) }
+//.
+// CPU: @__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0
+// CPU: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// CPU: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// CPU: @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1)
constant i32 0
+// CPU: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1)
constant i32 0
+// CPU: @[[GLOB0:[0-9]+]] = private unnamed_addr addrspace(1) constant [23 x
i8] c"
+// CPU: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant
%struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr addrspacecast (ptr
addrspace(1) @[[GLOB0]] to ptr) }, align 8
+// CPU: @__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment =
weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy
zeroinitializer
+// CPU: @__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment =
weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy {
%struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 42, i32 0,
i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr),
ptr addrspacecast (ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment to ptr) }
+//.
+// NOIEEE: @__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0
+// NOIEEE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// NOIEEE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden
addrspace(1) constant i32 0
+// NOIEEE: @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1)
constant i32 0
+// NOIEEE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden
addrspace(1) constant i32 0
+// NOIEEE: @[[GLOB0:[0-9]+]] = private unnamed_addr addrspace(1) constant [23
x i8] c"
+// NOIEEE: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant
%struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr addrspacecast (ptr
addrspace(1) @[[GLOB0]] to ptr) }, align 8
+// NOIEEE: @__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment =
weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy
zeroinitializer
+// NOIEEE: @__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment =
weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy {
%struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 42, i32 0,
i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr),
ptr addrspacecast (ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_dynamic_environment to ptr) }
+//.
+// DEFAULT: Function Attrs: convergent mustprogress noinline norecurse
nounwind optnone
+// DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void
@__omp_offloading_10302_8f219df__Z4funcv_l21(
+// DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull
align 4 dereferenceable(400) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// DEFAULT-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// DEFAULT-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// DEFAULT-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4,
addrspace(5)
+// DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// DEFAULT-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// DEFAULT-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTZERO_ADDR]] to ptr
+// DEFAULT-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
+// DEFAULT-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]]
+// DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr
addrspacecast (ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment to ptr), ptr
[[DYN_PTR]])
+// DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]],
label %[[WORKER_EXIT:.*]]
+// DEFAULT: [[USER_CODE_ENTRY]]:
+// DEFAULT-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr
addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// DEFAULT-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]],
align 4
+// DEFAULT-NEXT: call void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(ptr
[[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], ptr [[TMP0]])
#[[ATTR3:[0-9]+]]
+// DEFAULT-NEXT: call void @__kmpc_target_deinit()
+// DEFAULT-NEXT: ret void
+// DEFAULT: [[WORKER_EXIT]]:
+// DEFAULT-NEXT: ret void
+//
+//
+// DEFAULT: Function Attrs: convergent noinline norecurse nounwind optnone
+// DEFAULT-LABEL: define internal void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(
+// DEFAULT-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias
noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400)
[[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
+// DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
+// DEFAULT-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// DEFAULT-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// DEFAULT-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// DEFAULT-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to
ptr
+// DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr
[[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr
[[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// DEFAULT-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6]], !align [[META7]]
+// DEFAULT-NEXT: store i32 0, ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: br label %[[FOR_COND:.*]]
+// DEFAULT: [[FOR_COND]]:
+// DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100
+// DEFAULT-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label
%[[FOR_END:.*]]
+// DEFAULT: [[FOR_BODY]]:
+// DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64
+// DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr
[[TMP0]], i64 0, i64 [[IDXPROM]]
+// DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// DEFAULT-NEXT: [[CALL:%.*]] = call noundef i32 @_Z8callablei(i32 noundef
[[TMP3]]) #[[ATTR4:[0-9]+]]
+// DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP4]] to i64
+// DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32],
ptr [[TMP0]], i64 0, i64 [[IDXPROM1]]
+// DEFAULT-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX2]], align 4
+// DEFAULT-NEXT: br label %[[FOR_INC:.*]]
+// DEFAULT: [[FOR_INC]]:
+// DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1
+// DEFAULT-NEXT: store i32 [[INC]], ptr [[I_ASCAST]], align 4
+// DEFAULT-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// DEFAULT: [[FOR_END]]:
+// DEFAULT-NEXT: ret void
+//
+//
+// DEFAULT: Function Attrs: convergent mustprogress noinline nounwind optnone
+// DEFAULT-LABEL: define hidden noundef i32 @_Z8callablei(
+// DEFAULT-SAME: i32 noundef [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// DEFAULT-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// DEFAULT-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[RETVAL]] to ptr
+// DEFAULT-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[X_ADDR]] to ptr
+// DEFAULT-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4
+// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR_ASCAST]], align 4
+// DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// DEFAULT-NEXT: ret i32 [[ADD]]
+//
+//
+// CPU: Function Attrs: convergent mustprogress noinline norecurse nounwind
optnone
+// CPU-LABEL: define weak_odr protected amdgpu_kernel void
@__omp_offloading_10302_8f219df__Z4funcv_l21(
+// CPU-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4
dereferenceable(400) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPU-NEXT: [[ENTRY:.*:]]
+// CPU-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CPU-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[DYN_PTR_ADDR]] to ptr
+// CPU-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// CPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[DOTZERO_ADDR]] to ptr
+// CPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CPU-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
+// CPU-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// CPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]]
+// CPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast
(ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment to ptr), ptr
[[DYN_PTR]])
+// CPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label
%[[WORKER_EXIT:.*]]
+// CPU: [[USER_CODE_ENTRY]]:
+// CPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr
addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// CPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// CPU-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CPU-NEXT: call void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(ptr
[[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], ptr [[TMP0]])
#[[ATTR3:[0-9]+]]
+// CPU-NEXT: call void @__kmpc_target_deinit()
+// CPU-NEXT: ret void
+// CPU: [[WORKER_EXIT]]:
+// CPU-NEXT: ret void
+//
+//
+// CPU: Function Attrs: convergent noinline norecurse nounwind optnone
+// CPU-LABEL: define internal void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(
+// CPU-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef
[[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400)
[[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CPU-NEXT: [[ENTRY:.*:]]
+// CPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CPU-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// CPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// CPU-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// CPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr
[[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr
[[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// CPU-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// CPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6]], !align [[META7]]
+// CPU-NEXT: store i32 0, ptr [[I_ASCAST]], align 4
+// CPU-NEXT: br label %[[FOR_COND:.*]]
+// CPU: [[FOR_COND]]:
+// CPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100
+// CPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CPU: [[FOR_BODY]]:
+// CPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CPU-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64
+// CPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr
[[TMP0]], i64 0, i64 [[IDXPROM]]
+// CPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CPU-NEXT: [[CALL:%.*]] = call noundef i32 @_Z8callablei(i32 noundef
[[TMP3]]) #[[ATTR4:[0-9]+]]
+// CPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CPU-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP4]] to i64
+// CPU-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr
[[TMP0]], i64 0, i64 [[IDXPROM1]]
+// CPU-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX2]], align 4
+// CPU-NEXT: br label %[[FOR_INC:.*]]
+// CPU: [[FOR_INC]]:
+// CPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1
+// CPU-NEXT: store i32 [[INC]], ptr [[I_ASCAST]], align 4
+// CPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CPU: [[FOR_END]]:
+// CPU-NEXT: ret void
+//
+//
+// CPU: Function Attrs: convergent mustprogress noinline nounwind optnone
+// CPU-LABEL: define hidden noundef i32 @_Z8callablei(
+// CPU-SAME: i32 noundef [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+// CPU-NEXT: [[ENTRY:.*:]]
+// CPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CPU-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[RETVAL]] to ptr
+// CPU-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[X_ADDR]] to ptr
+// CPU-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4
+// CPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR_ASCAST]], align 4
+// CPU-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// CPU-NEXT: ret i32 [[ADD]]
+//
+//
+// NOIEEE: Function Attrs: convergent mustprogress noinline norecurse nounwind
optnone
+// NOIEEE-LABEL: define weak_odr protected amdgpu_kernel void
@__omp_offloading_10302_8f219df__Z4funcv_l21(
+// NOIEEE-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align
4 dereferenceable(400) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// NOIEEE-NEXT: [[ENTRY:.*:]]
+// NOIEEE-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOIEEE-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOIEEE-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// NOIEEE-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4,
addrspace(5)
+// NOIEEE-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// NOIEEE-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// NOIEEE-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTZERO_ADDR]] to ptr
+// NOIEEE-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// NOIEEE-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
+// NOIEEE-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// NOIEEE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]]
+// NOIEEE-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr
addrspacecast (ptr addrspace(1)
@__omp_offloading_10302_8f219df__Z4funcv_l21_kernel_environment to ptr), ptr
[[DYN_PTR]])
+// NOIEEE-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// NOIEEE-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]],
label %[[WORKER_EXIT:.*]]
+// NOIEEE: [[USER_CODE_ENTRY]]:
+// NOIEEE-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr
addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// NOIEEE-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// NOIEEE-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align
4
+// NOIEEE-NEXT: call void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(ptr
[[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], ptr [[TMP0]])
#[[ATTR3:[0-9]+]]
+// NOIEEE-NEXT: call void @__kmpc_target_deinit()
+// NOIEEE-NEXT: ret void
+// NOIEEE: [[WORKER_EXIT]]:
+// NOIEEE-NEXT: ret void
+//
+//
+// NOIEEE: Function Attrs: convergent noinline norecurse nounwind optnone
+// NOIEEE-LABEL: define internal void
@__omp_offloading_10302_8f219df__Z4funcv_l21_omp_outlined(
+// NOIEEE-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias
noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400)
[[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// NOIEEE-NEXT: [[ENTRY:.*:]]
+// NOIEEE-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
+// NOIEEE-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8,
addrspace(5)
+// NOIEEE-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOIEEE-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// NOIEEE-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// NOIEEE-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr
addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// NOIEEE-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[ARR_ADDR]] to ptr
+// NOIEEE-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to
ptr
+// NOIEEE-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr
[[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// NOIEEE-NEXT: store ptr [[DOTBOUND_TID_]], ptr
[[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// NOIEEE-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// NOIEEE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8,
!nonnull [[META6]], !align [[META7]]
+// NOIEEE-NEXT: store i32 0, ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: br label %[[FOR_COND:.*]]
+// NOIEEE: [[FOR_COND]]:
+// NOIEEE-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100
+// NOIEEE-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// NOIEEE: [[FOR_BODY]]:
+// NOIEEE-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64
+// NOIEEE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr
[[TMP0]], i64 0, i64 [[IDXPROM]]
+// NOIEEE-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// NOIEEE-NEXT: [[CALL:%.*]] = call noundef i32 @_Z8callablei(i32 noundef
[[TMP3]]) #[[ATTR4:[0-9]+]]
+// NOIEEE-NEXT: [[TMP4:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP4]] to i64
+// NOIEEE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr
[[TMP0]], i64 0, i64 [[IDXPROM1]]
+// NOIEEE-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX2]], align 4
+// NOIEEE-NEXT: br label %[[FOR_INC:.*]]
+// NOIEEE: [[FOR_INC]]:
+// NOIEEE-NEXT: [[TMP5:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1
+// NOIEEE-NEXT: store i32 [[INC]], ptr [[I_ASCAST]], align 4
+// NOIEEE-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// NOIEEE: [[FOR_END]]:
+// NOIEEE-NEXT: ret void
+//
+//
+// NOIEEE: Function Attrs: convergent mustprogress noinline nounwind optnone
+// NOIEEE-LABEL: define hidden noundef i32 @_Z8callablei(
+// NOIEEE-SAME: i32 noundef [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+// NOIEEE-NEXT: [[ENTRY:.*:]]
+// NOIEEE-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// NOIEEE-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// NOIEEE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[RETVAL]] to ptr
+// NOIEEE-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[X_ADDR]] to ptr
+// NOIEEE-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4
+// NOIEEE-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR_ASCAST]], align 4
+// NOIEEE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// NOIEEE-NEXT: ret i32 [[ADD]]
+//
+//.
+// DEFAULT: attributes #[[ATTR0]] = { convergent mustprogress noinline
norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel"
"no-trapping-math"="true" "omp_target_thread_limit"="42"
"stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #[[ATTR2]] = { convergent mustprogress noinline
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #[[ATTR3]] = { nounwind }
+// DEFAULT: attributes #[[ATTR4]] = { convergent }
+//.
+// CPU: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse
nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel"
"no-trapping-math"="true" "omp_target_thread_limit"="42"
"stack-protector-buffer-size"="8" "target-cpu"="gfx900"
"uniform-work-group-size"="true" }
+// CPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx900" }
+// CPU: attributes #[[ATTR2]] = { convergent mustprogress noinline nounwind
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8"
"target-cpu"="gfx900" }
+// CPU: attributes #[[ATTR3]] = { nounwind }
+// CPU: attributes #[[ATTR4]] = { convergent }
+//.
+// NOIEEE: attributes #[[ATTR0]] = { convergent mustprogress noinline
norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42"
"amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true"
"no-trapping-math"="true" "omp_target_thread_limit"="42"
"stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind
optnone "no-nans-fp-math"="true" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" }
+// NOIEEE: attributes #[[ATTR2]] = { convergent mustprogress noinline nounwind
optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true"
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOIEEE: attributes #[[ATTR3]] = { nounwind }
+// NOIEEE: attributes #[[ATTR4]] = { convergent }
+//.
+// DEFAULT: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 150084063,
!"_Z4funcv", i32 21, i32 0, i32 0}
+// DEFAULT: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32
600}
+// DEFAULT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// DEFAULT: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 51}
+// DEFAULT: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 51}
+// DEFAULT: [[META5:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// DEFAULT: [[META6]] = !{}
+// DEFAULT: [[META7]] = !{i64 4}
+// DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]]}
+// DEFAULT: [[META9]] = !{!"llvm.loop.mustprogress"}
+//.
+// CPU: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 150084063, !"_Z4funcv",
i32 21, i32 0, i32 0}
+// CPU: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CPU: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CPU: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 51}
+// CPU: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 51}
+// CPU: [[META5:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CPU: [[META6]] = !{}
+// CPU: [[META7]] = !{i64 4}
+// CPU: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]]}
+// CPU: [[META9]] = !{!"llvm.loop.mustprogress"}
+//.
+// NOIEEE: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 150084063, !"_Z4funcv",
i32 21, i32 0, i32 0}
+// NOIEEE: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// NOIEEE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// NOIEEE: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 51}
+// NOIEEE: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 51}
+// NOIEEE: [[META5:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// NOIEEE: [[META6]] = !{}
+// NOIEEE: [[META7]] = !{i64 4}
+// NOIEEE: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]]}
+// NOIEEE: [[META9]] = !{!"llvm.loop.mustprogress"}
+//.
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits