https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/199176
>From ad5404e9eb3bace085a8c51838de2e1553e7cf90 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Thu, 21 May 2026 16:55:40 +0530 Subject: [PATCH 1/4] [AMDGPU][Clang] refactor addrspace and scope checks [NFC] Assisted-By: Claude Opus 4.6 --- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaAMDGPU.cpp | 54 ++++++++++--------- ...mdgcn-error-gfx1250-cooperative-atomics.cl | 2 +- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dbe6cb2c3a41c..76bacb7d49c8b 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -14230,7 +14230,7 @@ def note_amdgcn_unguarded_builtin_silence : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence " "this warning">; -def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">; +def err_amdgcn_global_or_flat_pointer_required : Error<"builtin requires a global or generic pointer">; def err_amdgcn_dmask_has_too_many_bits_set : Error<"dmask argument cannot have more bits set than there are elements " diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 1d2b3898c92d6..385736c7e1eac 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -452,19 +452,35 @@ bool SemaAMDGPU::checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, return false; } -bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { - bool Fail = false; - - // First argument is a global or generic pointer. +// Check that the first argument to TheCall is a global or generic pointer. 
+static bool checkGlobalOrFlatPointerArg(SemaAMDGPU &S, CallExpr *TheCall) { Expr *PtrArg = TheCall->getArg(0); QualType PtrTy = PtrArg->getType()->getPointeeType(); - unsigned AS = getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace()); + unsigned AS = + S.getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace()); if (AS != llvm::AMDGPUAS::FLAT_ADDRESS && - AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) { - Fail = true; - Diag(TheCall->getBeginLoc(), diag::err_amdgcn_coop_atomic_invalid_as) - << PtrArg->getSourceRange(); - } + AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) + return S.Diag(TheCall->getBeginLoc(), + diag::err_amdgcn_global_or_flat_pointer_required) + << PtrArg->getSourceRange(); + return false; +} + +static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) { + if (Scope->isValueDependent()) + return false; + auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic); + if (std::optional<llvm::APSInt> Result = + Scope->getIntegerConstantExpr(S.SemaRef.Context)) + if (!ScopeModel->isValid(Result->getZExtValue())) + return S.Diag(Scope->getBeginLoc(), + diag::err_atomic_op_has_invalid_sync_scope) + << Scope->getSourceRange(); + return false; +} + +bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { + bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall); Expr *AO = TheCall->getArg(IsStore ? 
2 : 1); Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); @@ -488,27 +504,15 @@ bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { } bool SemaAMDGPU::checkAtomicMonitorLoad(CallExpr *TheCall) { - bool Fail = false; - Expr *AO = TheCall->getArg(1); Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); if (AO->isValueDependent() || Scope->isValueDependent()) return false; - Fail |= checkAtomicOrderingCABIArg(TheCall->getArg(1), /*MayLoad=*/true, - /*MayStore=*/false); - - auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic); - if (std::optional<llvm::APSInt> Result = - Scope->getIntegerConstantExpr(SemaRef.Context)) { - if (!ScopeModel->isValid(Result->getZExtValue())) { - Diag(Scope->getBeginLoc(), diag::err_atomic_op_has_invalid_sync_scope) - << Scope->getSourceRange(); - Fail = true; - } - } - + bool Fail = checkAtomicOrderingCABIArg(AO, /*MayLoad=*/true, + /*MayStore=*/false); + Fail |= checkScopeAsInt(*this, Scope); return Fail; } diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl index 8f02e6775d37a..a440a1c040270 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl @@ -48,7 +48,7 @@ v4i test_amdgcn_cooperative_atomic_load_8x16B_acq_rel(global v4i* gaddr) void test_amdgcn_cooperative_atomic_store_32x4B__sharedptr(local int* addr, int val) { - __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{cooperative atomic requires a global or generic pointer}} + __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{builtin requires a global or generic pointer}} } void test_amdgcn_cooperative_atomic_store_32x4B__ordering_not_imm(local int* addr, int ord, int val) >From 
2f00bae403e863bec2260e98ae45b71f7e45f85a Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Thu, 12 Mar 2026 12:30:16 +0530 Subject: [PATCH 2/4] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 These builtins allow the program to request store-available and load-visible accesses as described in #191246. Each of them takes a __MEMORY_SCOPE_* operand that is then translated to target-specific cache policy bits. This patch was extracted from #172090. Co-authored-by: macurtis-amd <[email protected]> Assisted-by: Claude Opus 4.6 --- clang/docs/LanguageExtensions.rst | 28 ++ clang/include/clang/Basic/BuiltinsAMDGPU.td | 7 + clang/include/clang/Sema/SemaAMDGPU.h | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 16 ++ clang/lib/Sema/SemaAMDGPU.cpp | 12 + .../builtins-amdgcn-global-load-store.cl | 250 ++++++++++++++++++ ...builtins-amdgcn-global-load-store-error.cl | 22 ++ ...s-amdgcn-global-load-store-target-error.cl | 26 ++ 8 files changed, 362 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 03cb02deb5e7f..6ad4e76c239d9 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5259,6 +5259,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.) +.. _langext-__scoped_atomic: + __scoped_atomic builtins ------------------------ @@ -5754,6 +5756,32 @@ returns the bit at the position of the current lane. 
It is almost equivalent to ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if the given mask has the same value for all active lanes of the current wave. + +__builtin_amdgcn_av_{load,store}_b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Signature: + +.. code-block:: c + + typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u; + + v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope); + + void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope); + +Load or store a vector of 4 unsigned integers from or to memory with cache +behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros +defined for :ref:`scoped atomic builtins<langext-__scoped_atomic>`. + +The pointer argument must point to the global or generic address space. + +These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets. + +They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and +``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend +<https://llvm.org/docs/AMDGPUUsage.html>`_. 
+ ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index b15a36df6c08f..a91ad0431d4a6 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -280,6 +280,13 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; +//===----------------------------------------------------------------------===// +// Global Available/Visible memory accesses. +//===----------------------------------------------------------------------===// + +def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">; +def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">; + //===----------------------------------------------------------------------===// // Async mark builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d520f3df544f4..a6205534e0de3 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase { bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore); bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore); + bool checkAVLoadStore(CallExpr *TheCall, bool IsStore); bool checkAtomicMonitorLoad(CallExpr *TheCall); bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index cb883e8780e59..fc6e397674cd1 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1012,6 +1012,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()}); return Builder.CreateCall(F, {Args}); } + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + case AMDGPU::BI__builtin_amdgcn_av_store_b128: { + const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128; + SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr + if (IsStore) + Args.push_back(EmitScalarExpr(E->getArg(1))); // data + const unsigned ScopeIdx = E->getNumArgs() - 1; + auto *ScopeExpr = + cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx))); + Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue())); + llvm::Function *F = + CGM.getIntrinsic(IsStore ? 
Intrinsic::amdgcn_av_store_b128 + : Intrinsic::amdgcn_av_load_b128, + {Args[0]->getType()}); + return Builder.CreateCall(F, Args); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 385736c7e1eac..f2dbe28b76b5c 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6: return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15); + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/false); + case AMDGPU::BI__builtin_amdgcn_av_store_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/true); case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: @@ -479,6 +483,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) { return false; } +bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) { + if (checkGlobalOrFlatPointerArg(*this, TheCall)) + return true; + + Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); + return checkScopeAsInt(*this, Scope); +} + bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl new file mode 100644 index 0000000000000..63d7fcac16874 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl @@ -0,0 +1,250 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals 
smart +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +//------------------------------------------------------------------------------ +// Global Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_global_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]]) +// CHECK-NEXT: 
ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Global Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_global_store_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: 
@test_amdgcn_global_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} + 
+//------------------------------------------------------------------------------ +// Flat Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 
__MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Flat Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) { + 
__builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} +//. +// CHECK: [[META7]] = !{!"wavefront"} +// CHECK: [[META8]] = !{!"workgroup"} +// CHECK: [[META9]] = !{!"agent"} +// CHECK: [[META10]] = !{!""} +// CHECK: [[META11]] = !{!"singlethread"} +// CHECK: [[META12]] = !{!"cluster"} +//. 
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl new file mode 100644 index 0000000000000..b2f7b46547632 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; +typedef v4u32 __private *private_ptr_to_v4u32; + +void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}} +} + +v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl new file mode 100644 index 0000000000000..e85b120661cfd --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl @@ -0,0 +1,26 @@ +// We test loads and stores separately because clang only seems to exit after +// the first 'target feature' error. 
+ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD -S -verify -o - %s + +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +#ifdef TEST_LOAD +v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}} +} +#endif + +#ifdef TEST_STORE +void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}} +} +#endif >From c295aabe208b0650f2d5ac4bde4ec6135ed2a285 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Tue, 2 Jun 2026 16:57:20 +0530 Subject: [PATCH 3/4] add docs for the builtins; split the target test; add a host/device test --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 10 +++++-- .../include/clang/Basic/BuiltinsAMDGPUDocs.td | 29 +++++++++++++++++++ clang/test/SemaHIP/amdgpu-av-load-store.hip | 20 +++++++++++++ ...s-amdgcn-global-load-store-target-error.cl | 26 ++++++++--------- 4 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 clang/test/SemaHIP/amdgpu-av-load-store.hip diff --git 
a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index a91ad0431d4a6..a08e922ec1f7a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -284,8 +284,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am // Global Available/Visible memory accesses. //===----------------------------------------------------------------------===// -def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">; -def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">; +def __builtin_amdgcn_av_load_b128 + : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> { + let Documentation = [DocAVLoadB128]; +} +def __builtin_amdgcn_av_store_b128 + : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> { + let Documentation = [DocAVStoreB128]; +} //===----------------------------------------------------------------------===// // Async mark builtins. diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td index cb2f000fcf548..04a3451adf667 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td @@ -573,6 +573,35 @@ WMMA with per-operand scale factors applied during the computation. 
}]; } +//===----------------------------------------------------------------------===// +// Global Available/Visible Memory Access Builtins +//===----------------------------------------------------------------------===// + +def DocCatAVLoadStore : DocumentationCategory<"Available/Visible Memory Access Builtins"> { + let Content = [{ +These builtins perform 128-bit global or flat memory loads and stores with +available/visible (AV) semantics. +}]; +} + +def DocAVLoadB128 : Documentation { + let Category = DocCatAVLoadStore; + let Content = [{ +Loads 128 bits (4 x i32) from the pointer ``ptr``. The pointer must be in +the global or generic address space. The ``scope`` argument specifies the +synchronization scope using a ``__MEMORY_SCOPE_*`` constant. +}]; +} + +def DocAVStoreB128 : Documentation { + let Category = DocCatAVLoadStore; + let Content = [{ +Stores 128 bits (4 x i32) of ``data`` to the pointer ``ptr``. The pointer +must be in the global or generic address space. The ``scope`` argument +specifies the synchronization scope using a ``__MEMORY_SCOPE_*`` constant. 
+}]; +} + //===----------------------------------------------------------------------===// // Wave Data Exchange Builtins //===----------------------------------------------------------------------===// diff --git a/clang/test/SemaHIP/amdgpu-av-load-store.hip b/clang/test/SemaHIP/amdgpu-av-load-store.hip new file mode 100644 index 0000000000000..1e9688e891228 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-av-load-store.hip @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; + +__device__ void test_av_load_store_device(v4u32 *ptr, v4u32 data) { + v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0); + __builtin_amdgcn_av_store_b128(ptr, data, 0); +} + +__global__ void test_av_load_store_kernel(v4u32 *ptr, v4u32 data) { + v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0); + __builtin_amdgcn_av_store_b128(ptr, data, 0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl index e85b120661cfd..cec85fbeb9446 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl @@ -1,26 +1,24 @@ -// We test loads and stores separately because clang only seems to exit after -// the first 'target feature' error. 
- -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD -S -verify -o - %s - -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s +// RUN: split-file %s %t +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/store.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/store.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/store.cl // REQUIRES: amdgpu-registered-target +//--- load.cl typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; typedef v4u32 __global *global_ptr_to_v4u32; -#ifdef TEST_LOAD v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) { return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}} } -#endif -#ifdef TEST_STORE +//--- store.cl +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + void 
test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) { __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}} } -#endif >From 1834a71e9907c7c3d1cb276f622b3600b2ac4de2 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Wed, 3 Jun 2026 12:21:37 +0530 Subject: [PATCH 4/4] add ArgNames; use target feature "flat-global-insts" --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 8 +++++--- .../builtins-amdgcn-global-load-store-target-error.cl | 4 ++-- llvm/lib/TargetParser/AMDGPUTargetParser.cpp | 8 ++++++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index a08e922ec1f7a..0b50946bb47b3 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -285,12 +285,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am //===----------------------------------------------------------------------===// def __builtin_amdgcn_av_load_b128 - : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> { + : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "flat-global-insts"> { let Documentation = [DocAVLoadB128]; + let ArgNames = ["ptr", "scope"]; } def __builtin_amdgcn_av_store_b128 - : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> { + : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "flat-global-insts"> { let Documentation = [DocAVStoreB128]; + let ArgNames = ["ptr", "data", "scope"]; } //===----------------------------------------------------------------------===// @@ -375,7 +377,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i // GFX9+ only builtins. 
//===----------------------------------------------------------------------===// -def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">; +def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">; def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">; def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">; diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl index cec85fbeb9446..9a61513cdc05b 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl @@ -12,7 +12,7 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int typedef v4u32 __global *global_ptr_to_v4u32; v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) { - return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}} + return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature flat-global-insts}} } //--- store.cl @@ -20,5 +20,5 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int typedef v4u32 __global *global_ptr_to_v4u32; void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) { - __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}} + __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature flat-global-insts}} } diff --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp 
b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp index 756b7c2154ca2..24e6ece329c4c 100644 --- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp +++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp @@ -226,6 +226,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -280,6 +281,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -313,6 +315,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -354,6 +357,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -391,6 +395,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["image-insts"] = true; @@ -427,6 +432,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; 
Features["image-insts"] = true; Features["s-memrealtime"] = true; @@ -486,6 +492,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dot7-insts"] = true; Features["dot10-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx8-insts"] = true; Features["16-bit-insts"] = true; Features["dpp"] = true; @@ -532,6 +539,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, case GK_GFX900: case GK_GFX9_GENERIC: Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["vmem-to-lds-load-insts"] = true; [[fallthrough]]; case GK_GFX810: _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
