llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-support Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> Add a new EXECSYNC address space that is used for global variables that are used to represent the barrier IDs in GFX12.5. This patch only aims to add the new AS, and plumb it through the LLVM machinery. It does not change the pointer layout. It's essentially syntactic sugar over a local pointer. Depending on how the discussion evolves, we can consider changing the pointer layout subsequently or in this PR. The motivation for this is to make the relation between LDS and barrier GVs explicit in the compiler. It does add a bit more complexity, but that complexity was already there, just hidden by pretending barrier GVs were actual LDS. --- Patch is 104.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/195613.diff 36 Files Affected: - (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (+14-1) - (modified) clang/test/CodeGen/target-data.c (+2-2) - (modified) clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl (+1-1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+8-8) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+7-7) - (modified) llvm/docs/AMDGPUUsage.rst (+31-9) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6-5) - (modified) llvm/include/llvm/Support/AMDGPUAddrSpace.h (+12-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+17-11) - (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+24-10) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+12-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp (+34-47) - (modified) llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (-10) - (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (+3-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp (+21-14) - (modified) 
llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h (+6-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp (+3-9) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+27-18) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1-1) - (modified) llvm/lib/TargetParser/TargetDataLayout.cpp (+3-2) - (added) llvm/test/CodeGen/AMDGPU/addrspacecast-execsync.ll (+132) - (modified) llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll (+30-30) - (modified) llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll (+12-23) - (modified) llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll (+30-30) - (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (+4-4) - (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll (+10-10) - (modified) llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll (+5-8) - (modified) llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/nullptr.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll (+18-29) - (modified) llvm/test/CodeGen/AMDGPU/s-barrier.ll (+25-25) - (modified) llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll (+5-5) - (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+1-1) ``````````diff diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index be7c2f9c89d97..c72e34a9adc8b 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -555,8 +555,21 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const { LangAS AMDGPUTargetCodeGenInfo::adjustGlobalVarAddressSpace( CodeGenModule &CGM, const VarDecl *D, std::optional<LangAS> AS) const { - if (AS) + if (AS) { + // NamedWorkgroupBarrier GVs are declared as __shared__, but the back-end + // models 
them as a separate address space. + const LangOptions &LangOpts = CGM.getLangOpts(); + if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice && + AS == LangAS::cuda_shared) { + const Type *Ty = D->getType().getCanonicalType().getTypePtr(); + if (Ty->isArrayType()) + Ty = Ty->getBaseElementTypeUnsafe(); + const BuiltinType *BTy = dyn_cast<BuiltinType>(Ty); + if (BTy && BTy->getKind() == BuiltinType::AMDGPUNamedWorkgroupBarrier) + return getLangASFromTargetAS(llvm::AMDGPUAS::EXECSYNC); + } return *AS; + } LangAS DefaultGlobalAS = getLangASFromTargetAS( CGM.getContext().getTargetAddressSpace(LangAS::opencl_global)); diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 6df20c728f71b..033ec3907ca6f 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -160,12 +160,12 @@ // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \ // RUN: | FileCheck %s -check-prefix=R600SI -// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" // Test default -target-cpu // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \ // RUN: | FileCheck %s -check-prefix=R600SIDefault -// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// R600SIDefault: target datalayout = 
"e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=AARCH64 diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl index 72ce72644b8ea..fcee9b3b20813 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s -// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" void foo(void) {} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index 14d7e7a365989..aa0fb93db3cc1 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -94,9 +94,9 @@ void test_s_barrier_signal() // CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP1:%.*]] = 
addrspacecast ptr [[TMP0]] to ptr addrspace(15) // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: ret void // void test_s_barrier_signal_var(void *bar, int a) @@ -148,9 +148,9 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c) // CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15) // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: ret void // void test_s_barrier_init(void *bar, int a) @@ -164,8 +164,8 @@ void test_s_barrier_init(void *bar, int a) // CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr // CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) [[TMP1]]) // CHECK-NEXT: ret void // void test_s_barrier_join(void *bar) @@ -208,8 +208,8 @@ unsigned test_s_get_barrier_state(int a) // CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[BAR_ADDR]] to ptr // CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) -// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) [[TMP1]]) // CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4 // CHECK-NEXT: ret i32 [[TMP3]] diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 0b4cdd0c2c28f..9435644f8f530 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1495,8 +1495,8 @@ void test_s_cluster_barrier() // CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr // CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) -// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15) +// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) [[TMP1]]) // CHECK-NEXT: ret void // void test_s_wakeup_barrier(void *bar) @@ -1514,7 +1514,7 @@ void test_s_wakeup_barrier(void *bar) // CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw 
fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] // CHECK-NEXT: ret float [[TMP2]] // float test_global_add_f32(global float *addr, float x) { @@ -1531,7 +1531,7 @@ float test_global_add_f32(global float *addr, float x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_global_add_half2(global half2 *addr, half2 x) { @@ -1548,7 +1548,7 @@ half2 test_global_add_half2(global half2 *addr, half2 x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_flat_add_2f16(generic half2 *addr, half2 x) { @@ -1566,7 +1566,7 @@ 
half2 test_flat_add_2f16(generic half2 *addr, half2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret <2 x i16> [[TMP4]] // @@ -1585,7 +1585,7 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret <2 x i16> [[TMP4]] // diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index fbc05c1732d90..b359daf8bcf45 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -927,6 +927,7 @@ supported for the ``amdgcn`` target. 
*reserved for downstream use (LLPC)* 12 *reserved for future use* 13 *reserved for future use* 14 + Execution Synchronization 15 N/A N/A 32 0xFFFFFFFF Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ @@ -1139,6 +1140,29 @@ supported for the ``amdgcn`` target. a buffer strided pointer, this means that the base pointer is ``align(4)``, that the offset is a multiple of 4 bytes, and that the stride is a multiple of 4. +**Execution Synchronization (execsync)** + This address space represents hardware resources used to synchronize the execution + of wavefronts. It does not map directly to any addressable memory, thus pointers + into this address space: + + * Never alias with any other pointers outside this address space. + * Cannot be dereferenced. + * Can only be consumed by intrinsics. + * Are always uniform. + + Pointers are 32 bits. Due to these pointers being a compiler abstraction without + a corresponding hardware aperture, the back-end handles them as if they were + local pointers; this is why the NULL pointer for the execsync address space is the + same as for local memory. The layout of pointer values into the execsync address space + is also designed so that no valid execsync address can conflict with valid local addresses. + + The pointer layout for ``s_barrier`` IDs (``Bar#``) is: + + * Offset: ``0x802000u``. + * Bits ``[9:11]``: Barrier scope. + * Bits ``[4:8]``: Barrier ID. + * Bits ``[0:3]`` are zeroes. + **Streamout Registers** Dedicated registers used by the GS NGG Streamout Instructions. The register file is modelled as a memory in a distinct address space because it is indexed @@ -1291,10 +1315,8 @@ Named barriers are fixed function hardware barrier objects that are available in gfx12.5+ in addition to the traditional default barriers. 
In LLVM IR, named barriers are represented by global variables of type -``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier -global variables do not occupy actual LDS memory, but their lifetime and -allocation scope matches that of global variables in LDS. Programs in LLVM IR -refer to named barriers using pointers. +``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources +address space. Programs in LLVM IR refer to named barriers using pointers. The following named barrier types are supported in global variables, defined recursively: @@ -1305,14 +1327,14 @@ recursively: .. code-block:: llvm - @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef - @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef - @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef + @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef + @foo = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef + @baz = addrspace(15) global { target("amdgcn.named.barrier", 0) } undef ... - %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0) + %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(15) @foo, i32 0, i32 %i + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %foo.i, i32 0) Named barrier types may not be used in ``alloca``. 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 8631985de9a0a..908d520e0dc69 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -13,6 +13,7 @@ def flat_ptr_ty : LLVMQualPointerType<0>; def global_ptr_ty : LLVMQualPointerType<1>; def local_ptr_ty : LLVMQualPointerType<3>; +def execsync_ptr_ty : LLVMQualPointerType<15>; // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred // by the backend cause whole-program undefined behavior when violated, such as @@ -295,7 +296,7 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa // If %memberCnt is 0, the member count is retained from the previous // s_barrier_init or s_barrier_signal operation. def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, - Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + Intrinsic<[], [execsync_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType) @@ -307,19 +308,19 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri // void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt) // The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined. def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, - Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, + Intrinsic<[], [execsync_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier) // The %barrier argument must be uniform, otherwise behavior is undefined. 
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, - Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + Intrinsic<[], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier) // The %barrier argument must be uniform, otherwise behavior is undefined. def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, - Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + Intrinsic<[], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType) @@ -342,7 +343,7 @@ def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrie // uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier) // The %barrier argument must be uniform, otherwise behavior is undefined. def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">, - Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + Intrinsic<[llvm_i32_ty], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h index 3fe6492584d84..d2c82a0d09778 100644 --- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h +++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h @@ -26,8 +26,7 @@ namespace llvm { /// memory locations. namespace AMDGPUAS { enum : unsigned { - // The maximum value for flat, generic, local, private, constant and region. 
- MAX_AMDGPU_ADDRESS = 9, + MAX_AMDGPU_ADDRESS = 15, FLAT_ADDRESS = 0, ///< Address space for flat memory. ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/195613 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
