https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/156595
>From e1dd9629e396f1786903f7cda38072253ffaca00 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Tue, 2 Sep 2025 23:50:28 -0700 Subject: [PATCH 1/2] [AMDGPU] Support cluster_load_async_to_lds instructions on gfx1250 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + ...ins-amdgcn-gfx1250-async-load-store-lds.cl | 40 +++ .../builtins-amdgcn-error-gfx1250-param.cl | 10 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 ++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 18 ++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 11 + .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 19 ++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 36 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +- .../llvm.amdgcn.cluster.load.async.to.lds.ll | 301 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s | 105 +++++- llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s | 24 ++ .../AMDGPU/gfx1250_dasm_vflat.txt | 78 +++++ 16 files changed, 689 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 7b7dbf7043099..24e35cea128e9 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -670,6 +670,10 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b8, "vc*1c*3IiIii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b32, "vi*1i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b64, "vV2i*1V2i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b128, "vV4i*1V4i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl index ccc05f0aa5af3..c645d52cc7e38 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl @@ -5,6 +5,46 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b8(global char* gaddr, local char* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b32(global int* gaddr, local int* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b128(global v4i* gaddr, local v4i* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr, laddr, 16, 0, mask); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 4c61d72703e3c..273c65e6d106d 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -113,6 +113,11 @@ void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8, local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask) { + __builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr8, laddr8, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b8' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr32, laddr32, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b32' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr64, laddr64, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b64' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr128, laddr128, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b128' must be a constant integer}} + __builtin_amdgcn_global_load_async_to_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b8' must be a constant integer}} __builtin_amdgcn_global_load_async_to_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b32' must be a constant integer}} __builtin_amdgcn_global_load_async_to_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b64' must be a constant integer}} @@ -127,6 +132,11 @@ void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *ga void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8, local int *laddr32, local v2i* laddr64, local v4i* laddr128, int cpol, int mask) { + __builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr8, laddr8, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b8' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr32, laddr32, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b32' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr64, laddr64, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b64' must be a constant integer}} + __builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr128, laddr128, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b128' must be a constant integer}} + __builtin_amdgcn_global_load_async_to_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b8' must be a constant integer}} __builtin_amdgcn_global_load_async_to_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b32' must be a constant integer}} __builtin_amdgcn_global_load_async_to_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b64' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3c5ac99512a64..4a91b40f0a2e6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3815,6 +3815,21 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable] >; +// Async instructions increment ASYNCcnt which is modeled as InaccessibleMem. +class AMDGPUAsyncClusterLoadLDS : Intrinsic < + [], + [global_ptr_ty, // Base global pointer to load from + local_ptr_ty, // LDS base pointer to store to + llvm_i32_ty, // offset + llvm_i32_ty, // gfx12+ cachepolicy: + // bits [0-2] = th + // bits [3-4] = scope + llvm_i32_ty], // workgroup broadcast mask (to M0) + [IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>, + NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] +>; + class AMDGPUAsyncGlobalLoadToLDS : Intrinsic < [], [global_ptr_ty, // Base global pointer to load from @@ -3841,6 +3856,15 @@ class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic < "", [SDNPMemOperand] >; +def int_amdgcn_cluster_load_async_to_lds_b8 : + ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b8">, AMDGPUAsyncClusterLoadLDS; +def int_amdgcn_cluster_load_async_to_lds_b32 : + ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b32">, AMDGPUAsyncClusterLoadLDS; +def int_amdgcn_cluster_load_async_to_lds_b64 : + ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b64">, AMDGPUAsyncClusterLoadLDS; +def int_amdgcn_cluster_load_async_to_lds_b128 : + ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b128">, AMDGPUAsyncClusterLoadLDS; + def int_amdgcn_global_load_async_to_lds_b8 : ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS; def int_amdgcn_global_load_async_to_lds_b32 : diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bc88404442c3f..0c112d1787c1a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_saddr_glc : def gi_global_saddr_no_ioffset : GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, GIComplexPatternEquiv<GlobalSAddrNoIOffset>; +def gi_global_saddr_no_ioffset_m0 : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffsetM0">, + GIComplexPatternEquiv<GlobalSAddrNoIOffsetM0>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2734bc27ede3d..3785d0f7f2688 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2137,6 +2137,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index e79585844a01c..4fa0d3f72e1c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -179,6 +179,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { SDValue &CPol) const; bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e8482a9b936b3..12915c7344426 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5734,6 +5734,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( return selectGlobalSAddr(Root, PassedCPol, false); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 194dd6e4099a8..c760fe7ef99dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -261,6 +261,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrNoIOffset(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffsetM0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 08a9ed2714ec0..c5bafc492480d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3322,6 +3322,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 6); // soffset return; } + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(B, MI, 5); + return; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { applyDefaultMapping(OpdMapper); @@ -5483,6 +5491,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32); break; } + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned M0Bank = + getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b64: diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 19f95c5ac4c37..dcb4f506dfbd2 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -12,6 +12,7 @@ let WantsRoot = true in { def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>; + def GlobalSAddrNoIOffsetM0 : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffsetM0", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; @@ -1192,6 +1193,12 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in { +defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b32", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b64", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b128", 1>; +} // End Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; @@ -1368,6 +1375,16 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $dsaddr, $vaddr, $offset, $cpol) @@ -1608,6 +1625,16 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat_M0 <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { def : FlatLoadLDSSignedPat <inst, node> { let AddedComplexity = 10; @@ -2209,6 +2236,11 @@ let OtherPredicates = [isGFX1250Plus] in { defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>; defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_cluster_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_cluster_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_cluster_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_cluster_load_async_to_lds_b128>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; @@ -3507,6 +3539,10 @@ defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>; defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>; defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x6a>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x6b>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x6c>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x6d>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dad5b292893fd..2f5a2bc31caf7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1263,15 +1263,19 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { static unsigned getIntrMemWidth(unsigned IntrID) { switch (IntrID) { case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b8: return 8; case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b32: return 32; case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b64: return 64; case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: case Intrinsic::amdgcn_global_store_async_from_lds_b128: return 128; default: @@ -1556,7 +1560,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: - case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); Info.ptrVal = CI.getArgOperand(1); @@ -1684,6 +1692,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll new file mode 100644 index 0000000000000..f571030077870 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll @@ -0,0 +1,301 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol, i32 %mask) +declare void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol, i32 %mask) +declare void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol, i32 %mask) +declare void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol, i32 %mask) + +define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: s_mov_b32 m0, 15 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, 15 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 15) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_async_to_lds_b8_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b8 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 10, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: s_mov_b32 m0, 15 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, 15 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 15) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr( ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 22, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr_imm_mask( ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: s_movk_i32 m0, 0x7f +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_movk_i32 m0, 0x7f +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 127) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 27, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: s_movk_i32 m0, 0x7f +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_movk_i32 m0, 0x7f +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 127) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_async_to_lds_b128_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b128 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { +; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { +; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1, i32 %mask) + ret void +} + +define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { +; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo +; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1, i32 %mask) + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index 8323b6c9a483f..3a95a018c3979 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -3176,7 +3176,6 @@ cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 // GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU - flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 // GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -3491,6 +3490,110 @@ global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS s // GFX1250: global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU +cluster_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b8 v1, v[2:3], off offset:64 +// GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b8 v1, v[2:3], off offset:-64 +// GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 +// GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 +// GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v[2:3], off offset:64 +// GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v[2:3], off offset:-64 +// GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 +// GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 +// GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v[2:3], off offset:64 +// GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v[2:3], off offset:-64 +// GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 +// GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 +// GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v[2:3], off offset:64 +// GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v[2:3], off offset:-64 +// GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 +// GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 +// GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b32 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x1a,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_async_to_lds_b64 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x1b,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS // GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s index c9fe702ce53d1..2a761d9a3abb5 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s @@ -105,3 +105,27 @@ global_load_async_to_lds_b128 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b8 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b32 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b64 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b128 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +cluster_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index f8dd65ae69da7..d6eb6dd98924b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -3312,6 +3312,84 @@ # GFX1250: global_atomic_max_num_f64 v[0:1], v[2:3], off ; encoding: [0x7c,0x00,0x17,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] 0x7c,0x00,0x17,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 +# GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x40,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x40,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x40,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0xc0,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0xc0,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x00,0x1b,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x00,0x1b,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x00,0x1b,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x80,0x1a,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x80,0x1a,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x80,0x1a,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b32 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x1a,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00] +0x04,0xc0,0x1a,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_async_to_lds_b64 v1, v2, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x1b,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00] +0x04,0x00,0x1b,0xee,0x01,0x00,0x3d,0x00,0x02,0x00,0x00,0x00 + # GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] 0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 >From 7397c5a09e0d520a6b944653a6d31664b3f063fb Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Wed, 3 Sep 2025 00:00:33 -0700 Subject: [PATCH 2/2] [AMDGPU] Fix clang-format --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c5bafc492480d..36b27bef350ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5495,12 +5495,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { - OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); - OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned M0Bank = - getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID); - OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32); - break; + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned M0Bank = + getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32); + break; } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits