https://github.com/nhaehnle updated https://github.com/llvm/llvm-project/pull/175845
From b585dcfa1931fe03d3840ae67ce78996c3a4b0f7 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <[email protected]>
Date: Mon, 12 Jan 2026 09:42:05 -0800
Subject: [PATCH] [AMDGPU] Return two MMOs for load-to-lds and store-from-lds
 intrinsics

Accurately represent both the load and the store part of those
intrinsics.

The test changes are mostly insignificant and are caused by subtly
different scheduler behavior.

commit-id:0269189c
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 108 +++++++++---------
 .../CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll |   6 +-
 ...alizer-lds-dma-volatile-and-nontemporal.ll |   6 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll  |   7 +-
 4 files changed, 64 insertions(+), 63 deletions(-)
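For reference, here is a minimal, hypothetical IR sketch of the simplest
affected intrinsic, llvm.amdgcn.load.to.lds, with the operand layout that
getTgtMemIntrinsic reads below (operand 0: source pointer, operand 1: LDS
destination, operand 2: size in bytes, operand 3: immediate offset, last
operand: cache-policy bits such as VOLATILE). The function name, offset,
and policy value are illustrative, not taken from this patch:

  ; MMO 0 (the load) is backed by the source pointer in operand 0;
  ; MMO 1 (the store) by the LDS destination in operand 1. Both use
  ; the immediate offset from operand 3.
  declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)

  define amdgpu_ps void @sketch(ptr addrspace(1) %src, ptr addrspace(3) %dst) {
    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 4, i32 16, i32 0)
    ret void
  }

With two MMOs, alias analysis and the scheduler can treat the load side
and the LDS store side of such a call independently instead of seeing one
fused load+store operand; the reordering in the test updates below falls
out of that.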
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d21d2e05469ea..1fbde65d90708 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1432,8 +1432,24 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_struct_buffer_load_lds:
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+
+    // Entry 0: Load from buffer.
+    // Don't set an offset, since the pointer value always represents the
+    // base of the buffer.
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.ptrVal = CI.getArgOperand(1);
+    Info.flags &= ~MachineMemOperand::MOStore;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to LDS.
+    // Instruction offset is applied, and an additional per-lane offset
+    // which we simulate using a larger memory type.
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+    Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
+                      ->getZExtValue();
+    Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
     Infos.push_back(Info);
     return;
   }
@@ -1626,10 +1642,18 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+    // Entry 0: Load from source (global/flat).
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
-    Info.ptrVal = CI.getArgOperand(1);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(0); // Global pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to LDS (same offset).
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
     Infos.push_back(Info);
     return;
   }
@@ -1637,24 +1661,45 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
   case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+    // Entry 0: Load from LDS.
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
-    Info.ptrVal = CI.getArgOperand(0);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to global (same offset).
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(0); // Global pointer
     Infos.push_back(Info);
     return;
   }
   case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
-    Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
-    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.ptrVal = CI.getArgOperand(1);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
-    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+    bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
+
+    // Entry 0: Load from source (global/flat).
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+    Info.ptrVal = CI.getArgOperand(0); // Source pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    if (IsVolatile)
       Info.flags |= MachineMemOperand::MOVolatile;
     Infos.push_back(Info);
+
+    // Entry 1: Store to LDS.
+    // Same offset from the instruction, but an additional per-lane offset is
+    // added. Represent that using a wider memory type.
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+    Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Infos.push_back(Info);
     return;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11176,7 +11221,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   SDLoc DL(Op);
   SDValue Chain = Op.getOperand(0);
   unsigned IntrinsicID = Op.getConstantOperandVal(1);
-  MachineFunction &MF = DAG.getMachineFunction();

   switch (IntrinsicID) {
   case Intrinsic::amdgcn_exp_compr: {
@@ -11451,29 +11495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     Ops.push_back(M0Val.getValue(1)); // Glue

     auto *M = cast<MemSDNode>(Op);
-    MachineMemOperand *LoadMMO = M->getMemOperand();
-    // Don't set the offset value here because the pointer points to the base of
-    // the buffer.
-    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-
-    MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
-    LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
-    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-
-    auto F = LoadMMO->getFlags() &
-             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
-    LoadMMO =
-        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
-                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
-    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
-        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
-        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
-    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+    DAG.setNodeMemRefs(Load, M->memoperands());

     return SDValue(Load, 0);
   }
@@ -11555,25 +11578,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     Ops.push_back(M0Val.getValue(1)); // Glue

     auto *M = cast<MemSDNode>(Op);
-    MachineMemOperand *LoadMMO = M->getMemOperand();
-    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-    LoadPtrI.Offset = Op->getConstantOperandVal(5);
-    MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
-    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
-    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-    auto F = LoadMMO->getFlags() &
-             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
-    LoadMMO =
-        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
-                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
-        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
-        LoadMMO->getAAInfo());
     auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
-    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+    DAG.setNodeMemRefs(Load, M->memoperands());

     return SDValue(Load, 0);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 352af044b0a6d..f66ad928d261d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -267,9 +267,8 @@ main_body:
 define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
 ; GFX90A-LABEL: buffer_load_lds_dword_volatile:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    s_mov_b32 m0, s5
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
@@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture
 ;
 ; GFX942-LABEL: buffer_load_lds_dword_volatile:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    s_mov_b32 m0, s5
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
index 0b40c81af414d..14d2e4ca5f2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -340,18 +340,16 @@ define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gpt
 define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
 ; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    s_mov_b32 m0, s5
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    s_mov_b32 m0, s5
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
 ; GFX942-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
index a00aca34252b1..74fddfe290818 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -11,15 +11,14 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_add_u32 s4, s0, 64
 ; CHECK-NEXT:    s_addc_u32 s5, s1, 0
 ; CHECK-NEXT:    s_mov_b32 m0, s2
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_lds_dword v0, s[4:5] offset:4
-; CHECK-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, s6
 ; CHECK-NEXT:    global_store_dword v0, v3, s[0:1] offset:64
+; CHECK-NEXT:    global_load_lds_dword v0, s[4:5] offset:4
 ; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s3
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
