llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

**Changes**

Accurately represent both the load and the store part of those intrinsics.

The test changes seem mostly insignificant, caused by subtly different scheduler behavior.

---

**Stack**:
- [4/4] #175846
- [3/4] #175845 ⬅
- [2/4] #175844
- [1/4] #175843

⚠️ *Part of a stack created by [spr](https://github.com/nhaehnle/spr). Merging this PR using the GitHub UI may have unexpected results.*

---

Full diff: https://github.com/llvm/llvm-project/pull/175845.diff

4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+56-51)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll (+3-4)

``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cc239752cd453..5224f2f093755 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1440,8 +1440,23 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_struct_buffer_load_lds:
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+
+    // Entry 0: Load from buffer.
+    // Don't set an offset, since the pointer value always represents the
+    // base of the buffer.
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.ptrVal = CI.getArgOperand(1);
+    Info.flags &= ~MachineMemOperand::MOStore;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to LDS.
+    // Instruction offset is applied, and an additional per-lane offset
+    // which we simulate using a larger memory type.
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+    Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))->getZExtValue();
+    Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
     Infos.push_back(Info);
     return;
   }
@@ -1634,10 +1649,18 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
   case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+    // Entry 0: Load from source (global/flat).
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
-    Info.ptrVal = CI.getArgOperand(1);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(0); // Global pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to LDS (same offset).
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
     Infos.push_back(Info);
     return;
   }
@@ -1645,24 +1668,45 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
   case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+    // Entry 0: Load from LDS.
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
-    Info.ptrVal = CI.getArgOperand(0);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    Infos.push_back(Info);
+
+    // Entry 1: Store to global (same offset).
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Info.ptrVal = CI.getArgOperand(0); // Global pointer
     Infos.push_back(Info);
     return;
   }
   case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
-    Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
-    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.ptrVal = CI.getArgOperand(1);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
-    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+    bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
+
+    // Entry 0: Load from source (global/flat).
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+    Info.ptrVal = CI.getArgOperand(0); // Source pointer
+    Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
+    Info.flags |= MachineMemOperand::MOLoad;
+    if (IsVolatile)
       Info.flags |= MachineMemOperand::MOVolatile;
     Infos.push_back(Info);
+
+    // Entry 1: Store to LDS.
+    // Same offset from the instruction, but an additional per-lane offset is
+    // added. Represent that using a wider memory type.
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+    Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+    Info.flags &= ~MachineMemOperand::MOLoad;
+    Info.flags |= MachineMemOperand::MOStore;
+    Infos.push_back(Info);
     return;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11184,7 +11228,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   SDLoc DL(Op);
   SDValue Chain = Op.getOperand(0);
   unsigned IntrinsicID = Op.getConstantOperandVal(1);
-  MachineFunction &MF = DAG.getMachineFunction();
 
   switch (IntrinsicID) {
   case Intrinsic::amdgcn_exp_compr: {
@@ -11459,29 +11502,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     Ops.push_back(M0Val.getValue(1)); // Glue
 
     auto *M = cast<MemSDNode>(Op);
-    MachineMemOperand *LoadMMO = M->getMemOperand();
-    // Don't set the offset value here because the pointer points to the base of
-    // the buffer.
-    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-
-    MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
-    LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
-    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-
-    auto F = LoadMMO->getFlags() &
-             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
-    LoadMMO =
-        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
-                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
-    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
-        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
-        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
-    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+    DAG.setNodeMemRefs(Load, M->memoperands());
 
     return SDValue(Load, 0);
   }
@@ -11563,25 +11585,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     Ops.push_back(M0Val.getValue(1)); // Glue
 
     auto *M = cast<MemSDNode>(Op);
-    MachineMemOperand *LoadMMO = M->getMemOperand();
-    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-    LoadPtrI.Offset = Op->getConstantOperandVal(5);
-    MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
-    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
-    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-    auto F = LoadMMO->getFlags() &
-             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
-    LoadMMO =
-        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
-                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
-        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
-        LoadMMO->getAAInfo());
-
     auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
-    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+    DAG.setNodeMemRefs(Load, M->memoperands());
 
     return SDValue(Load, 0);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 352af044b0a6d..f66ad928d261d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -267,9 +267,8 @@ main_body:
 define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
 ; GFX90A-LABEL: buffer_load_lds_dword_volatile:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    s_mov_b32 m0, s5
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
@@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture
 ;
 ; GFX942-LABEL: buffer_load_lds_dword_volatile:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    s_mov_b32 m0, s5
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
index 0b40c81af414d..14d2e4ca5f2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -340,18 +340,16 @@ define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gpt
 define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
 ; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    s_mov_b32 m0, s5
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
 ; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    s_mov_b32 m0, s5
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
 ; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
 ; GFX942-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
index a00aca34252b1..74fddfe290818 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -11,15 +11,14 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_add_u32 s4, s0, 64
 ; CHECK-NEXT:    s_addc_u32 s5, s1, 0
 ; CHECK-NEXT:    s_mov_b32 m0, s2
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_lds_dword v0, s[4:5] offset:4
-; CHECK-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, s6
 ; CHECK-NEXT:    global_store_dword v0, v3, s[0:1] offset:64
+; CHECK-NEXT:    global_load_lds_dword v0, s[4:5] offset:4
 ; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s3
``````````

https://github.com/llvm/llvm-project/pull/175845
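
For readers who want the gist without wading through the full diff: the heart of the patch is in `getTgtMemIntrinsic`, which now pushes two `IntrinsicInfo` entries per LDS DMA intrinsic (a pure load from the source pointer, then a pure store to the LDS pointer) instead of one entry flagged as both `MOLoad` and `MOStore`. Below is a condensed sketch of that shape, not the verbatim upstream code; it assumes the surrounding `getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos, ...)` overload introduced earlier in this stack, with `CI` being the intrinsic call and `Info` a partially initialized entry, and it omits the offset and cache-policy handling shown in the diff.

```cpp
case Intrinsic::amdgcn_load_to_lds: {
  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  // Entry 0: the load from the global/flat source operand.
  Info.opc = ISD::INTRINSIC_VOID;
  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
  Info.ptrVal = CI.getArgOperand(0); // source pointer
  Info.flags |= MachineMemOperand::MOLoad;
  Infos.push_back(Info);

  // Entry 1: the store to LDS. The hardware applies an additional per-lane
  // offset on the LDS side, which the patch models conservatively by
  // widening the memory type to cover all 64 lanes of a wave.
  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
  Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
  Info.flags &= ~MachineMemOperand::MOLoad;
  Info.flags |= MachineMemOperand::MOStore;
  Infos.push_back(Info);
  return;
}
```

Because both memory operands are now described accurately up front, `LowerINTRINSIC_VOID` no longer has to rebuild the load/store MMO pair by hand; it simply forwards `M->memoperands()` to the machine node via `DAG.setNodeMemRefs`, which is what the two large deletions in the diff amount to.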
