https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/146076
Also removes the command line option to control this feature. There seem to be mainly two kinds of test changes: - Some operands of addition instructions are swapped; that is to be expected since PTRADD is not commutative. - Improvements in code generation, probably because the legacy lowering enabled some transformations that were sometimes harmful. For SWDEV-516125. >From bef09f8e86940d327f451c7b0c2639a86991037c Mon Sep 17 00:00:00 2001 From: Fabian Ritter <fabian.rit...@amd.com> Date: Fri, 27 Jun 2025 05:38:52 -0400 Subject: [PATCH] [AMDGPU][SDAG] Enable ISD::PTRADD for 64-bit AS by default Also removes the command line option to control this feature. There seem to be mainly two kinds of test changes: - Some operands of addition instructions are swapped; that is to be expected since PTRADD is not commutative. - Improvements in code generation, probably because the legacy lowering enabled some transformations that were sometimes harmful. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- .../AMDGPU/infer-addrspace-flat-atomic.ll | 14 +-- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 4 +- .../AMDGPU/lower-module-lds-via-table.ll | 16 ++-- .../match-perm-extract-vector-elt-bug.ll | 22 ++--- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 16 ++-- .../AMDGPU/preload-implicit-kernargs.ll | 6 +- .../AMDGPU/promote-constOffset-to-imm.ll | 8 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll | 7 +- .../AMDGPU/ptradd-sdag-optimizations.ll | 94 ++++++------------- .../AMDGPU/ptradd-sdag-undef-poison.ll | 6 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 27 +----- 13 files changed, 83 insertions(+), 155 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 71230078edc69..a3c344ecf962c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -63,14 +63,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. -static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -10599,7 +10591,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 258aa9e299c3d..ed2755ed1e38b 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -11,8 +11,8 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s2, s0 -; CHECK-NEXT: s_addc_u32 s1, s3, s1 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc @@ -69,13 +69,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -7, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm @@ -113,7 +113,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: s_add_u32 s4, s0, -8 ; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 1 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 04abb75c3f912..42ee46bd2c110 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -46,8 +46,8 @@ define void @use_extern_normal() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 @@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 2a7553ae5d92b..0b5ba81b3c24f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -84,8 +84,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index dca9b71a757af..882e05cf9efda 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -49,8 +49,8 @@ define void @f0() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -90,8 +90,8 @@ define void @f1() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -131,8 +131,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -172,8 +172,8 @@ define void @f3() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 4896e504cfdf4..229b3ece6e5ea 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_mul_i32 s14, s14, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s14 -; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc @@ -37,12 +37,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c ; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_mul_i32 s14, s14, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX10-NEXT: v_add3_u32 v2, s5, s14, v0 +; GFX10-NEXT: v_ashrrev_i64 v[4:5], 28, v[1:2] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 @@ -62,21 +62,19 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c ; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s6, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s13, s13, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 272daa9dd0b59..7187ece89ae04 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -388,8 +388,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB2_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -684,8 +684,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB4_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1411,8 +1411,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1889,8 +1889,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB15_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 79b531e3ce785..615740a2d0730 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -277,8 +277,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX942-NEXT: .p2align 8 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: .LBB8_0: -; GFX942-NEXT: s_mov_b32 s4, 8 -; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 +; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s0 @@ -293,8 +292,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB8_0: -; GFX90a-NEXT: s_mov_b32 s0, 8 -; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0xa ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index c4842c1f4f523..a78c3e854b011 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -612,8 +612,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_movk_i32 s1, 0x7f ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader @@ -830,8 +830,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0 +; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll index ff90f1f175c3c..40f39a24d7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s ; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF. @@ -34,7 +33,3 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in store i32 %result, ptr addrspace(1) %out ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX6_LEGACY: {{.*}} -; GFX6_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 64e041103a563..6e552bd1317b8 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG ; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable @@ -24,21 +23,13 @@ define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { } define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { -; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_load_gep_add_reassoc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %add0 = add nuw nsw i64 %voffset, 24 %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 %l = load i64, ptr addrspace(1) %gep0, align 8 @@ -222,23 +213,14 @@ define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 % ; Check that offsets are folded into global addresses if possible. For example, ; this is relevant when using --amdgpu-lower-module-lds-strategy=table. define ptr addrspace(1) @complextype_global_gep(i64 %offset) { -; GFX942_PTRADD-LABEL: complextype_global_gep: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: complextype_global_gep: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1] -; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: complextype_global_gep: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2 ret ptr addrspace(1) %gep1 @@ -431,36 +413,20 @@ define ptr @gep_disjoint_or(ptr %base) { ; Check that AssertAlign nodes between ptradd nodes don't block offset folding, ; taken from preload-implicit-kernargs.ll define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 { -; GFX942_PTRADD-LABEL: random_incorrect_offset: -; GFX942_PTRADD: ; %bb.1: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_branch .LBB21_0 -; GFX942_PTRADD-NEXT: .p2align 8 -; GFX942_PTRADD-NEXT: ; %bb.2: -; GFX942_PTRADD-NEXT: .LBB21_0: -; GFX942_PTRADD-NEXT: s_load_dword s0, s[0:1], 0xa -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[2:3] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: random_incorrect_offset: -; GFX942_LEGACY: ; %bb.1: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_branch .LBB21_0 -; GFX942_LEGACY-NEXT: .p2align 8 -; GFX942_LEGACY-NEXT: ; %bb.2: -; GFX942_LEGACY-NEXT: .LBB21_0: -; GFX942_LEGACY-NEXT: s_mov_b32 s4, 8 -; GFX942_LEGACY-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[2:3] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: random_incorrect_offset: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB21_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB21_0: +; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 %load = load i32, ptr addrspace(4) %gep diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll index 1934ce395e63d..e7c715f0a38bf 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode. ; If any additions are generated for these tests, the folds don't work. @@ -44,6 +43,3 @@ define ptr @undef_base(ptr %p, i64 %offset) { %gep1 = getelementptr i8, ptr undef, i64 %offset ret ptr %gep1 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 1c4a9547ed189..42158f18da525 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -1,14 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 ; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address ; spaces since PTRADD is currently only used for these. @@ -509,15 +504,3 @@ entry: store i32 %val, ptr addrspace(1) %gep.to, align 4 ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10_LEGACY: {{.*}} -; GFX10_PTRADD: {{.*}} -; GFX11_LEGACY: {{.*}} -; GFX11_PTRADD: {{.*}} -; GFX12_LEGACY: {{.*}} -; GFX12_PTRADD: {{.*}} -; GFX8_LEGACY: {{.*}} -; GFX8_PTRADD: {{.*}} -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits