[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From 5c8ab8ccceedf0d864e9ba8b839b779d0f408f7b Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From d38f4e6bbd8894cdfd0b84b2f677cc8a6d742c6a Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From d38f4e6bbd8894cdfd0b84b2f677cc8a6d742c6a Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From 5c8ab8ccceedf0d864e9ba8b839b779d0f408f7b Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From cdea7dfe63d04d4b2879d7f73408753ff70e20dc Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-13 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From cdea7dfe63d04d4b2879d7f73408753ff70e20dc Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mo

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Alexander Richardson via llvm-branch-commits


@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s12
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s13
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s14
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s15
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s8
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s9
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s10
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s11
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942_PTRAD

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a ready_for_review 
https://github.com/llvm/llvm-project/pull/142738
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From 8f51e2d76b4336f81027905b3c9b711eac7b6406 Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll 
b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mov_b3

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/142738

>From 8f51e2d76b4336f81027905b3c9b711eac7b6406 Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll 
b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mov_b3

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits


@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s12
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s13
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s14
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s15
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s8
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s9
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s10
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s11
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942_PTRAD

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits

ritter-x2a wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142738?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#142739** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142739?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#142738** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142738?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142738?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#141725** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/141725?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/142738
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a created 
https://github.com/llvm/llvm-project/pull/142738

Pre-committing tests to show improvements in a follow-up PR with the
combines.

>From d363847d4c4f3922875c23c69fd0e6e0148c7eff Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Tue, 3 Jun 2025 09:49:19 -0400
Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines

Pre-committing tests to show improvements in a follow-up PR with the
combines.
---
 .../AMDGPU/ptradd-sdag-optimizations.ll   | 207 ++
 1 file changed, 207 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll 
b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+;

[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)

2025-06-04 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)


Changes

Pre-committing tests to show improvements in a follow-up PR with the
combines.

---
Full diff: https://github.com/llvm/llvm-project/pull/142738.diff


1 Files Affected:

- (added) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+207) 


``diff
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll 
b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
new file mode 100644
index 0..0241be9197e1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | 
FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD:   ; %bb.0:
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY:   ; %bb.0:
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD:   ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT:s_nop 1
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18
+; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19
+; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942