This speeds up LDS-to-flat address space conversions by about 8x in
micro-benchmarks. Those figures may be slightly unrepresentative,
because the RTL optimizer is free to do whatever it wants with the two
registers that the conversion uses (a temporary register and
shared_base).

gcc/ChangeLog:

	* config/gcn/gcn.cc (gcn_addr_space_convert): Use
	SHARED_BASE_REG to find flat address space base of LDS.

gcc/testsuite/ChangeLog:

	* gcc.target/gcn/addr-space-convert-1.c: New test.
	* gcc.target/gcn/addr-space-convert-2.c: New test.
---
gcc/config/gcn/gcn.cc | 32 +++++++++++--------
.../gcc.target/gcn/addr-space-convert-1.c | 8 +++++
.../gcc.target/gcn/addr-space-convert-2.c | 13 ++++++++
3 files changed, 40 insertions(+), 13 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
create mode 100644 gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a8cadb8651f6..70fdf4e7f1bb 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -1898,21 +1898,27 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
{
- /* The high bits of the QUEUE_PTR_ARG register are used by
- GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */
- rtx queue_reg = gen_rtx_REG (DImode,
- cfun->machine->args.reg[QUEUE_PTR_ARG]);
- rtx queue_ptr = gen_reg_rtx (DImode);
- emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff)));
- rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
- gen_rtx_PLUS (DImode, queue_ptr,
- gen_int_mode (64, SImode)));
- rtx tmp = gen_reg_rtx (DImode);
+ /* The LDS base pointer is held in SHARED_BASE.
+ Per:
+
+ For GFX9-GFX11 the aperture base addresses are directly available as
+ inline constant registers SRC_SHARED_BASE/LIMIT and
+ SRC_PRIVATE_BASE/LIMIT. In 64-bit address mode the aperture sizes
+ are 2^32 bytes and the base is aligned to 2^32 which makes it easier
+ to convert from flat to segment or segment to flat.
+ -- User Guide for AMDGPU Backend (LLVM)
+
+ ... we can safely assume that the SImode low-part of SHARED_BASE_REG
+ contains all zeroes. As OP is an LDS address, it is 32-bit. Ergo,
+ SHARED_BASE_REG+OP is equivalent to SHARED_BASE_REG|OP. If a copy of
+ SHARED_BASE_REG is in r[N:N+1], then writing OP to rN suffices.
+ Ergo, this conversion can be implemented as two moves. */
+ rtx group_seg_aperture = gen_rtx_REG (Pmode, SHARED_BASE_REG);
+ rtx tmp = gen_reg_rtx (Pmode);
+
+ emit_move_insn (tmp, group_seg_aperture);
emit_move_insn (gen_lowpart (SImode, tmp), op);
- emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
- group_seg_aperture_hi);
-
return tmp;
}
else if (as_from == as_to)
diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
new file mode 100644
index 000000000000..3928842c3bb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile }
+ { dg-options "-O -Wall" } */
+
+void __flat *
+convert_lds_addr (void __lds *x)
+{ return x; }
+
+/* { dg-final { scan-assembler "shared_base" } } */
diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c
new file mode 100644
index 000000000000..37f6c3771a96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c
@@ -0,0 +1,13 @@
+/* { dg-do run }
+ { dg-options "-O -Wall" } */
+
+int
+main ()
+{
+ int __lds *testptr = (int __lds *)(__UINTPTR_TYPE__)8;
+ *testptr = 4;
+
+ int __flat *testptr_flat = testptr;
+ if (*testptr_flat != 4)
+ return 1;
+}
--
2.53.0