This speeds up address space conversions by about 8x in
micro-benchmarks.  Those figures may be slightly unrepresentative, because
the RTL optimizer is free to do whatever it wants with the two registers
that the address space conversion uses (a temporary register and
shared_base).

gcc/ChangeLog:

        * config/gcn/gcn.cc (gcn_addr_space_convert): Use
        SHARED_BASE_REG to find flat address space base of LDS.

gcc/testsuite/ChangeLog:

        * gcc.target/gcn/addr-space-convert-1.c: New test.
        * gcc.target/gcn/addr-space-convert-2.c: New test.
---
 gcc/config/gcn/gcn.cc                         | 32 +++++++++++--------
 .../gcc.target/gcn/addr-space-convert-1.c     |  8 +++++
 .../gcc.target/gcn/addr-space-convert-2.c     | 13 ++++++++
 3 files changed, 40 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
 create mode 100644 gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c

diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a8cadb8651f6..70fdf4e7f1bb 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -1898,21 +1898,27 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
 
   if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
     {
-      /* The high bits of the QUEUE_PTR_ARG register are used by
-        GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out.  */
-      rtx queue_reg = gen_rtx_REG (DImode,
-                                  cfun->machine->args.reg[QUEUE_PTR_ARG]);
-      rtx queue_ptr = gen_reg_rtx (DImode);
-      emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff)));
-      rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
-                                    gen_rtx_PLUS (DImode, queue_ptr,
-                                                  gen_int_mode (64, SImode)));
-      rtx tmp = gen_reg_rtx (DImode);
+      /* The LDS base pointer is held in SHARED_BASE.
 
+        Per:
+
+          For GFX9-GFX11 the aperture base addresses are directly available as
+          inline constant registers SRC_SHARED_BASE/LIMIT and
+          SRC_PRIVATE_BASE/LIMIT. In 64-bit address mode the aperture sizes
+          are 2^32 bytes and the base is aligned to 2^32 which makes it easier
+          to convert from flat to segment or segment to flat.
+          -- User Guide for AMDGPU Backend (LLVM)
+
+        ... we can safely assume that the SImode low-part of SHARED_BASE_REG
+        contains all zeroes.  As OP is an LDS address, it is 32-bit.  Ergo,
+        SHARED_BASE_REG+OP is equivalent to SHARED_BASE_REG|OP.  If
+        SHARED_BASE_REG is in r[N:N+1], then, writing OP to rN should suffice.
+        Ergo, this conversion can be implemented as two moves.  */
+      rtx group_seg_aperture = gen_rtx_REG (Pmode, SHARED_BASE_REG);
+      rtx tmp = gen_reg_rtx (Pmode);
+
+      emit_move_insn (tmp, group_seg_aperture);
       emit_move_insn (gen_lowpart (SImode, tmp), op);
-      emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
-                     group_seg_aperture_hi);
-
       return tmp;
     }
   else if (as_from == as_to)
diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
new file mode 100644
index 000000000000..3928842c3bb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile }
+   { dg-options "-O -Wall" } */
+
+void __flat *
+convert_lds_addr (void __lds *x)
+{ return x; }
+
+/* { dg-final { scan-assembler "shared_base" } }  */
diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c
new file mode 100644
index 000000000000..37f6c3771a96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c
@@ -0,0 +1,13 @@
+/* { dg-do run }
+   { dg-options "-O -Wall" } */
+
+int
+main ()
+{
+  int __lds *testptr = (int __lds *)(__UINTPTR_TYPE__)8;
+  *testptr = 4;
+
+  int __flat *testptr_flat = testptr;
+  if (*testptr_flat != 4)
+    return 1;
+}
-- 
2.53.0

Reply via email to