https://gcc.gnu.org/g:41f40585570f3995dc53713e86c996c8c535dd75
commit 41f40585570f3995dc53713e86c996c8c535dd75 Author: Arsen Arsenović <[email protected]> Date: Mon Mar 16 11:32:44 2026 +0100 gcc/gcn: Use 'shared_base' register for LDS<->flat conversion This speeds up address space conversions by about 8x in micro-benchmarks. Those may be slightly unrepresentative, because the RTL optimizer is free to do whatever it wants with the two registers address space conversion uses (a temporary register and shared_base). gcc/ChangeLog: * config/gcn/gcn.cc (gcn_addr_space_convert): Use SHARED_BASE_REG to find flat address space base of LDS. gcc/testsuite/ChangeLog: * gcc.target/gcn/addr-space-convert-1.c: New test. * gcc.target/gcn/addr-space-convert-2.c: New test. (cherry picked from commit 85d0911203158baa1eda6f5191b1d5f6ad153b69) Diff: --- gcc/config/gcn/gcn.cc | 34 +++++++++++++--------- .../gcc.target/gcn/addr-space-convert-1.c | 8 +++++ .../gcc.target/gcn/addr-space-convert-2.c | 13 +++++++++ 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 9e2c2003dc2a..c4bfc707f047 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -1899,21 +1899,27 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type) if (AS_LDS_P (as_from) && AS_FLAT_P (as_to)) { - /* The high bits of the QUEUE_PTR_ARG register are used by - GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */ - rtx queue_reg = gen_rtx_REG (DImode, - cfun->machine->args.reg[QUEUE_PTR_ARG]); - rtx queue_ptr = gen_reg_rtx (DImode); - emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff))); - rtx group_seg_aperture_hi = gen_rtx_MEM (SImode, - gen_rtx_PLUS (DImode, queue_ptr, - gen_int_mode (64, SImode))); - rtx tmp = gen_reg_rtx (DImode); - + /* The LDS based pointer is held in SHARED_BASE. + + Per: + + For GFX9-GFX11 the aperture base addresses are directly available as + inline constant registers SRC_SHARED_BASE/LIMIT and + SRC_PRIVATE_BASE/LIMIT. In 64-bit address mode the aperture sizes + are 2^32 bytes and the base is aligned to 2^32 which makes it easier + to convert from flat to segment or segment to flat. + -- User Guide for AMDGPU Backend (LLVM) + + ... we can safely assume that the SImode low-part of SHARED_BASE_REG + contains all zeroes. As OP is an LDS address, it is 32-bit. Ergo, + SHARED_BASE_REG+OP is equivalent to SHARED_BASE_REG|OP. If + SHARED_BASE_REG is in r[N:N+1], then, writing OP to rN should suffice. + Ergo, this conversion can be implemented as two moves. */ + rtx group_seg_aperture = gen_rtx_REG (Pmode, SHARED_BASE_REG); + rtx tmp = gen_reg_rtx (Pmode); + + emit_move_insn (tmp, group_seg_aperture); emit_move_insn (gen_lowpart (SImode, tmp), op); - emit_move_insn (gen_highpart_mode (SImode, DImode, tmp), - group_seg_aperture_hi); - return tmp; } else if (as_from == as_to) diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c new file mode 100644 index 000000000000..3928842c3bb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } + { dg-options "-O -Wall" } */ + +void __flat * +convert_lds_addr (void __lds *x) +{ return x; } + +/* { dg-final { scan-assembler "shared_base" } } */ diff --git a/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c new file mode 100644 index 000000000000..37f6c3771a96 --- /dev/null +++ b/gcc/testsuite/gcc.target/gcn/addr-space-convert-2.c @@ -0,0 +1,13 @@ +/* { dg-do run } + { dg-options "-O -Wall" } */ + +int +main () +{ + int __lds *testptr = (int __lds *)(__UINTPTR_TYPE__)8; + *testptr = 4; + + int __flat *testptr_flat = testptr; + if (*testptr_flat != 4) + return 1; +}
