https://gcc.gnu.org/g:6b76dfad9b2c80a43b2e775d0027ba4b636d6022

commit r16-2261-g6b76dfad9b2c80a43b2e775d0027ba4b636d6022
Author: Soumya AR <soum...@nvidia.com>
Date:   Tue Jul 15 19:28:44 2025 +0530

    aarch64: Enable selective LDAPUR generation for cores with RCPC2
    
    This patch adds the ability to fold the address computation into the
    addressing mode for LDAPR instructions using LDAPUR when RCPC2 is available.
    
    LDAPUR emission is enabled by default when RCPC2 is available, but can be
    disabled using the avoid_ldapur tune flag on a per-core basis.
    
    Currently, it is disabled for neoverse-v2, neoverse-v3, cortex-x925, and
    architectures before armv8.8-a.
    
    Earlier, the following code:
    
    uint64_t
    foo (std::atomic<uint64_t> *x)
    {
      return x[1].load(std::memory_order_acquire);
    }
    
    would generate:
    
    foo(std::atomic<unsigned long>*):
            add     x0, x0, 8
            ldapr   x0, [x0]
            ret
    
    but now generates:
    
    foo(std::atomic<unsigned long>*):
            ldapur  x0, [x0, 8]
            ret
    
    The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
    OK for mainline?
    
    Signed-off-by: Soumya AR <soum...@nvidia.com>
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
            Add AVOID_LDAPUR tuning flag.
            * config/aarch64/aarch64.cc (aarch64_adjust_generic_arch_tuning):
            Set AVOID_LDAPUR for architectures before armv8.8-a.
            (aarch64_override_options_internal): Apply generic tuning adjustments
            to generic_armv8_a_tunings and generic_armv9_a_tunings.
            * config/aarch64/aarch64.h (TARGET_ENABLE_LDAPUR): New macro to
            control LDAPUR usage based on RCPC2 and tuning flags.
            * config/aarch64/aarch64.md: Add enable_ldapur attribute.
            * config/aarch64/atomics.md (aarch64_atomic_load<mode>_rcpc): Modify
            to emit LDAPUR for cores with RCPC2.
            (*aarch64_atomic_load<ALLX:mode>_rcpc_zext): Likewise.
            (*aarch64_atomic_load<ALLX:mode>_rcpc_sext): Update constraint to Ust.
            * config/aarch64/tuning_models/cortexx925.h: Add AVOID_LDAPUR flag.
            * config/aarch64/tuning_models/neoversev2.h: Likewise.
            * config/aarch64/tuning_models/neoversev3.h: Likewise.
            * config/aarch64/tuning_models/neoversev3ae.h: Likewise.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/ldapr-sext.c: Update expected output to include
            offsets.
            * gcc.target/aarch64/ldapur.c: New test for LDAPUR.
            * gcc.target/aarch64/ldapur_avoid.c: New test for AVOID_LDAPUR flag.

Diff:
---
 gcc/config/aarch64/aarch64-tuning-flags.def     |  2 +
 gcc/config/aarch64/aarch64.cc                   |  7 ++-
 gcc/config/aarch64/aarch64.h                    |  5 ++
 gcc/config/aarch64/aarch64.md                   | 11 +++-
 gcc/config/aarch64/atomics.md                   | 20 ++++---
 gcc/config/aarch64/tuning_models/cortexx925.h   |  3 +-
 gcc/config/aarch64/tuning_models/neoversev2.h   |  3 +-
 gcc/config/aarch64/tuning_models/neoversev3.h   |  3 +-
 gcc/config/aarch64/tuning_models/neoversev3ae.h |  3 +-
 gcc/testsuite/gcc.target/aarch64/ldapr-sext.c   |  6 +-
 gcc/testsuite/gcc.target/aarch64/ldapur.c       | 77 +++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c | 37 ++++++++++++
 12 files changed, 161 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f2c916e9d770..dd91324e9c80 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -44,6 +44,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+AARCH64_EXTRA_TUNING_OPTION ("avoid_ldapur", AVOID_LDAPUR)
+
 /* Enable is the target prefers to use a fresh register for predicate outputs
    rather than re-use an input predicate register.  */
 AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6e16763f9571..0485f695941c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18793,6 +18793,8 @@ aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
   if (TARGET_SVE2)
     current_tune.extra_tuning_flags
       &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
+  if (!AARCH64_HAVE_ISA(V8_8A))
+    aarch64_tune_params.extra_tuning_flags |= AARCH64_EXTRA_TUNE_AVOID_LDAPUR;
 }
 
 static void
@@ -18857,7 +18859,10 @@ aarch64_override_options_internal (struct gcc_options *opts)
   /* Make a copy of the tuning parameters attached to the core, which
      we may later overwrite.  */
   aarch64_tune_params = *(tune->tune);
-  if (tune->tune == &generic_tunings)
+
+  if (tune->tune == &generic_tunings
+      || tune->tune == &generic_armv8_a_tunings
+      || tune->tune == &generic_armv9_a_tunings)
     aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
 
   if (opts->x_aarch64_override_tune_string)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index d5c4a42e96d9..096c853af7ff 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -493,6 +493,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
   (bool (aarch64_tune_params.extra_tuning_flags \
         & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
 
+/* Enable folding address computation into LDAPUR when RCPC2 is available.  */
+#define TARGET_ENABLE_LDAPUR (TARGET_RCPC2 \
+                             && !(aarch64_tune_params.extra_tuning_flags \
+                                  & AARCH64_EXTRA_TUNE_AVOID_LDAPUR))
+
 /* Combinatorial tests.  */
 
 #define TARGET_SVE2_OR_SME2 \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 27efc9155dcb..a4ae6859da01 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -482,6 +482,8 @@
 ;; clobber for SVE predicates.
 (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
 
+(define_attr "enable_ldapur" "any,no,yes" (const_string "any"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -506,7 +508,14 @@
          (eq_attr "pred_clobber" "yes")
          (match_test "TARGET_SVE_PRED_CLOBBER"))
        (eq_attr "pred_clobber" "any"))
-
+      (ior
+       (and
+         (eq_attr "enable_ldapur" "yes")
+         (match_test "TARGET_ENABLE_LDAPUR"))
+       (and
+         (eq_attr "enable_ldapur" "no")
+         (match_test "!TARGET_ENABLE_LDAPUR"))
+       (eq_attr "enable_ldapur" "any"))
       (ior
        (eq_attr "arch" "any")
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36b0dbd1f57f..ea4a9367fc88 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -679,13 +679,16 @@
 )
 
 (define_insn "aarch64_atomic_load<mode>_rcpc"
-  [(set (match_operand:ALLI 0 "register_operand" "=r")
+  [(set (match_operand:ALLI 0 "register_operand")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "aarch64_sync_memory_operand" "Q")
+      [(match_operand:ALLI 1 "aarch64_rcpc_memory_operand")
        (match_operand:SI 2 "const_int_operand")]                       ;; model
       UNSPECV_LDAP))]
   "TARGET_RCPC"
-  "ldapr<atomic_sfx>\t%<w>0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur  ]
+     [ r        , Q   ; any                   ] ldapr<atomic_sfx>\t%<w>0, %1
+     [ r        , Ust ; yes                   ] ldapur<atomic_sfx>\t%<w>0, %1
+  }
 )
 
 (define_insn "aarch64_atomic_load<mode>"
@@ -705,21 +708,24 @@
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_zext"
-  [(set (match_operand:SD_HSDI 0 "register_operand" "=r")
+  [(set (match_operand:SD_HSDI 0 "register_operand")
     (zero_extend:SD_HSDI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand")
          (match_operand:SI 2 "const_int_operand")]                     ;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC && (<SD_HSDI:sizen> > <ALLX:sizen>)"
-  "ldapr<ALLX:atomic_sfx>\t%w0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur ]
+     [ r        , Q   ; any                  ] ldapr<ALLX:atomic_sfx>\t%w0, %1
+     [ r        , Ust ; yes                  ] ldapur<ALLX:atomic_sfx>\t%w0, %1
+  }
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_sext"
   [(set (match_operand:GPI  0 "register_operand" "=r")
     (sign_extend:GPI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand" "Ust")
          (match_operand:SI 2 "const_int_operand")]                     ;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC2 && (<GPI:sizen> > <ALLX:sizen>)"
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 7d0162eae54c..f448493b1bc5 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -222,7 +222,8 @@ static const struct tune_params cortexx925_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index b000fb465709..266d8f190a25 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index ad3cd222512d..f5566d270dac 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index a0adef00824d..5796e52a2667 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3ae_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c b/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
index f57c09d05806..e8a545a01f9c 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
@@ -33,7 +33,7 @@ TEST(s8_s64, s8, long long)
 /*
 **test_s16_s64:
 **...
-**     ldapursh        x0, \[x[0-9]+\]
+**     ldapursh        x0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
@@ -42,7 +42,7 @@ TEST(s16_s64, s16, long long)
 /*
 **test_s32_s64:
 **...
-**     ldapursw        x0, \[x[0-9]+\]
+**     ldapursw        x0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
@@ -60,7 +60,7 @@ TEST(s8_s32, s8, int)
 /*
 **test_s16_s32:
 **...
-**     ldapursh        w0, \[x[0-9]+\]
+**     ldapursh        w0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur.c b/gcc/testsuite/gcc.target/aarch64/ldapur.c
new file mode 100644
index 000000000000..5c68bdde35dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldapur.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)                            \
+rettype                                                                \
+test_##name (void)                                             \
+{                                                              \
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}                                                              \
+
+
+/*
+** test_u8_u64:
+**     ...
+**     ldapurb w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u8_u64, u8[1], uint64_t)
+
+/*
+** test_u16_u64:
+**     ...
+**     ldapurh w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u16_u64, u16, uint64_t)
+
+/*
+**test_u32_u64:
+**     ...
+**     ldapur  w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u32_u64, u32, uint64_t)
+
+/*
+**test_u64_u64:
+**     ...
+**     ldapur  x[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u64_u64, u64, uint64_t)
+
+/*
+**test_u8_u32:
+**     ...
+**     ldapurb w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u8_u32, u8[1], uint32_t)
+
+/*
+**test_u16_u32:
+**     ...
+**     ldapurh w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u16_u32, u16, uint32_t)
+
+/*
+**test_u32_u32:
+**     ...
+**     ldapur  w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u32_u32, u32, uint32_t)
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c b/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c
new file mode 100644
index 000000000000..ad87a30752a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99 -moverride=tune=avoid_ldapur" } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+/* LDAPUR is only avoided for armv8.4 to armv8.7. This checks for the working
+of avoid_ldapur flag. */
+
+/* { dg-final { scan-assembler-not "ldapur\t" } } */
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)                            \
+rettype                                                                \
+test_##name (void)                                             \
+{                                                              \
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}                                                              \
+
+TEST(u8_u64, u8[1], uint64_t)
+TEST(u16_u64, u16, uint64_t)
+TEST(u32_u64, u32, uint64_t)
+TEST(u64_u64, u64, uint64_t)
+TEST(u8_u32, u8[1], uint32_t)
+TEST(u16_u32, u16, uint32_t)
+TEST(u32_u32, u32, uint32_t)
+
+/* { dg-final { scan-assembler-times "ldapr\t" 3 } } */
+/* { dg-final { scan-assembler-times "ldaprh\t" 2 } } */
+/* { dg-final { scan-assembler-times "ldaprb\t" 2 } } */
+
+

Reply via email to