[gcc r14-10399] Aarch64, bugfix: Fix NEON bigendian addp intrinsic [PR114890]

2024-07-09 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:72753ec82076d15443c32aac88a8c0fa0ab4bc2f

commit r14-10399-g72753ec82076d15443c32aac88a8c0fa0ab4bc2f
Author: Alfie Richards 
Date:   Thu Jul 4 09:09:19 2024 +0200

Aarch64, bugfix: Fix NEON bigendian addp intrinsic [PR114890]

This change removes code that erroneously switched the operands in
big-endian mode.
It also fixes the related test.
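
For reference, a minimal standalone sketch of the intrinsic's documented
behaviour (not part of the patch; the committed regression test is the
check-function-bodies file in the diff below).  vpadd_s8 pairwise-adds its
first argument into the low half of the result and its second argument into
the high half, independent of target endianness, which is why the removed
big-endian operand swap was wrong:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  const int8_t ai[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const int8_t bi[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  int8_t out[8];

  /* Expected lanes: pairwise sums of ai (3 7 11 15) followed by pairwise
     sums of bi (21 25 29 33), on little- and big-endian alike.  */
  vst1_s8 (out, vpadd_s8 (vld1_s8 (ai), vld1_s8 (bi)));

  for (int i = 0; i < 8; i++)
    printf ("%d ", out[i]);
  printf ("\n");
  return 0;
}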

gcc/ChangeLog:

PR target/114890
* config/aarch64/aarch64-simd.md: Remove bigendian operand swap.

gcc/testsuite/ChangeLog:

PR target/114890
* gcc.target/aarch64/vector_intrinsics_asm.c: Remove xfail.

(cherry picked from commit 11049cdf204bc96bc407e5dd44ed3b8a492f405a)

Diff:
---
 gcc/config/aarch64/aarch64-simd.md |   2 -
 .../gcc.target/aarch64/vector_intrinsics_asm.c | 371 +
 2 files changed, 371 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278c..33ab0741e87c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7363,8 +7363,6 @@
   nunits /= 2;
 rtx par_even = aarch64_gen_stepped_int_parallel (nunits, 0, 2);
 rtx par_odd = aarch64_gen_stepped_int_parallel (nunits, 1, 2);
-if (BYTES_BIG_ENDIAN)
-  std::swap (operands[1], operands[2]);
 emit_insn (gen_aarch64_addp_insn (operands[0], operands[1],
operands[2], par_even, par_odd));
 DONE;
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_intrinsics_asm.c 
b/gcc/testsuite/gcc.target/aarch64/vector_intrinsics_asm.c
new file mode 100644
index ..e3dcd0830c84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_intrinsics_asm.c
@@ -0,0 +1,371 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_neon.h"
+
+// SIGNED VADD INTRINSICS
+
+/*
+**test_vadd_s8:
+** addp v0\.8b, v0\.8b, v1\.8b
+** ret
+*/
+int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
+ int8x8_t v3 = vpadd_s8(v1, v2);
+ return v3;
+}
+
+/*
+**test_vadd_s16:
+**addp v0\.4h, v0\.4h, v1\.4h
+**ret
+*/
+int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
+ int16x4_t v3 = vpadd_s16(v1, v2);
+ return v3;
+}
+
+/*
+**test_vadd_s32:
+** addp v0\.2s, v0\.2s, v1\.2s
+** ret
+*/
+int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
+ int32x2_t v3 = vpadd_s32(v1, v2);
+ return v3;
+}
+
+/*
+**test_vaddq_s8:
+**...
+** addp v0\.16b, v0\.16b, v1\.16b
+** ret
+*/
+int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
+ int8x16_t v3 = vpaddq_s8(v1, v2);
+ return v3;
+}
+
+/*
+**test_vaddq_s16:
+**...
+** addp v0\.8h, v0\.8h, v1\.8h
+** ret
+*/
+int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
+ int16x8_t v3 = vpaddq_s16(v1, v2);
+ return v3;
+}
+
+/*
+**test_vaddq_s32:
+**...
+** addp v0\.4s, v0\.4s, v1\.4s
+** ret
+*/
+int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
+ int32x4_t v3 = vpaddq_s32(v1, v2);
+ return v3;
+}
+
+/*
+**test_vaddq_s64:
+**...
+** addp v0\.2d, v0\.2d, v1\.2d
+** ret
+*/
+int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
+ int64x2_t v3 = vpaddq_s64(v1, v2);
+ return v3;
+}
+
+/*
+**test_vaddd_s64:
+**...
+** addp (d[0-9]+), v0\.2d
+** fmov x0, \1
+** ret
+*/
+int64_t test_vaddd_s64(int64x2_t v1) {
+ int64_t v2 = vpaddd_s64(v1);
+ return v2;
+}
+
+/*
+**test_vaddl_s8:
+**...
+** saddlp  v0\.4h, v0\.8b
+** ret
+*/
+int16x4_t test_vaddl_s8(int8x8_t v1) {
+ int16x4_t v2 = vpaddl_s8(v1);
+ return v2;
+}
+
+/*
+**test_vaddlq_s8:
+**...
+** saddlp  v0\.8h, v0\.16b
+** ret
+*/
+int16x8_t test_vaddlq_s8(int8x16_t v1) {
+ int16x8_t v2 = vpaddlq_s8(v1);
+ return v2;
+}
+/*
+**test_vaddl_s16:
+**...
+** saddlp  v0\.2s, v0\.4h
+** ret
+*/
+int32x2_t test_vaddl_s16(int16x4_t v1) {
+ int32x2_t v2 = vpaddl_s16(v1);
+ return v2;
+}
+
+/*
+**test_vaddlq_s16:
+**...
+** saddlp  v0\.4s, v0\.8h
+** ret
+*/
+int32x4_t test_vaddlq_s16(int16x8_t v1) {
+ int32x4_t v2 = vpaddlq_s16(v1);
+ return v2;
+}
+
+/*
+**test_vaddl_s32:
+**...
+** saddlp  v0\.1d, v0\.2s
+** ret
+*/
+int64x1_t test_vaddl_s32(int32x2_t v1) {
+ int64x1_t v2 = vpaddl_s32(v1);
+ return v2;
+}
+
+/*
+**test_vaddlq_s32:
+**...
+** saddlp  v0\.2d, v0\.4s
+** ret
+*/
+int64x2_t test_vaddlq_s32(int32x4_t v1) {
+ int64x2_t v2 = vpaddlq_s32(v1);
+ return v2;
+}
+
+// UNSIGNED VADD INTRINSICS
+
+/*
+**test_vadd_u8:
+**...
+** addp v0\.8b, v0\.8b, v1\.8b
+** ret
+*/
+uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
+ uint8x8_t v3 = vpadd_u8(v1, v2);
+ return v3;
+}
+
+/*
+**test_vadd_u16:
+**...
+** addp v0\.4h, v0\.4h, v1\.4h
+** ret
+*/
+uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
+ uint16x4_t v3 = vpadd_u16(v1, v2);
+ return v3;
+}
+
+/*
+**test_vadd_u32:
+**...
+** addp

[gcc r14-10398] Arm: Fix ldrd offset range [PR115153]

2024-07-09 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:83332e3f808b146ca06dbc6a91d15bd3e5650658

commit r14-10398-g83332e3f808b146ca06dbc6a91d15bd3e5650658
Author: Wilco Dijkstra 
Date:   Fri Jul 5 17:31:25 2024 +0100

Arm: Fix ldrd offset range [PR115153]

The valid offset range of LDRD in arm_legitimate_index_p is erroneously
increased to -1024..1020 when NEON is enabled, since VALID_NEON_DREG_MODE
includes DImode.  Fix this by moving the LDRD check earlier.
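
As a hedged illustration of the failure mode (the committed test is
gcc.target/arm/pr115153.c; the function below is an assumed example, not that
test): a DImode access whose offset fits the NEON D-register range
(-1024..1020) but not the LDRD range (-255..255) could be accepted as a
legitimate address and later require an offset that LDRD cannot encode.

/* Assumed example, for an Arm target with both NEON and LDRD available.  */
long long
read64 (long long *p)
{
  return p[65];   /* byte offset 520: fine for vldr, out of range for ldrd */
}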

gcc:
PR target/115153
* config/arm/arm.cc (arm_legitimate_index_p): Move LDRD case before
NEON.
(thumb2_legitimate_index_p): Update comments.
(output_move_neon): Use DFmode for vldr/vstr and non-checking
adjust_address.

gcc/testsuite:
PR target/115153
* gcc.target/arm/pr115153.c: Add new test.
* lib/target-supports.exp: Add arm_arch_v7ve_neon target support.

(cherry picked from commit 44e5ecfd261afe72aa04eba4bf1a9ec782579cab)

Diff:
---
 gcc/config/arm/arm.cc   | 59 +
 gcc/testsuite/gcc.target/arm/pr115153.c | 16 +
 gcc/testsuite/lib/target-supports.exp   |  2 ++
 3 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d7..912f2c315769 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -8852,6 +8852,28 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
+  if (arm_address_register_rtx_p (index, strict_p)
+  && (GET_MODE_SIZE (mode) <= 4))
+return 1;
+
+  /* This handles DFmode only if !TARGET_HARD_FLOAT.  */
+  if (mode == DImode || mode == DFmode)
+{
+  if (code == CONST_INT)
+   {
+ HOST_WIDE_INT val = INTVAL (index);
+
+ /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+If vldr is selected it uses arm_coproc_mem_operand.  */
+ if (TARGET_LDRD)
+   return val > -256 && val < 256;
+ else
+   return val > -4096 && val < 4092;
+   }
+
+  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
+}
+
   /* For quad modes, we restrict the constant offset to be slightly less
  than what the instruction format permits.  We do this because for
  quad mode moves, we will actually decompose them into two separate
@@ -8864,7 +8886,7 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -8877,27 +8899,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
-  if (arm_address_register_rtx_p (index, strict_p)
-  && (GET_MODE_SIZE (mode) <= 4))
-return 1;
-
-  if (mode == DImode || mode == DFmode)
-{
-  if (code == CONST_INT)
-   {
- HOST_WIDE_INT val = INTVAL (index);
-
- /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
-If vldr is selected it uses arm_coproc_mem_operand.  */
- if (TARGET_LDRD)
-   return val > -256 && val < 256;
- else
-   return val > -4096 && val < 4092;
-   }
-
-  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
-}
-
   if (GET_MODE_SIZE (mode) <= 4
   && ! (arm_arch4
&& (mode == HImode
@@ -9000,7 +9001,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -9011,6 +9012,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
   && (GET_MODE_SIZE (mode) <= 4))
 return 1;
 
+  /* This handles DImode if !TARGET_NEON, and DFmode if !TARGET_VFP_BASE.  */
   if (mode == DImode || mode == DFmode)
 {
   if (code == CONST_INT)
@@ -20859,10 +20861,9 @@ output_move_neon (rtx *operands)
int overlap = -1;
for (i = 0; i < nregs; i++)
  {
-   /* We're only using DImode here because it's a convenient
-  size.  */
-   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
-   ops[1] = adjust_address (mem, DImode, 8 * i);
+   /* Use DFmode for vldr/vstr.  */
+   ops[0] = gen_rtx_REG (DFmode, REGNO (reg) + 

[gcc r15-1865] Arm: Fix ldrd offset range [PR115153]

2024-07-05 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:44e5ecfd261afe72aa04eba4bf1a9ec782579cab

commit r15-1865-g44e5ecfd261afe72aa04eba4bf1a9ec782579cab
Author: Wilco Dijkstra 
Date:   Fri Jul 5 17:31:25 2024 +0100

Arm: Fix ldrd offset range [PR115153]

The valid offset range of LDRD in arm_legitimate_index_p is erroneously
increased to -1024..1020 when NEON is enabled, since VALID_NEON_DREG_MODE
includes DImode.  Fix this by moving the LDRD check earlier.

gcc:
PR target/115153
* config/arm/arm.cc (arm_legitimate_index_p): Move LDRD case before
NEON.
(thumb2_legitimate_index_p): Update comments.
(output_move_neon): Use DFmode for vldr/vstr and non-checking
adjust_address.

gcc/testsuite:
PR target/115153
* gcc.target/arm/pr115153.c: Add new test.
* lib/target-supports.exp: Add arm_arch_v7ve_neon target support.

Diff:
---
 gcc/config/arm/arm.cc   | 59 +
 gcc/testsuite/gcc.target/arm/pr115153.c | 16 +
 gcc/testsuite/lib/target-supports.exp   |  2 ++
 3 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index bb9c7c3b5c4..459b7e648ab 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -8858,6 +8858,28 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
+  if (arm_address_register_rtx_p (index, strict_p)
+  && (GET_MODE_SIZE (mode) <= 4))
+return 1;
+
+  /* This handles DFmode only if !TARGET_HARD_FLOAT.  */
+  if (mode == DImode || mode == DFmode)
+{
+  if (code == CONST_INT)
+   {
+ HOST_WIDE_INT val = INTVAL (index);
+
+ /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+If vldr is selected it uses arm_coproc_mem_operand.  */
+ if (TARGET_LDRD)
+   return val > -256 && val < 256;
+ else
+   return val > -4096 && val < 4092;
+   }
+
+  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
+}
+
   /* For quad modes, we restrict the constant offset to be slightly less
  than what the instruction format permits.  We do this because for
  quad mode moves, we will actually decompose them into two separate
@@ -8870,7 +8892,7 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -8883,27 +8905,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
-  if (arm_address_register_rtx_p (index, strict_p)
-  && (GET_MODE_SIZE (mode) <= 4))
-return 1;
-
-  if (mode == DImode || mode == DFmode)
-{
-  if (code == CONST_INT)
-   {
- HOST_WIDE_INT val = INTVAL (index);
-
- /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
-If vldr is selected it uses arm_coproc_mem_operand.  */
- if (TARGET_LDRD)
-   return val > -256 && val < 256;
- else
-   return val > -4096 && val < 4092;
-   }
-
-  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
-}
-
   if (GET_MODE_SIZE (mode) <= 4
   && ! (arm_arch4
&& (mode == HImode
@@ -9006,7 +9007,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -9017,6 +9018,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
   && (GET_MODE_SIZE (mode) <= 4))
 return 1;
 
+  /* This handles DImode if !TARGET_NEON, and DFmode if !TARGET_VFP_BASE.  */
   if (mode == DImode || mode == DFmode)
 {
   if (code == CONST_INT)
@@ -20865,10 +20867,9 @@ output_move_neon (rtx *operands)
int overlap = -1;
for (i = 0; i < nregs; i++)
  {
-   /* We're only using DImode here because it's a convenient
-  size.  */
-   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
-   ops[1] = adjust_address (mem, DImode, 8 * i);
+   /* Use DFmode for vldr/vstr.  */
+   ops[0] = gen_rtx_REG (DFmode, REGNO (reg) + 2 * i);
+   ops[1] = adjust_address_nv (mem, DFmode, 8 * i);

[gcc r12-10603] AArch64: Fix strict-align cpymem/setmem [PR103100]

2024-07-05 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:b9d16d8361a9e3a82a2f21e759e760d235d43322

commit r12-10603-gb9d16d8361a9e3a82a2f21e759e760d235d43322
Author: Wilco Dijkstra 
Date:   Wed Oct 25 16:28:04 2023 +0100

AArch64: Fix strict-align cpymem/setmem [PR103100]

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
With STRICT_ALIGNMENT, block the expansion if the alignment is less than 16.
Clean up the condition for when to use MOPS.
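
A hedged sketch of the case this addresses (an assumed example, not the
committed test): under -mstrict-align, copies and sets of buffers with only
small known alignment must not be inlined with wider-than-aligned accesses,
so the expander now defers them to MOPS (when available) or a library call.

/* Assumed examples; compile for aarch64 with -mstrict-align.  */
void
copy32 (char *dst, const char *src)
{
  /* Only byte alignment is known, so the 16-byte inline sequence must
     not be used when STRICT_ALIGNMENT is in effect.  */
  __builtin_memcpy (dst, src, 32);
}

void
zero32 (char *dst)
{
  __builtin_memset (dst, 0, 32);
}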

gcc/ChangeLog:
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Cleanup condition for using MOPS.
(aarch64_expand_setmem): Likewise.

(cherry picked from commit 318f5232cfb3e0c9694889565e1f5424d0354463)

Diff:
---
 gcc/config/aarch64/aarch64.cc | 52 ++-
 gcc/config/aarch64/aarch64.md |  4 ++--
 2 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f8082c4035e..cd2f4053a1a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24782,27 +24782,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -24966,12 +24962,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -24979,10 +24976,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -24995,12 +24995,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25042,10 +25036,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do 

[gcc r14-10383] Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

2024-07-05 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:100d353e545564931efaac90a089a4e8f3d42e6e

commit r14-10383-g100d353e545564931efaac90a089a4e8f3d42e6e
Author: Wilco Dijkstra 
Date:   Tue Jul 2 17:37:04 2024 +0100

Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

A Thumb-1 memory operand allows single-register LDMIA/STMIA.  This doesn't
get printed as LDR/STR with writeback in unified syntax, resulting in strange
assembler errors if writeback is selected.  To work around this, use the 'Uw'
constraint that blocks writeback.  Also use a new 'mem_and_no_t1_wback_op'
predicate, which is a general memory operand that disallows writeback in
Thumb-1.  A few other patterns were using 'm' for Thumb-1 in a similar way;
update these to also use 'mem_and_no_t1_wback_op' and 'Uw'.
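
A hedged sketch consistent with the (truncated) test shown in the diff below;
the memory-order argument and the exact options are assumptions.  Compiled as
Thumb-1 (for example Armv6-M) at -O2, the atomic store pattern could be given
a post-increment (writeback) address that a plain STR cannot encode, leading
to assembler errors:

/* Assumed reproducer; compile with -O2 for an Armv6-M target.  */
void
init (int *p, int n)
{
  for (int i = 0; i < n; i++)
    __atomic_store_4 (p + i, 0, __ATOMIC_RELAXED);
}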

gcc:
PR target/115188
* config/arm/arm.md (unaligned_loadsi): Use 'Uw' constraint and
'mem_and_no_t1_wback_op'.
(unaligned_loadhiu): Likewise.
(unaligned_storesi): Likewise.
(unaligned_storehi): Likewise.
* config/arm/predicates.md (mem_and_no_t1_wback_op): Add new 
predicate.
* config/arm/sync.md (arm_atomic_load): Use 'Uw' constraint.
(arm_atomic_store): Likewise.

gcc/testsuite:
PR target/115188
* gcc.target/arm/pr115188.c: Add new test.

(cherry picked from commit d04c5537f5ae4a3acd3f5135347d7e2d8c218811)

Diff:
---
 gcc/config/arm/arm.md   |  8 
 gcc/config/arm/predicates.md|  5 +
 gcc/config/arm/sync.md  |  4 ++--
 gcc/testsuite/gcc.target/arm/pr115188.c | 10 ++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 1fd00146ca9..13a8fbf7a14 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -5011,7 +5011,7 @@
 
 (define_insn "unaligned_loadsi"
   [(set (match_operand:SI 0 "s_register_operand" "=l,l,r")
-   (unspec:SI [(match_operand:SI 1 "memory_operand" "m,Uw,m")]
+   (unspec:SI [(match_operand:SI 1 "mem_and_no_t1_wback_op" "Uw,Uw,m")]
   UNSPEC_UNALIGNED_LOAD))]
   "unaligned_access"
   "@
@@ -5041,7 +5041,7 @@
 (define_insn "unaligned_loadhiu"
   [(set (match_operand:SI 0 "s_register_operand" "=l,l,r")
(zero_extend:SI
- (unspec:HI [(match_operand:HI 1 "memory_operand" "m,Uw,m")]
+ (unspec:HI [(match_operand:HI 1 "mem_and_no_t1_wback_op" "Uw,Uw,m")]
 UNSPEC_UNALIGNED_LOAD)))]
   "unaligned_access"
   "@
@@ -5066,7 +5066,7 @@
(set_attr "type" "store_8")])
 
 (define_insn "unaligned_storesi"
-  [(set (match_operand:SI 0 "memory_operand" "=m,Uw,m")
+  [(set (match_operand:SI 0 "mem_and_no_t1_wback_op" "=Uw,Uw,m")
(unspec:SI [(match_operand:SI 1 "s_register_operand" "l,l,r")]
   UNSPEC_UNALIGNED_STORE))]
   "unaligned_access"
@@ -5081,7 +5081,7 @@
(set_attr "type" "store_4")])
 
 (define_insn "unaligned_storehi"
-  [(set (match_operand:HI 0 "memory_operand" "=m,Uw,m")
+  [(set (match_operand:HI 0 "mem_and_no_t1_wback_op" "=Uw,Uw,m")
(unspec:HI [(match_operand:HI 1 "s_register_operand" "l,l,r")]
   UNSPEC_UNALIGNED_STORE))]
   "unaligned_access"
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 4994c0c57d6..197054b6118 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -907,3 +907,8 @@
 ;; A special predicate that doesn't match a particular mode.
 (define_special_predicate "arm_any_register_operand"
   (match_code "reg"))
+
+;; General memory operand that disallows Thumb-1 POST_INC.
+(define_predicate "mem_and_no_t1_wback_op"
+  (and (match_operand 0 "memory_operand")
+   (match_test "!(TARGET_THUMB1 && GET_CODE (XEXP (op, 0)) == POST_INC)")))
diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index df8dbe170ca..0a8347fc598 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -65,7 +65,7 @@
 (define_insn "arm_atomic_load"
   [(set (match_operand:QHSI 0 "register_operand" "=r,l")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "memory_operand" "m,m")]
+  [(match_operand:QHSI 1 "mem_and_no_t1_wback_op" "m,Uw")]
   VUNSPEC_LDR))]
   ""
   "ldr\t%0, %1"
@@ -81,7 +81,7 @@
 )
 
 (define_insn "arm_atomic_store"
-  [(set (match_operand:QHSI 0 "memory_operand" "=m,m")
+  [(set (match_operand:QHSI 0 "mem_and_no_t1_wback_op" "=m,Uw")
 (unspec_volatile:QHSI
   [(match_operand:QHSI 1 "register_operand" "r,l")]
   VUNSPEC_STR))]
diff --git a/gcc/testsuite/gcc.target/arm/pr115188.c 
b/gcc/testsuite/gcc.target/arm/pr115188.c
new file mode 100644
index 000..9a4022b5679
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr115188.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_arch_v6m_ok }
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v6m } */
+
+void init 

[gcc r15-1786] Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

2024-07-02 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:d04c5537f5ae4a3acd3f5135347d7e2d8c218811

commit r15-1786-gd04c5537f5ae4a3acd3f5135347d7e2d8c218811
Author: Wilco Dijkstra 
Date:   Tue Jul 2 17:37:04 2024 +0100

Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

A Thumb-1 memory operand allows single-register LDMIA/STMIA.  This doesn't
get printed as LDR/STR with writeback in unified syntax, resulting in strange
assembler errors if writeback is selected.  To work around this, use the 'Uw'
constraint that blocks writeback.  Also use a new 'mem_and_no_t1_wback_op'
predicate, which is a general memory operand that disallows writeback in
Thumb-1.  A few other patterns were using 'm' for Thumb-1 in a similar way;
update these to also use 'mem_and_no_t1_wback_op' and 'Uw'.

gcc:
PR target/115188
* config/arm/arm.md (unaligned_loadsi): Use 'Uw' constraint and
'mem_and_no_t1_wback_op'.
(unaligned_loadhiu): Likewise.
(unaligned_storesi): Likewise.
(unaligned_storehi): Likewise.
* config/arm/predicates.md (mem_and_no_t1_wback_op): Add new 
predicate.
* config/arm/sync.md (arm_atomic_load): Use 'Uw' constraint.
(arm_atomic_store): Likewise.

gcc/testsuite:
PR target/115188
* gcc.target/arm/pr115188.c: Add new test.

Diff:
---
 gcc/config/arm/arm.md   |  8 
 gcc/config/arm/predicates.md|  5 +
 gcc/config/arm/sync.md  |  4 ++--
 gcc/testsuite/gcc.target/arm/pr115188.c | 10 ++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index f47e036a803..aae47897199 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -5011,7 +5011,7 @@
 
 (define_insn "unaligned_loadsi"
   [(set (match_operand:SI 0 "s_register_operand" "=l,l,r")
-   (unspec:SI [(match_operand:SI 1 "memory_operand" "m,Uw,m")]
+   (unspec:SI [(match_operand:SI 1 "mem_and_no_t1_wback_op" "Uw,Uw,m")]
   UNSPEC_UNALIGNED_LOAD))]
   "unaligned_access"
   "@
@@ -5041,7 +5041,7 @@
 (define_insn "unaligned_loadhiu"
   [(set (match_operand:SI 0 "s_register_operand" "=l,l,r")
(zero_extend:SI
- (unspec:HI [(match_operand:HI 1 "memory_operand" "m,Uw,m")]
+ (unspec:HI [(match_operand:HI 1 "mem_and_no_t1_wback_op" "Uw,Uw,m")]
 UNSPEC_UNALIGNED_LOAD)))]
   "unaligned_access"
   "@
@@ -5066,7 +5066,7 @@
(set_attr "type" "store_8")])
 
 (define_insn "unaligned_storesi"
-  [(set (match_operand:SI 0 "memory_operand" "=m,Uw,m")
+  [(set (match_operand:SI 0 "mem_and_no_t1_wback_op" "=Uw,Uw,m")
(unspec:SI [(match_operand:SI 1 "s_register_operand" "l,l,r")]
   UNSPEC_UNALIGNED_STORE))]
   "unaligned_access"
@@ -5081,7 +5081,7 @@
(set_attr "type" "store_4")])
 
 (define_insn "unaligned_storehi"
-  [(set (match_operand:HI 0 "memory_operand" "=m,Uw,m")
+  [(set (match_operand:HI 0 "mem_and_no_t1_wback_op" "=Uw,Uw,m")
(unspec:HI [(match_operand:HI 1 "s_register_operand" "l,l,r")]
   UNSPEC_UNALIGNED_STORE))]
   "unaligned_access"
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 4994c0c57d6..197054b6118 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -907,3 +907,8 @@
 ;; A special predicate that doesn't match a particular mode.
 (define_special_predicate "arm_any_register_operand"
   (match_code "reg"))
+
+;; General memory operand that disallows Thumb-1 POST_INC.
+(define_predicate "mem_and_no_t1_wback_op"
+  (and (match_operand 0 "memory_operand")
+   (match_test "!(TARGET_THUMB1 && GET_CODE (XEXP (op, 0)) == POST_INC)")))
diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index df8dbe170ca..0a8347fc598 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -65,7 +65,7 @@
 (define_insn "arm_atomic_load"
   [(set (match_operand:QHSI 0 "register_operand" "=r,l")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "memory_operand" "m,m")]
+  [(match_operand:QHSI 1 "mem_and_no_t1_wback_op" "m,Uw")]
   VUNSPEC_LDR))]
   ""
   "ldr\t%0, %1"
@@ -81,7 +81,7 @@
 )
 
 (define_insn "arm_atomic_store"
-  [(set (match_operand:QHSI 0 "memory_operand" "=m,m")
+  [(set (match_operand:QHSI 0 "mem_and_no_t1_wback_op" "=m,Uw")
 (unspec_volatile:QHSI
   [(match_operand:QHSI 1 "register_operand" "r,l")]
   VUNSPEC_STR))]
diff --git a/gcc/testsuite/gcc.target/arm/pr115188.c 
b/gcc/testsuite/gcc.target/arm/pr115188.c
new file mode 100644
index 000..9a4022b5679
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr115188.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_arch_v6m_ok }
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v6m } */
+
+void init (int *p, int n)
+{
+  for (int i = 0; i < n; i++)
+__atomic_store_4 (p + i, 0, 

[gcc r13-8874] AArch64: Fix strict-align cpymem/setmem [PR103100]

2024-06-27 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:5aa9ed0f353f835005c3df8932c7bc6e26f53904

commit r13-8874-g5aa9ed0f353f835005c3df8932c7bc6e26f53904
Author: Wilco Dijkstra 
Date:   Wed Oct 25 16:28:04 2023 +0100

AArch64: Fix strict-align cpymem/setmem [PR103100]

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
With STRICT_ALIGNMENT, block the expansion if the alignment is less than 16.
Clean up the condition for when to use MOPS.

gcc/ChangeLog:
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Cleanup condition for using MOPS.
(aarch64_expand_setmem): Likewise.

(cherry picked from commit 318f5232cfb3e0c9694889565e1f5424d0354463)

Diff:
---
 gcc/config/aarch64/aarch64.cc | 52 ++-
 gcc/config/aarch64/aarch64.md |  4 ++--
 2 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b8a4ab1b980..2f01580a797 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24897,27 +24897,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25081,12 +25077,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25094,10 +25091,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25110,12 +25110,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25157,10 +25151,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do 

[gcc r14-10338] AArch64: Fix cpu features initialization [PR115342]

2024-06-21 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:9421f02916676d27e24fcda918f85e359329ac69

commit r14-10338-g9421f02916676d27e24fcda918f85e359329ac69
Author: Wilco Dijkstra 
Date:   Wed Jun 5 14:04:33 2024 +0100

AArch64: Fix cpu features initialization [PR115342]

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example
FEAT_SVE is not set if SVE2 is available.  Using HWCAPs for these is both
simpler and correct.  The initialization must also be done atomically to
avoid multiple threads causing corruption due to non-atomic RMW accesses to
the global.
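
A hedged sketch of the publication pattern described above (the identifiers
and the memory order here are simplified assumptions; the real code is in
libgcc/config/aarch64/cpuinfo.c): accumulate the feature bits in a local
variable, then publish them with a single atomic store so that concurrent
ifunc resolvers never observe a half-written global.

#include <stdint.h>

static uint64_t cpu_features;   /* stand-in for the real global */

static void
init_cpu_features_once (uint64_t computed_mask)
{
  /* One atomic write instead of many non-atomic |= read-modify-writes.  */
  __atomic_store_n (&cpu_features, computed_mask, __ATOMIC_RELAXED);
}

static uint64_t
load_cpu_features (void)
{
  /* Readers use an atomic load of the same global.  */
  return __atomic_load_n (&cpu_features, __ATOMIC_RELAXED);
}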

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
Fix FEAT_PREDRES comparison.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.
(cherry picked from commit d7cbcfe7c33645eaf95f175f19884d443817857b)

Diff:
---
 libgcc/config/aarch64/cpuinfo.c | 181 +---
 1 file changed, 75 insertions(+), 106 deletions(-)

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index eb0ac97255d..ec36d105738 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -230,14 +230,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -247,26 +255,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -280,22 +282,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 setCPUFeature(FEAT_SVE_PMULL128);
-  }
   if (hwcap2 & HWCAP2_SVEBITPERM)
 setCPUFeature(FEAT_SVE_BITPERM);
   if (hwcap2 & HWCAP2_SVESHA3)
@@ -332,108 +333,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_WFXT);
   if (hwcap2 & HWCAP2_SME)
 setCPUFeature(FEAT_SME);
+  if (hwcap2 & HWCAP2_SME2)
+setCPUFeature(FEAT_SME2);
   if (hwcap2 & HWCAP2_SME_I16I64)
 setCPUFeature(FEAT_SME_I64);
   if (hwcap2 & HWCAP2_SME_F64F64)
 setCPUFeature(FEAT_SME_F64);
-  if (hwcap & HWCAP_CPUID) {
-unsigned long ftr;
-getCPUFeature(ID_AA64PFR1_EL1, ftr);
-/* ID_AA64PFR1_EL1.MTE >= 0b0001  */
-if (extractBits(ftr, 8, 4) >= 0x1)
-  setCPUFeature(FEAT_MEMTAG);
-/* ID_AA64PFR1_EL1.SSBS == 0b0001  */
-if (extractBits(ftr, 4, 4) == 0x1)
-  setCPUFeature(FEAT_SSBS);
-/* ID_AA64PFR1_EL1.SME == 0b0010  */
-if 

[gcc r15-1036] AArch64: Fix cpu features initialization [PR115342]

2024-06-05 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:d7cbcfe7c33645eaf95f175f19884d443817857b

commit r15-1036-gd7cbcfe7c33645eaf95f175f19884d443817857b
Author: Wilco Dijkstra 
Date:   Wed Jun 5 14:04:33 2024 +0100

AArch64: Fix cpu features initialization [PR115342]

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example
FEAT_SVE is not set if SVE2 is available.  Using HWCAPs for these is both
simpler and correct.  The initialization must also be done atomically to
avoid multiple threads causing corruption due to non-atomic RMW accesses to
the global.

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
Fix FEAT_PREDRES comparison.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.

Diff:
---
 libgcc/config/aarch64/cpuinfo.c | 181 +---
 1 file changed, 75 insertions(+), 106 deletions(-)

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 4b94fca8695..544c5516133 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 setCPUFeature(FEAT_SVE_PMULL128);
-  }
   if (hwcap2 & HWCAP2_SVEBITPERM)
 setCPUFeature(FEAT_SVE_BITPERM);
   if (hwcap2 & HWCAP2_SVESHA3)
@@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_WFXT);
   if (hwcap2 & HWCAP2_SME)
 setCPUFeature(FEAT_SME);
+  if (hwcap2 & HWCAP2_SME2)
+setCPUFeature(FEAT_SME2);
   if (hwcap2 & HWCAP2_SME_I16I64)
 setCPUFeature(FEAT_SME_I64);
   if (hwcap2 & HWCAP2_SME_F64F64)
 setCPUFeature(FEAT_SME_F64);
-  if (hwcap & HWCAP_CPUID) {
-unsigned long ftr;
-getCPUFeature(ID_AA64PFR1_EL1, ftr);
-/* ID_AA64PFR1_EL1.MTE >= 0b0001  */
-if (extractBits(ftr, 8, 4) >= 0x1)
-  setCPUFeature(FEAT_MEMTAG);
-/* ID_AA64PFR1_EL1.SSBS == 0b0001  */
-if (extractBits(ftr, 4, 4) == 0x1)
-  setCPUFeature(FEAT_SSBS);
-/* ID_AA64PFR1_EL1.SME == 0b0010  */
-if (extractBits(ftr, 24, 4) == 0x2)
-  setCPUFeature(FEAT_SME2);
-

[gcc r15-1035] testsuite: Improve check-function-bodies

2024-06-05 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:acdc9df371fbe99e814a3f35a439531e08af79e7

commit r15-1035-gacdc9df371fbe99e814a3f35a439531e08af79e7
Author: Wilco Dijkstra 
Date:   Wed Jun 5 14:05:59 2024 +0100

testsuite: Improve check-function-bodies

Improve check-function-bodies by allowing single-character function names.

gcc/testsuite:
* lib/scanasm.exp (configure_check-function-bodies): Allow 
single-char
function names.

Diff:
---
 gcc/testsuite/lib/scanasm.exp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/lib/scanasm.exp b/gcc/testsuite/lib/scanasm.exp
index 6cf9997240d..42c719c512c 100644
--- a/gcc/testsuite/lib/scanasm.exp
+++ b/gcc/testsuite/lib/scanasm.exp
@@ -869,15 +869,15 @@ proc configure_check-function-bodies { config } {
 # Regexp for the start of a function definition (name in \1).
 if { [istarget nvptx*-*-*] } {
set up_config(start) {
-   {^// BEGIN(?: GLOBAL|) FUNCTION DEF: ([a-zA-Z_]\S+)$}
+   {^// BEGIN(?: GLOBAL|) FUNCTION DEF: ([a-zA-Z_]\S*)$}
}
 } elseif { [istarget *-*-darwin*] } {
set up_config(start) {
-   {^_([a-zA-Z_]\S+):$}
+   {^_([a-zA-Z_]\S*):$}
{^LFB[0-9]+:}
}
 } else {
-   set up_config(start) {{^([a-zA-Z_]\S+):$}}
+   set up_config(start) {{^([a-zA-Z_]\S*):$}}
 }
 
 # Regexp for the end of a function definition.


[gcc r15-696] AArch64: Improve costing of ctz

2024-05-20 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:e14c673ea9ab2eca5de4db91b478f0b5297ef321

commit r15-696-ge14c673ea9ab2eca5de4db91b478f0b5297ef321
Author: Wilco Dijkstra 
Date:   Wed Apr 17 17:18:23 2024 +0100

AArch64: Improve costing of ctz

Improve costing of ctz - both TARGET_CSSC and vector cases were not handled 
yet.
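
Background for the new cost cases (a sketch, not taken from the patch):
without FEAT_CSSC, AArch64 lowers a scalar count-trailing-zeros to a bit
reverse followed by a count-leading-zeros (two instructions, matching the
alu.rev + alu.clz cost); with +cssc a single CTZ instruction suffices; the
Advanced SIMD case is modelled as three vector ALU operations.

/* Assumed example.  */
int
ctz32 (unsigned int x)
{
  /* Base AArch64:  rbit w0, w0 ; clz w0, w0     With +cssc:  ctz w0, w0  */
  return __builtin_ctz (x);
}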

gcc:
* config/aarch64/aarch64.cc (aarch64_rtx_costs): Improve CTZ 
costing.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 662ff5a9b0c7..ee12d8897a88 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14349,10 +14349,24 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int 
outer ATTRIBUTE_UNUSED,
   return false;
 
 case CTZ:
-  *cost = COSTS_N_INSNS (2);
-
-  if (speed)
-   *cost += extra_cost->alu.clz + extra_cost->alu.rev;
+  if (VECTOR_MODE_P (mode))
+   {
+ *cost = COSTS_N_INSNS (3);
+ if (speed)
+   *cost += extra_cost->vect.alu * 3;
+   }
+  else if (TARGET_CSSC)
+   {
+ *cost = COSTS_N_INSNS (1);
+ if (speed)
+   *cost += extra_cost->alu.clz;
+   }
+  else
+   {
+ *cost = COSTS_N_INSNS (2);
+ if (speed)
+   *cost += extra_cost->alu.clz + extra_cost->alu.rev;
+   }
   return false;
 
 case COMPARE:


[gcc r15-695] AArch64: Fix printing of 2-instruction alternatives

2024-05-20 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:804fa0bb92f8073394b3859edb810c3e23375530

commit r15-695-g804fa0bb92f8073394b3859edb810c3e23375530
Author: Wilco Dijkstra 
Date:   Thu Apr 25 17:33:00 2024 +0100

AArch64: Fix printing of 2-instruction alternatives

Add missing '\' in 2-instruction movsi/di alternatives so that they are
printed on separate lines.

gcc:
* config/aarch64/aarch64.md (movsi_aarch64): Use '\;' to force
newline in 2-instruction pattern.
(movdi_aarch64): Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index dbde066f7478..9dff2d7a2b00 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1447,7 +1447,7 @@
  [w  , m  ; load_4   , fp  , 4] ldr\t%s0, %1
  [m  , r Z; store_4  , *   , 4] str\t%w1, %0
  [m  , w  ; store_4  , fp  , 4] str\t%s1, %0
- [r  , Usw; load_4   , *   , 8] adrp\t%x0, %A1;ldr\t%w0, [%x0, %L1]
+ [r  , Usw; load_4   , *   , 8] adrp\t%x0, %A1\;ldr\t%w0, [%x0, %L1]
  [r  , Usa; adr  , *   , 4] adr\t%x0, %c1
  [r  , Ush; adr  , *   , 4] adrp\t%x0, %A1
  [w  , r Z; f_mcr, fp  , 4] fmov\t%s0, %w1
@@ -1484,7 +1484,7 @@
  [w, m  ; load_8   , fp  , 4] ldr\t%d0, %1
  [m, r Z; store_8  , *   , 4] str\t%x1, %0
  [m, w  ; store_8  , fp  , 4] str\t%d1, %0
- [r, Usw; load_8   , *   , 8] << TARGET_ILP32 ? "adrp\t%0, %A1;ldr\t%w0, 
[%0, %L1]" : "adrp\t%0, %A1;ldr\t%0, [%0, %L1]";
+ [r, Usw; load_8   , *   , 8] << TARGET_ILP32 ? "adrp\t%0, %A1\;ldr\t%w0, 
[%0, %L1]" : "adrp\t%0, %A1\;ldr\t%0, [%0, %L1]";
  [r, Usa; adr  , *   , 4] adr\t%x0, %c1
  [r, Ush; adr  , *   , 4] adrp\t%x0, %A1
  [w, r Z; f_mcr, fp  , 4] fmov\t%d0, %x1


[gcc r15-513] AArch64: Use UZP1 instead of INS

2024-05-15 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:43fb827f259e6fdea39bc4021950c810be769d58

commit r15-513-g43fb827f259e6fdea39bc4021950c810be769d58
Author: Wilco Dijkstra 
Date:   Wed May 15 13:07:27 2024 +0100

AArch64: Use UZP1 instead of INS

Use UZP1 instead of INS when combining low and high halves of vectors.
UZP1 has 3 operands, which improves register allocation, and is faster on
some microarchitectures.
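
A small illustration (an assumed example, not one of the updated tests,
although the new expectation can be seen in pr109072_1.c below): INS is
destructive, tying the destination to its first source, whereas UZP1 takes
three independent registers and therefore gives the register allocator more
freedom when combining two halves.

/* Assumed example: combine two scalar floats into one 64-bit vector.  */
#include <arm_neon.h>

float32x2_t
combine2 (float32_t a, float32_t b)
{
  /* Per the updated tests, a combine like this is now expected to use
         uzp1  v0.2s, v0.2s, v1.2s
     instead of
         ins   v0.s[1], v1.s[0]  */
  return (float32x2_t) { a, b };
}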

gcc:
* config/aarch64/aarch64-simd.md (aarch64_combine_internal):
Use UZP1 instead of INS.
(aarch64_combine_internal_be): Likewise.

gcc/testsuite:
* gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1.
* gcc.target/aarch64/pr109072_1.c: Likewise.
* gcc.target/aarch64/vec-init-14.c: Likewise.
* gcc.target/aarch64/vec-init-9.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md |  4 ++--
 gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c  | 16 
 gcc/testsuite/gcc.target/aarch64/pr109072_1.c  |  4 ++--
 gcc/testsuite/gcc.target/aarch64/vec-init-14.c |  4 ++--
 gcc/testsuite/gcc.target/aarch64/vec-init-9.c  | 12 ++--
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278c..16b7445d9f72 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4388,7 +4388,7 @@
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type   , arch  ]
- [ w, 0  , w   ; neon_ins, simd  ] 
ins\t%0.[1], %2.[0]
+ [ w, w  , w   ; neon_permute, simd  ] 
uzp1\t%0.2, %1.2, %2.2
  [ w, 0  , ?r  ; neon_from_gp, simd  ] 
ins\t%0.[1], %2
  [ w, 0  , ?r  ; f_mcr , * ] 
fmov\t%0.d[1], %2
  [ w, 0  , Utv ; neon_load1_one_lane , simd  ] 
ld1\t{%0.}[1], %2
@@ -4407,7 +4407,7 @@
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type   , arch  ]
- [ w, 0  , w   ; neon_ins, simd  ] 
ins\t%0.[1], %2.[0]
+ [ w, w  , w   ; neon_permute, simd  ] 
uzp1\t%0.2, %1.2, %2.2
  [ w, 0  , ?r  ; neon_from_gp, simd  ] 
ins\t%0.[1], %2
  [ w, 0  , ?r  ; f_mcr , * ] 
fmov\t%0.d[1], %2
  [ w, 0  , Utv ; neon_load1_one_lane , simd  ] 
ld1\t{%0.}[1], %2
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c 
b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index f1f46e051a86..95835aa2eb41 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -80,16 +80,16 @@ CONS2_FN (2, float);
 
 /*
 ** cons2_4_float:  { target aarch64_little_endian }
-** ins v0.s\[1\], v1.s\[0\]
-** stp d0, d0, \[x0\]
-** stp d0, d0, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v0\.2s, v1\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
 ** ret
 */
 /*
 ** cons2_4_float:  { target aarch64_big_endian }
-** ins v1.s\[1\], v0.s\[0\]
-** stp d1, d1, \[x0\]
-** stp d1, d1, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v1\.2s, v0\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
 ** ret
 */
 CONS2_FN (4, float);
@@ -125,8 +125,8 @@ CONS4_FN (2, float);
 
 /*
 ** cons4_4_float:
-** ins v[0-9]+\.s[^\n]+
-** ins v[0-9]+\.s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
 ** zip1v([0-9]+).4s, [^\n]+
 ** stp q\1, q\1, \[x0\]
 ** stp q\1, q\1, \[x0, #?32\]
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c 
b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
index 6c1d2b0bdccf..0fc195a598f3 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -54,7 +54,7 @@ f32x2_1 (float32_t x)
 
 /*
 ** f32x2_2:
-** ins v0\.s\[1\], v1.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
 ** ret
 */
 float32x2_t
@@ -165,7 +165,7 @@ f64x2_1 (float64_t x)
 
 /*
 ** f64x2_2:
-** ins v0\.d\[1\], v1.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
 ** ret
 */
 float64x2_t
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
index 02875088cd98..1a2cc9fbf473 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
@@ -67,7 +67,7 @@ int32x2_t s32_6(int32_t a0, int32_t a1) {
 
 /*
 ** f32_1:
-** ins v0\.s\[1\], v1\.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
 ** ret
 */
 float32x2_t f32_1(float32_t a0, float32_t a1) {
@@ -90,7 +90,7 @@ float32x2_t f32_2(float32_t a0, float32_t *ptr) {
 /*
 ** f32_3:
 ** ldr s0, 

[gcc r15-8] AArch64: Cleanup memset expansion

2024-04-26 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:6b86f71165de9ee64fb76489c04ce032dd74ac21

commit r15-8-g6b86f71165de9ee64fb76489c04ce032dd74ac21
Author: Wilco Dijkstra 
Date:   Wed Feb 21 23:34:37 2024 +

AArch64: Cleanup memset expansion

Clean up the memset implementation.  Similar to memcpy/memmove, use an
offset and bytes throughout.  Simplify the complex calculations when
optimizing for size by using a fixed limit.
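
A note on the fixed limit (the arithmetic follows the MAX_SET_SIZE comment
in the diff below; the example itself is an assumption): when optimizing for
size, a 96-byte inline memset is one MOVI/DUP to materialize the value plus
three STP of Q registers (3 x 32 bytes), i.e. four instructions, which is
roughly the same size as setting up and making a memset call, so larger
lengths are not expanded inline at -Os.

/* Assumed example; at -Os this length sits exactly at the new limit.  */
void
fill96 (char *p, int c)
{
  __builtin_memset (p, c, 96);
}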

gcc/ChangeLog:
* config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
(aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 157 ++
 1 file changed, 35 insertions(+), 122 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index bf5fb129b45..a4b7db62546 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -103,6 +103,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
with its callers.
 
@@ -26565,15 +26569,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
@@ -26699,48 +26694,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
-static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  emit_move_insn (aarch64_move_pointer (*dst, 16), src);
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
-{
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
-  return;
-}
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26767,24 +26720,21 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
   || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
-  /* Default the maximum to 256-bytes when considering only libcall vs
- SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Set inline limits for memset.  MOPS has a separate threshold.  */
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
@@ -26793,88 +26743,51 @@ aarch64_expand_setmem (rtx *operands)
   if (len > 

[gcc r15-7] AArch64: Remove AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS

2024-04-26 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:768fbb56b3285b2a3cf067881e745e0f8caec215

commit r15-7-g768fbb56b3285b2a3cf067881e745e0f8caec215
Author: Wilco Dijkstra 
Date:   Fri Apr 26 15:09:31 2024 +0100

AArch64: Remove AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS

Remove the tune AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS since it is only
used by an old core and doesn't properly support -Os.  SPECINT_2017
shows that removing it has no performance difference, while codesize
is reduced by 0.07%.

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_mode_valid_for_sched_fusion_p):
Remove check for AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
(aarch64_advsimd_ldp_stp_p): Likewise.
(aarch64_stp_sequence_cost): Likewise.
(aarch64_expand_cpymem): Likewise.
(aarch64_expand_setmem): Likewise.
* config/aarch64/aarch64-ldp-fusion.cc (ldp_operand_mode_ok_p):
Likewise.
* config/aarch64/aarch64-ldpstp.md: Likewise.
* config/aarch64/aarch64-tuning-flags.def: Remove NO_LDP_STP_QREGS.
* config/aarch64/tuning_models/emag.h: Likewise.
* config/aarch64/tuning_models/xgene1.h: Likewise.

gcc/testsuite:
* gcc.target/aarch64/ldp_stp_q_disable.c: Remove test.

Diff:
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc   |  8 ---
 gcc/config/aarch64/aarch64-ldpstp.md   |  8 ++-
 gcc/config/aarch64/aarch64-tuning-flags.def|  3 ---
 gcc/config/aarch64/aarch64.cc  | 22 --
 gcc/config/aarch64/tuning_models/emag.h|  2 +-
 gcc/config/aarch64/tuning_models/xgene1.h  |  2 +-
 .../gcc.target/aarch64/ldp_stp_q_disable.c | 26 --
 7 files changed, 8 insertions(+), 63 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 365dcf48b22..0bc225dae7b 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -315,17 +315,9 @@ any_post_modify_p (rtx x)
 static bool
 ldp_operand_mode_ok_p (machine_mode mode)
 {
-  const bool allow_qregs
-= !(aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
-
   if (!aarch64_ldpstp_operand_mode_p (mode))
 return false;
 
-  const auto size = GET_MODE_SIZE (mode).to_constant ();
-  if (size == 16 && !allow_qregs)
-return false;
-
   // We don't pair up TImode accesses before RA because TImode is
   // special in that it can be allocated to a pair of GPRs or a single
   // FPR, and the RA is best placed to make that decision.
diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
index b7c0bf05cd1..7890a8cc32b 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -96,9 +96,7 @@
(set (match_operand:VQ2 2 "register_operand" "")
(match_operand:VQ2 3 "memory_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, true)
-   && (aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -111,9 +109,7 @@
(set (match_operand:VQ2 2 "memory_operand" "")
(match_operand:VQ2 3 "register_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, false)
-   && (aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index d917da720b2..d5bcaebce77 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -36,9 +36,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
-/* Disallow load/store pair instructions on Q-registers.  */
-AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
-
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1beec94629d..bf5fb129b45 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10400,9 +10400,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
 || mode == SDmode || mode == DDmode
 || (aarch64_vector_mode_supported_p (mode)
 && (known_eq (GET_MODE_SIZE (mode), 8)
-|| (known_eq (GET_MODE_SIZE (mode), 16)
-   && (aarch64_tune_params.extra_tuning_flags
- 

[gcc r15-6] libatomic: Cleanup macros in atomic_16.S

2024-04-26 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:5716f8daf3f2abc54ececa61350fff0af2e7ce90

commit r15-6-g5716f8daf3f2abc54ececa61350fff0af2e7ce90
Author: Wilco Dijkstra 
Date:   Tue Mar 26 15:42:16 2024 +

libatomic: Cleanup macros in atomic_16.S

Clean up the macros to add the libat_ prefixes in atomic_16.S.  In the
ENTRY macro, emit the alias to __atomic_ when ifuncs are not enabled.
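
As background (a sketch, not from the commit; variable and function names
are illustrative): these entry points back the compiler's 16-byte atomic
built-ins, so a minimal C user looks like the code below.  Where the load
is not inlined, the call goes to __atomic_load_16, which the macros in the
patch below name (libat_load_16 internally, aliased to __atomic_load_16
when ifuncs are disabled).

/* Illustrative example, not from the patch.  */
#include <stdatomic.h>

_Atomic __int128 shared;

/* A 16-byte atomic load that libatomic may end up servicing.  */
__int128
read_shared (void)
{
  return atomic_load (&shared);
}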

libatomic:
* config/linux/aarch64/atomic_16.S: Add __libat_ prefix in the
LSE2/LSE128/CORE macros, remove elsewhere.  Add ATOMIC macro.

Diff:
---
 libatomic/config/linux/aarch64/atomic_16.S | 220 +
 1 file changed, 102 insertions(+), 118 deletions(-)

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 4e3fa870b03..b63e97ac5a2 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -45,7 +45,7 @@
 # define HAVE_FEAT_LSE128 0
 #endif
 
-#define HAVE_FEAT_LSE2  HAVE_IFUNC
+#define HAVE_FEAT_LSE2 HAVE_IFUNC
 
 #if HAVE_FEAT_LSE128
.arch   armv9-a+lse128
@@ -53,31 +53,37 @@
.arch   armv8-a+lse
 #endif
 
-#define LSE128(NAME)   NAME##_i1
-#define LSE2(NAME) NAME##_i2
-#define CORE(NAME) NAME
+#define LSE128(NAME)   libat_##NAME##_i1
+#define LSE2(NAME) libat_##NAME##_i2
+#define CORE(NAME) libat_##NAME
+#define ATOMIC(NAME)   __atomic_##NAME
 
-#define ENTRY_FEAT(NAME, FEAT)  \
-   ENTRY (FEAT (NAME))
+#if HAVE_IFUNC
+# define ENTRY(NAME)   ENTRY2 (CORE (NAME), )
+# define ENTRY_FEAT(NAME, FEAT) ENTRY2 (FEAT (NAME), )
+# define END_FEAT(NAME, FEAT)  END2 (FEAT (NAME))
+#else
+/* Emit __atomic_* entrypoints if no ifuncs.  */
+# define ENTRY(NAME)   ENTRY2 (CORE (NAME), ALIAS (NAME, ATOMIC, CORE))
+#endif
+
+#define END(NAME)  END2 (CORE (NAME))
 
-#define ENTRY(NAME)\
+#define ENTRY2(NAME, ALIASES)  \
.global NAME;   \
.hidden NAME;   \
.type NAME,%function;   \
.p2align 4; \
+   ALIASES;\
 NAME:  \
-   .cfi_startproc; \
-   hint34  // bti c
-
-#define END_FEAT(NAME, FEAT)   \
-   END (FEAT (NAME))
+   .cfi_startproc; \
+   hint34; // bti c
 
-#define END(NAME)  \
+#define END2(NAME) \
.cfi_endproc;   \
.size NAME, .-NAME;
 
-#define ALIAS(NAME, FROM, TO)  ALIAS1 (FROM (NAME),TO (NAME))
-#define ALIAS2(NAME)   ALIAS1 (__atomic_##NAME, libat_##NAME)
+#define ALIAS(NAME, FROM, TO)  ALIAS1 (FROM (NAME), TO (NAME))
 
 #define ALIAS1(ALIAS, NAME)\
.global ALIAS;  \
@@ -116,7 +122,7 @@ NAME:   \
 #define SEQ_CST 5
 
 
-ENTRY (libat_load_16)
+ENTRY (load_16)
mov x5, x0
cbnzw1, 2f
 
@@ -131,11 +137,11 @@ ENTRY (libat_load_16)
stxpw4, res0, res1, [x5]
cbnzw4, 2b
ret
-END (libat_load_16)
+END (load_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT (libat_load_16, LSE2)
+ENTRY_FEAT (load_16, LSE2)
cbnzw1, 1f
 
/* RELAXED.  */
@@ -155,11 +161,11 @@ ENTRY_FEAT (libat_load_16, LSE2)
ldp res0, res1, [x0]
dmb ishld
ret
-END_FEAT (libat_load_16, LSE2)
+END_FEAT (load_16, LSE2)
 #endif
 
 
-ENTRY (libat_store_16)
+ENTRY (store_16)
cbnzw4, 2f
 
/* RELAXED.  */
@@ -173,11 +179,11 @@ ENTRY (libat_store_16)
stlxp   w4, in0, in1, [x0]
cbnzw4, 2b
ret
-END (libat_store_16)
+END (store_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT (libat_store_16, LSE2)
+ENTRY_FEAT (store_16, LSE2)
cbnzw4, 1f
 
/* RELAXED.  */
@@ -189,11 +195,11 @@ ENTRY_FEAT (libat_store_16, LSE2)
stlxp   w4, in0, in1, [x0]
cbnzw4, 1b
ret
-END_FEAT (libat_store_16, LSE2)
+END_FEAT (store_16, LSE2)
 #endif
 
 
-ENTRY (libat_exchange_16)
+ENTRY (exchange_16)
mov x5, x0
cbnzw4, 2f
 
@@ -217,11 +223,11 @@ ENTRY (libat_exchange_16)
stlxp   w4, in0, in1, [x5]
cbnzw4, 4b
ret
-END (libat_exchange_16)
+END (exchange_16)
 
 
 #if HAVE_FEAT_LSE128
-ENTRY_FEAT (libat_exchange_16, LSE128)
+ENTRY_FEAT (exchange_16, LSE128)
mov tmp0, x0
mov res0, in0
mov res1, in1
@@ -241,11 +247,11 @@ ENTRY_FEAT (libat_exchange_16, LSE128)
/* RELEASE/ACQ_REL/SEQ_CST.  */
 2: swppal  res0, res1, [tmp0]
ret
-END_FEAT (libat_exchange_16, LSE128)
+END_FEAT (exchange_16, LSE128)
 #endif
 
 
-ENTRY (libat_compare_exchange_16)
+ENTRY (compare_exchange_16)
ldp exp0, exp1, [x1]
cbz w4, 3f
cmp w4, RELEASE
@@ -289,11 +295,11 @@ ENTRY (libat_compare_exchange_16)
stp tmp0, tmp1, [x1]
 6: csetx0, eq
ret
-END (libat_compare_exchange_16)
+END (compare_exchange_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT 

[gcc r14-9796] libatomic: Fix build for --disable-gnu-indirect-function [PR113986]

2024-04-04 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:27b6d081f68528435066be2234c7329e31e0e84f

commit r14-9796-g27b6d081f68528435066be2234c7329e31e0e84f
Author: Wilco Dijkstra 
Date:   Tue Mar 26 15:08:02 2024 +

libatomic: Fix build for --disable-gnu-indirect-function [PR113986]

Fix the libatomic build to support --disable-gnu-indirect-function on AArch64.
Always build atomic_16.S and add aliases to the __atomic_ functions if
!HAVE_IFUNC.  Include auto-config.h in atomic_16.S to avoid having to pass
defines via makefiles.  Fix the build if HWCAP_ATOMICS/CPUID are not defined.

libatomic:
PR target/113986
* Makefile.in: Regenerated.
* Makefile.am: Make atomic_16.S not depend on HAVE_IFUNC.
Remove predefine of HAVE_FEAT_LSE128.
* acinclude.m4: Remove ARCH_AARCH64_HAVE_LSE128.
* configure: Regenerated.
* config/linux/aarch64/atomic_16.S: Add __atomic_ alias if !HAVE_IFUNC.
* config/linux/aarch64/host-config.h: Correctly handle !HAVE_IFUNC.
Add defines for HWCAP_ATOMICS and HWCAP_CPUID.

Diff:
---
 libatomic/Makefile.am|  8 ++---
 libatomic/Makefile.in| 18 +-
 libatomic/acinclude.m4   |  1 -
 libatomic/config/linux/aarch64/atomic_16.S   | 47 ++---
 libatomic/config/linux/aarch64/host-config.h | 52 +++-
 libatomic/configure  | 16 -
 6 files changed, 82 insertions(+), 60 deletions(-)

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index d49c44c7d5f..980677f3533 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -130,12 +130,8 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix 
_$(s)_.lo,$(SIZEOBJS)))
 ## On a target-specific basis, include alternates to be selected by IFUNC.
 if HAVE_IFUNC
 if ARCH_AARCH64_LINUX
-if ARCH_AARCH64_HAVE_LSE128
-AM_CPPFLAGS = -DHAVE_FEAT_LSE128
-endif
 IFUNC_OPTIONS   = -march=armv8-a+lse
 libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix 
_$(s)_1_.lo,$(SIZEOBJS)))
-libatomic_la_SOURCES += atomic_16.S
 
 endif
 if ARCH_ARM_LINUX
@@ -155,6 +151,10 @@ libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
 endif
 endif
 
+if ARCH_AARCH64_LINUX
+libatomic_la_SOURCES += atomic_16.S
+endif
+
 libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES)
 libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD)
 
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index 11c8ec7ba15..d9d529bc502 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -90,17 +90,17 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach 
s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
-@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
+@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ s,$(SIZES),$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _$(s)_1_.lo,$(SIZEOBJS))) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ $(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _8_2_.lo,$(SIZEOBJS)) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ tas_1_2_.lo
-@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
+@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
 @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@   $(addsuffix 
_16_2_.lo,$(SIZEOBJS))
 
+@ARCH_AARCH64_LINUX_TRUE@am__append_5 = atomic_16.S
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -156,8 +156,7 @@ am__uninstall_files_from_dir = { \
   }
 am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(noinst_LTLIBRARIES) $(toolexeclib_LTLIBRARIES)
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__objects_1 =  \
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@ atomic_16.lo
+@ARCH_AARCH64_LINUX_TRUE@am__objects_1 = atomic_16.lo
 am_libatomic_la_OBJECTS = gload.lo gstore.lo gcas.lo gexch.lo \
glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo \
$(am__objects_1)
@@ -425,7 +424,7 @@ libatomic_la_LDFLAGS = $(libatomic_version_info) 
$(libatomic_version_script) \
$(lt_host_flags) $(libatomic_darwin_rpath)
 
 libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c \
-   init.c fenv.c fence.c flag.c $(am__append_2)
+   init.c fenv.c fence.c flag.c $(am__append_5)
 SIZEOBJS = load store cas exch fadd fsub fand fior fxor fnand tas
 EXTRA_libatomic_la_SOURCES = $(addsuffix _n.c,$(SIZEOBJS))
 libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) $(libatomic_version_dep)
@@ -451,9 

[gcc r14-9776] libgcc: Add missing HWCAP entries to aarch64/cpuinfo.c

2024-04-03 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:8f9e92eec3230d2f1305d414984e89aaebdfe0c6

commit r14-9776-g8f9e92eec3230d2f1305d414984e89aaebdfe0c6
Author: Wilco Dijkstra 
Date:   Wed Mar 27 16:06:13 2024 +

libgcc: Add missing HWCAP entries to aarch64/cpuinfo.c

A few HWCAP entries are missing from aarch64/cpuinfo.c.  This results in
build errors on older machines.
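
For context (a sketch, not part of the patch; the function name is
illustrative): cpuinfo.c checks these bits against the AT_HWCAP auxiliary
vector, so any HWCAP name the libc headers do not provide needs a local
fallback define, otherwise the file fails to compile on older systems.
The HWCAP_PACA value below matches the one added in the patch.

/* Illustrative example, not from the patch.  */
#include <sys/auxv.h>

/* Fall back to the Linux value when an older libc lacks the define.  */
#ifndef HWCAP_PACA
#define HWCAP_PACA (1 << 30)
#endif

int
have_paca (void)
{
  return (getauxval (AT_HWCAP) & HWCAP_PACA) != 0;
}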

libgcc/
* config/aarch64/cpuinfo.c: Add HWCAP_EVTSTRM, HWCAP_CRC32, HWCAP_CPUID,
HWCAP_PACA and HWCAP_PACG.

Diff:
---
 libgcc/config/aarch64/cpuinfo.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 3c6fb8a575b..4b94fca8695 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -52,15 +52,15 @@ struct {
 #ifndef AT_HWCAP
 #define AT_HWCAP 16
 #endif
-#ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif
 #ifndef HWCAP_FP
 #define HWCAP_FP (1 << 0)
 #endif
 #ifndef HWCAP_ASIMD
 #define HWCAP_ASIMD (1 << 1)
 #endif
+#ifndef HWCAP_EVTSTRM
+#define HWCAP_EVTSTRM (1 << 2)
+#endif
 #ifndef HWCAP_AES
 #define HWCAP_AES (1 << 3)
 #endif
@@ -73,6 +73,9 @@ struct {
 #ifndef HWCAP_SHA2
 #define HWCAP_SHA2 (1 << 6)
 #endif
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
 #ifndef HWCAP_ATOMICS
 #define HWCAP_ATOMICS (1 << 8)
 #endif
@@ -82,6 +85,9 @@ struct {
 #ifndef HWCAP_ASIMDHP
 #define HWCAP_ASIMDHP (1 << 10)
 #endif
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
 #ifndef HWCAP_ASIMDRDM
 #define HWCAP_ASIMDRDM (1 << 12)
 #endif
@@ -133,6 +139,12 @@ struct {
 #ifndef HWCAP_SB
 #define HWCAP_SB (1 << 29)
 #endif
+#ifndef HWCAP_PACA
+#define HWCAP_PACA (1 << 30)
+#endif
+#ifndef HWCAP_PACG
+#define HWCAP_PACG (1UL << 31)
+#endif
 
 #ifndef HWCAP2_DCPODP
 #define HWCAP2_DCPODP (1 << 0)


[gcc r14-9394] ARM: Fix builtin-bswap-1.c test [PR113915]

2024-03-08 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:5119c7927c70b02ab9768b30f40564480f556432

commit r14-9394-g5119c7927c70b02ab9768b30f40564480f556432
Author: Wilco Dijkstra 
Date:   Fri Mar 8 15:01:15 2024 +

ARM: Fix builtin-bswap-1.c test [PR113915]

On Thumb-2 the use of CBZ blocks conditional execution, so change the
test to compare with a non-zero value.
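
As an illustration of the idiom (mirroring the test change below; the exact
code generated depends on the tuning, so treat the comments as a sketch):
a comparison against zero lets Thumb-2 emit CBZ/CBNZ, which cannot be
conditionally executed, so the byte swap would not be predicated; a
non-zero comparison keeps an ordinary CMP.

/* Illustrative example based on the test below.  */
unsigned short
swapu16_cond (unsigned short x, int y)
{
  unsigned short z = x;
  if (y != 2)                    /* was "if (y)", which may become CBZ  */
    z = __builtin_bswap16 (x);   /* can now be conditionally executed   */
  return z;
}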

gcc/testsuite/ChangeLog:
PR target/113915
* gcc.target/arm/builtin-bswap.x: Fix test to avoid emitting CBZ.

Diff:
---
 gcc/testsuite/gcc.target/arm/builtin-bswap.x | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap.x b/gcc/testsuite/gcc.target/arm/builtin-bswap.x
index c96dbe6329c..dc8f910e000 100644
--- a/gcc/testsuite/gcc.target/arm/builtin-bswap.x
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap.x
@@ -10,7 +10,7 @@ extern short foos16 (short);
 short swaps16_cond (short x, int y)
 {
   short z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap16 (x);
   return foos16 (z);
 }
@@ -27,7 +27,7 @@ extern unsigned short foou16 (unsigned short);
 unsigned short swapu16_cond (unsigned short x, int y)
 {
   unsigned short z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap16 (x);
   return foou16 (z);
 }
@@ -43,7 +43,7 @@ extern int foos32 (int);
 int swaps32_cond (int x, int y)
 {
   int z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap32 (x);
   return foos32 (z);
 }
@@ -60,7 +60,7 @@ extern unsigned int foou32 (unsigned int);
 unsigned int swapsu2 (unsigned int x, int y)
 {
   int z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap32 (x);
   return foou32 (z);
 }


[gcc r14-9373] AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

2024-03-07 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:19b23bf3c32df3cbb96b3d898a1d7142f7bea4a0

commit r14-9373-g19b23bf3c32df3cbb96b3d898a1d7142f7bea4a0
Author: Wilco Dijkstra 
Date:   Wed Feb 21 23:33:58 2024 +

AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

The new RTL introduced for LDP/STP results in regressions due to its use of
UNSPEC.  Given that the new LDP fusion pass is good at finding LDP
opportunities, change the memcpy, memmove and memset expansions to emit
single vector loads/stores.  This fixes the regression and enables more RTL
optimization on the standard memory accesses.  Handling of the unaligned tail
of memcpy/memmove is improved with -mgeneral-regs-only.  SPEC2017 performance
improves slightly.  Codesize is a bit worse due to missed LDP opportunities,
as discussed in the PR.
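
By way of example (not part of the commit; the function name is
illustrative): a small fixed-size copy such as the one below is now
expanded as individual q-register loads and stores, with LDP/STP formation
left to the fusion pass rather than encoded in the expansion itself.

/* Illustrative example, not from the patch.  A 32-byte inline memcpy; the
   expansion emits plain vector loads/stores and the ldp/stp fusion pass
   may later pair them where profitable.  */
void
copy32 (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 32);
}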

gcc/ChangeLog:
PR target/113618
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove.
(aarch64_expand_cpymem): Emit single load/store only.
(aarch64_set_one_block): Emit single stores only.

gcc/testsuite/ChangeLog:
PR target/113618
* gcc.target/aarch64/pr113618.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc   | 68 +
 gcc/testsuite/gcc.target/aarch64/pr113618.c | 36 +++
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf9258..0a28e033088 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-   int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-{
-  mode = V4SImode;
-  rtx src1 = adjust_address (src, mode, offset);
-  rtx dst1 = adjust_address (dst, mode, offset);
-  rtx reg1 = gen_reg_rtx (mode);
-  rtx reg2 = gen_reg_rtx (mode);
-  rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-  rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-  ops.safe_push ({ load, store });
-  return;
-}
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
@@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26550,16 +26523,12 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  unsigned copy_max = 32;
-
-  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
- support or slow LDP/STP fall back to 16-byte chunks.
-
+  /* Default to 32-byte LDP/STP on large copies, however small copies or
+ no SIMD support fall back to 16-byte chunks.
  ??? Although it would be possible to use LDP/STP Qn in streaming mode
  (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
  whether that would improve performance.  */
-  if (size <= 24 || !use_ldpq)
-copy_max = 16;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26567,7 +26536,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26576,23 +26545,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 or writing.  */
   opt_scalar_int_mode mode_iter;
   FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-   if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
- cur_mode = mode_iter.require ();
+   if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+ mode = mode_iter.require ();
 
-  gcc_assert (cur_mode != BLKmode);
+  gcc_assert (mode 

[gcc r14-9343] ARM: Fix conditional execution [PR113915]

2024-03-06 Thread Wilco Dijkstra via Gcc-cvs
https://gcc.gnu.org/g:b575f37a342cebb954aa85fa45df0604bfa1ada9

commit r14-9343-gb575f37a342cebb954aa85fa45df0604bfa1ada9
Author: Wilco Dijkstra 
Date:   Wed Mar 6 17:35:16 2024 +

ARM: Fix conditional execution [PR113915]

By default most patterns can be conditionalized on Arm targets.  However
Thumb-2 predication requires the "predicable" attribute be explicitly
set to "yes".  Most patterns are shared between Arm and Thumb(-2) and are
marked with "predicable".  Given this sharing, it does not make sense to
use a different default for Arm.  So only consider conditional execution
of instructions that have the predicable attribute set to yes.  This ensures
that patterns not explicitly marked as such are never conditionally executed.

gcc/ChangeLog:
PR target/113915
* config/arm/arm.md (NOCOND): Improve comment.
(arm_rev*): Add predicable.
* config/arm/arm.cc (arm_final_prescan_insn): Add check for
PREDICABLE_YES.

gcc/testsuite/ChangeLog:
PR target/113915
* gcc.target/arm/builtin-bswap-1.c: Fix test to allow conditional
execution both for Arm and Thumb-2.

Diff:
---
 gcc/config/arm/arm.cc  |  5 +++--
 gcc/config/arm/arm.md  |  6 ++
 gcc/testsuite/gcc.target/arm/builtin-bswap-1.c | 15 ++-
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 1cd69268ee9..6a35fe44138 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -25613,11 +25613,12 @@ arm_final_prescan_insn (rtx_insn *insn)
  break;
 
case INSN:
- /* Instructions using or affecting the condition codes make it
-fail.  */
+ /* Check the instruction is explicitly marked as predicable.
+Instructions using or affecting the condition codes are not.  */
  scanbody = PATTERN (this_insn);
  if (!(GET_CODE (scanbody) == SET
|| GET_CODE (scanbody) == PARALLEL)
+ || get_attr_predicable (this_insn) != PREDICABLE_YES
  || get_attr_conds (this_insn) != CONDS_NOCOND)
fail = TRUE;
  break;
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 814e871acea..1fd00146ca9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -319,6 +319,8 @@
 ;
 ; NOCOND means that the instruction does not use or alter the condition
 ;   codes but can be converted into a conditionally exectuted instruction.
+;   Given that NOCOND is the default for most instructions if omitted,
+;   the attribute predicable must be set to yes as well.
 
 (define_attr "conds" "use,set,clob,unconditional,nocond"
(if_then_else
@@ -12559,6 +12561,7 @@
   revsh%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12572,6 +12575,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12596,6 +12600,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12616,6 +12621,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
index c1e7740d14d..1a311a6a5af 100644
--- a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
@@ -5,14 +5,11 @@
of the instructions.  Add an -mtune option known to facilitate that.  */
 /* { dg-additional-options "-O2 -mtune=cortex-a53" } */
 /* { dg-final { scan-assembler-not "orr\[ \t\]" } } */
-/* { dg-final { scan-assembler-times "revsh\\t" 1 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "revshne\\t" 1 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "revsh\\t" 2 { target { ! arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 1 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "rev16ne\\t" 1 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 2 { target { ! arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "rev\\t" 2 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "revne\\t" 2 { target { arm_nothumb } } } }  */
-/* { dg-final { scan-assembler-times "rev\\t" 4 { target { ! arm_nothumb } } } }  */
+/* { dg-final { scan-assembler-times "revsh\\t" 1 } }  */
+/* { dg-final { scan-assembler-times "revshne\\t" 1 } }  */
+/* { dg-final